OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_codestream_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_codestream_avx2.cpp
34// Author: Aous Naman
35// Date: 15 May 2022
36//***************************************************************************/
37
38#include "ojph_arch.h"
39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
40
41#include <climits>
42#include <immintrin.h>
43#include "ojph_defs.h"
44#include "ojph_arch.h"
45
46namespace ojph {
47 namespace local {
48
51 {
52 __m128i x0 = _mm_loadu_si128((__m128i*)address);
53 __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
54 x0 = _mm_or_si128(x0, x1);
55 x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
56 x0 = _mm_or_si128(x0, x1);
57 x1 = _mm_shuffle_epi32(x0, 0x55); // x1 = x0[1,1,1,1]
58 x0 = _mm_or_si128(x0, x1);
59 ui32 t = (ui32)_mm_extract_epi32(x0, 0);
60 return t;
61 }
62
65 {
66 __m128i x0 = _mm_loadu_si128((__m128i*)address);
67 __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
68 x0 = _mm_or_si128(x0, x1);
69 x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
70 x0 = _mm_or_si128(x0, x1);
71 ui64 t;
72#ifdef OJPH_ARCH_X86_64
73 t = (ui64)_mm_extract_epi64(x0, 0);
74#elif (defined OJPH_ARCH_I386)
75 t = (ui64)(ui32)_mm_extract_epi32(x0, 0);
76 t |= (ui64)(ui32)_mm_extract_epi32(x0, 1) << 32;
77#else
78 #error Error unsupport compiler
79#endif
80 return t;
81 }
82
84 void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
85 float delta_inv, ui32 count, ui32* max_val)
86 {
87 ojph_unused(delta_inv);
88
89 // convert to sign and magnitude and keep max_val
90 ui32 shift = 31 - K_max;
91 __m256i m0 = _mm256_set1_epi32(INT_MIN);
92 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
93 __m256i *p = (__m256i*)sp;
94 for ( ; count >= 8; count -= 8, p += 1, dp += 8)
95 {
96 __m256i v = _mm256_loadu_si256(p);
97 __m256i sign = _mm256_and_si256(v, m0);
98 __m256i val = _mm256_abs_epi32(v);
99 val = _mm256_slli_epi32(val, (int)shift);
100 tmax = _mm256_or_si256(tmax, val);
101 val = _mm256_or_si256(val, sign);
102 _mm256_storeu_si256((__m256i*)dp, val);
103 }
104 if (count)
105 {
106 __m256i v = _mm256_loadu_si256(p);
107 __m256i sign = _mm256_and_si256(v, m0);
108 __m256i val = _mm256_abs_epi32(v);
109 val = _mm256_slli_epi32(val, (int)shift);
110
111 __m256i c = _mm256_set1_epi32((si32)count);
112 __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
113 __m256i mask = _mm256_cmpgt_epi32(c, idx);
114 c = _mm256_and_si256(val, mask);
115 tmax = _mm256_or_si256(tmax, c);
116
117 val = _mm256_or_si256(val, sign);
118 _mm256_storeu_si256((__m256i*)dp, val);
119 }
120 _mm256_storeu_si256((__m256i*)max_val, tmax);
121 }
122
124 void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
125 float delta_inv, ui32 count, ui32* max_val)
126 {
127 ojph_unused(K_max);
128
129 //quantize and convert to sign and magnitude and keep max_val
130 __m256 d = _mm256_set1_ps(delta_inv);
131 __m256i m0 = _mm256_set1_epi32(INT_MIN);
132 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
133 float *p = (float*)sp;
134
135 for ( ; count >= 8; count -= 8, p += 8, dp += 8)
136 {
137 __m256 vf = _mm256_loadu_ps(p);
138 vf = _mm256_mul_ps(vf, d); // multiply
139 __m256i val = _mm256_cvtps_epi32(vf); // convert to int
140 __m256i sign = _mm256_and_si256(val, m0); // get sign
141 val = _mm256_abs_epi32(val);
142 tmax = _mm256_or_si256(tmax, val);
143 val = _mm256_or_si256(val, sign);
144 _mm256_storeu_si256((__m256i*)dp, val);
145 }
146 if (count)
147 {
148 __m256 vf = _mm256_loadu_ps(p);
149 vf = _mm256_mul_ps(vf, d); // multiply
150 __m256i val = _mm256_cvtps_epi32(vf); // convert to int
151 __m256i sign = _mm256_and_si256(val, m0); // get sign
152 val = _mm256_abs_epi32(val);
153
154 __m256i c = _mm256_set1_epi32((si32)count);
155 __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
156 __m256i mask = _mm256_cmpgt_epi32(c, idx);
157 c = _mm256_and_si256(val, mask);
158 tmax = _mm256_or_si256(tmax, c);
159
160 val = _mm256_or_si256(val, sign);
161 _mm256_storeu_si256((__m256i*)dp, val);
162 }
163 _mm256_storeu_si256((__m256i*)max_val, tmax);
164 }
165
167 void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
168 float delta, ui32 count)
169 {
170 ojph_unused(delta);
171 ui32 shift = 31 - K_max;
172 __m256i m1 = _mm256_set1_epi32(INT_MAX);
173 si32 *p = (si32*)dp;
174 for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
175 {
176 __m256i v = _mm256_load_si256((__m256i*)sp);
177 __m256i val = _mm256_and_si256(v, m1);
178 val = _mm256_srli_epi32(val, (int)shift);
179 val = _mm256_sign_epi32(val, v);
180 _mm256_storeu_si256((__m256i*)p, val);
181 }
182 }
183
185 void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
186 float delta, ui32 count)
187 {
188 ojph_unused(K_max);
189 __m256i m1 = _mm256_set1_epi32(INT_MAX);
190 __m256 d = _mm256_set1_ps(delta);
191 float *p = (float*)dp;
192 for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
193 {
194 __m256i v = _mm256_load_si256((__m256i*)sp);
195 __m256i vali = _mm256_and_si256(v, m1);
196 __m256 valf = _mm256_cvtepi32_ps(vali);
197 valf = _mm256_mul_ps(valf, d);
198 __m256i sign = _mm256_andnot_si256(m1, v);
199 valf = _mm256_or_ps(valf, _mm256_castsi256_ps(sign));
200 _mm256_storeu_ps(p, valf);
201 }
202 }
203
205 void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
206 float delta_inv, ui32 count, ui64* max_val)
207 {
208 ojph_unused(delta_inv);
209
210 // convert to sign and magnitude and keep max_val
211 ui32 shift = 63 - K_max;
212 __m256i m0 = _mm256_set1_epi64x(LLONG_MIN);
213 __m256i zero = _mm256_setzero_si256();
214 __m256i one = _mm256_set1_epi64x(1);
215 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
216 __m256i *p = (__m256i*)sp;
217 for ( ; count >= 4; count -= 4, p += 1, dp += 4)
218 {
219 __m256i v = _mm256_loadu_si256(p);
220 __m256i sign = _mm256_cmpgt_epi64(zero, v);
221 __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
222 __m256i ones = _mm256_and_si256(sign, one);
223 val = _mm256_add_epi64(val, ones); // 2's complement
224 sign = _mm256_and_si256(sign, m0);
225 val = _mm256_slli_epi64(val, (int)shift);
226 tmax = _mm256_or_si256(tmax, val);
227 val = _mm256_or_si256(val, sign);
228 _mm256_storeu_si256((__m256i*)dp, val);
229 }
230 if (count)
231 {
232 __m256i v = _mm256_loadu_si256(p);
233 __m256i sign = _mm256_cmpgt_epi64(zero, v);
234 __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
235 __m256i ones = _mm256_and_si256(sign, one);
236 val = _mm256_add_epi64(val, ones); // 2's complement
237 sign = _mm256_and_si256(sign, m0);
238 val = _mm256_slli_epi64(val, (int)shift);
239
240 __m256i c = _mm256_set1_epi64x(count);
241 __m256i idx = _mm256_set_epi64x(3, 2, 1, 0);
242 __m256i mask = _mm256_cmpgt_epi64(c, idx);
243 c = _mm256_and_si256(val, mask);
244 tmax = _mm256_or_si256(tmax, c);
245
246 val = _mm256_or_si256(val, sign);
247 _mm256_storeu_si256((__m256i*)dp, val);
248 }
249 _mm256_storeu_si256((__m256i*)max_val, tmax);
250 }
251
253 void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
254 float delta, ui32 count)
255 {
256 ojph_unused(delta);
257
258 ui32 shift = 63 - K_max;
259 __m256i m1 = _mm256_set1_epi64x(LLONG_MAX);
260 __m256i zero = _mm256_setzero_si256();
261 __m256i one = _mm256_set1_epi64x(1);
262 si64 *p = (si64*)dp;
263 for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
264 {
265 __m256i v = _mm256_load_si256((__m256i*)sp);
266 __m256i val = _mm256_and_si256(v, m1);
267 val = _mm256_srli_epi64(val, (int)shift);
268 __m256i sign = _mm256_cmpgt_epi64(zero, v);
269 val = _mm256_xor_si256(val, sign); // negate 1's complement
270 __m256i ones = _mm256_and_si256(sign, one);
271 val = _mm256_add_epi64(val, ones); // 2's complement
272 _mm256_storeu_si256((__m256i*)p, val);
273 }
274 }
275 }
276}
277
278#endif
ui64 avx2_find_max_val64(ui64 *address)
void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count)
ui32 avx2_find_max_val32(ui32 *address)
void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64 *max_val)
void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
#define ojph_unused(x)
Definition ojph_defs.h:78