39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
// Fragment: folds the eight 32-bit words at `address` into a single ui32 by
// bitwise OR (an OR-reduction, not an arithmetic maximum).
// NOTE(review): the enclosing function header, braces and return statement
// are not visible in this chunk -- presumably avx2_find_max_val32; confirm.
// Two unaligned 128-bit loads cover 32 consecutive bytes.
52 __m128i x0 = _mm_loadu_si128((__m128i*)address);
53 __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
54 x0 = _mm_or_si128(x0, x1);
// 0xEE selects elements [2,3,2,3]: folds the upper 64 bits onto the lower.
55 x1 = _mm_shuffle_epi32(x0, 0xEE);
56 x0 = _mm_or_si128(x0, x1);
// 0x55 broadcasts element 1 so it can be folded onto element 0.
57 x1 = _mm_shuffle_epi32(x0, 0x55);
58 x0 = _mm_or_si128(x0, x1);
// Element 0 now holds the OR of all eight input words.
59 ui32 t = (
ui32)_mm_extract_epi32(x0, 0);
// Fragment: folds the four 64-bit words at `address` into a single ui64 by
// bitwise OR (an OR-reduction, not an arithmetic maximum).
// NOTE(review): the function header, the declaration of `t`, the `#else` /
// `#endif` lines and the return statement are not visible in this chunk --
// presumably avx2_find_max_val64; confirm against the full file.
// Two unaligned 128-bit loads cover 32 consecutive bytes.
66 __m128i x0 = _mm_loadu_si128((__m128i*)address);
67 __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
68 x0 = _mm_or_si128(x0, x1);
// 0xEE selects 32-bit elements [2,3,2,3]: folds the upper 64-bit half onto
// the lower, so the low 64 bits hold the OR of all four inputs.
69 x1 = _mm_shuffle_epi32(x0, 0xEE);
70 x0 = _mm_or_si128(x0, x1);
// 64-bit targets read the low 64-bit lane directly.
72#ifdef OJPH_ARCH_X86_64
73 t = (
ui64)_mm_extract_epi64(x0, 0);
// 32-bit targets assemble the value from two 32-bit extracts -- presumably
// because _mm_extract_epi64 is unavailable there; confirm.
74#elif (defined OJPH_ARCH_I386)
75 t = (
ui64)(
ui32)_mm_extract_epi32(x0, 0);
// The casts bind tighter than `<<`, so the shift is done on a ui64 value.
76 t |= (
ui64)(
ui32)_mm_extract_epi32(x0, 1) << 32;
// Any other architecture is rejected at compile time.
78 #error Error unsupport compiler
// Fragment: converts 32-bit two's-complement samples from `sp` to a
// sign-magnitude layout in `dp` (sign in bit 31, magnitude shifted left by
// 31 - K_max) and ORs every magnitude into an 8-lane accumulator that is
// read from and written back to `max_val`.
// NOTE(review): the start of the function header, the braces and the guard
// around the tail section are not visible in this chunk -- presumably
// avx2_rev_tx_to_cb32; confirm. `delta_inv` is not used in the visible lines.
85 float delta_inv,
ui32 count,
ui32* max_val)
90 ui32 shift = 31 - K_max;
// INT_MIN has only the sign bit set; used to isolate each lane's sign.
91 __m256i m0 = _mm256_set1_epi32(INT_MIN);
// max_val is accessed as 8 consecutive ui32 lanes (32 bytes).
92 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
93 __m256i *p = (__m256i*)sp;
// Main loop: eight samples per iteration while at least eight remain.
94 for ( ; count >= 8; count -= 8, p += 1, dp += 8)
96 __m256i v = _mm256_loadu_si256(p);
97 __m256i sign = _mm256_and_si256(v, m0);
98 __m256i val = _mm256_abs_epi32(v);
99 val = _mm256_slli_epi32(val, (
int)shift);
// Accumulate the magnitude only; the sign bit is merged in afterwards.
100 tmax = _mm256_or_si256(tmax, val);
101 val = _mm256_or_si256(val, sign);
102 _mm256_storeu_si256((__m256i*)dp, val);
// Tail: same conversion for the remaining (fewer than eight) samples.
// NOTE(review): a full 32-byte load and store is still performed here, so
// this appears to rely on padded buffers -- confirm with the callers.
106 __m256i v = _mm256_loadu_si256(p);
107 __m256i sign = _mm256_and_si256(v, m0);
108 __m256i val = _mm256_abs_epi32(v);
109 val = _mm256_slli_epi32(val, (
int)shift);
// Lanes whose index is >= count are masked out of the accumulator update.
111 __m256i c = _mm256_set1_epi32((
si32)count);
112 __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
113 __m256i mask = _mm256_cmpgt_epi32(c, idx);
114 c = _mm256_and_si256(val, mask);
115 tmax = _mm256_or_si256(tmax, c);
117 val = _mm256_or_si256(val, sign);
118 _mm256_storeu_si256((__m256i*)dp, val);
// Write the updated 8-lane accumulator back to the caller.
120 _mm256_storeu_si256((__m256i*)max_val, tmax);
// Fragment: scales float samples from `sp` by `delta_inv`, converts them to
// 32-bit integers, stores them to `dp` in sign-magnitude form (sign in
// bit 31) and ORs every magnitude into an 8-lane accumulator that is read
// from and written back to `max_val`.
// NOTE(review): the start of the function header, the braces and the guard
// around the tail section are not visible in this chunk -- presumably
// avx2_irv_tx_to_cb32; confirm. `K_max` is not used in the visible lines.
125 float delta_inv,
ui32 count,
ui32* max_val)
130 __m256 d = _mm256_set1_ps(delta_inv);
// INT_MIN has only the sign bit set; used to isolate each lane's sign.
131 __m256i m0 = _mm256_set1_epi32(INT_MIN);
// max_val is accessed as 8 consecutive ui32 lanes (32 bytes).
132 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
133 float *p = (
float*)sp;
// Main loop: eight samples per iteration while at least eight remain.
135 for ( ; count >= 8; count -= 8, p += 8, dp += 8)
137 __m256 vf = _mm256_loadu_ps(p);
138 vf = _mm256_mul_ps(vf, d);
// Float-to-int conversion; rounding follows the current SSE rounding mode.
139 __m256i val = _mm256_cvtps_epi32(vf);
140 __m256i sign = _mm256_and_si256(val, m0);
141 val = _mm256_abs_epi32(val);
// Accumulate the magnitude only; the sign bit is merged in afterwards.
142 tmax = _mm256_or_si256(tmax, val);
143 val = _mm256_or_si256(val, sign);
144 _mm256_storeu_si256((__m256i*)dp, val);
// Tail: same conversion for the remaining (fewer than eight) samples.
// NOTE(review): a full 32-byte load and store is still performed here, so
// this appears to rely on padded buffers -- confirm with the callers.
148 __m256 vf = _mm256_loadu_ps(p);
149 vf = _mm256_mul_ps(vf, d);
150 __m256i val = _mm256_cvtps_epi32(vf);
151 __m256i sign = _mm256_and_si256(val, m0);
152 val = _mm256_abs_epi32(val);
// Lanes whose index is >= count are masked out of the accumulator update.
154 __m256i c = _mm256_set1_epi32((
si32)count);
155 __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
156 __m256i mask = _mm256_cmpgt_epi32(c, idx);
157 c = _mm256_and_si256(val, mask);
158 tmax = _mm256_or_si256(tmax, c);
160 val = _mm256_or_si256(val, sign);
161 _mm256_storeu_si256((__m256i*)dp, val);
// Write the updated 8-lane accumulator back to the caller.
163 _mm256_storeu_si256((__m256i*)max_val, tmax);
// Fragment: converts 32-bit sign-magnitude values from `sp` (sign in
// bit 31, magnitude shifted left by 31 - K_max) back to two's-complement
// integers written through `p`.
// NOTE(review): the start of the function header, the braces and the
// declaration of `p` are not visible in this chunk -- presumably
// avx2_rev_tx_from_cb32 with `p` derived from `dp`; confirm. `delta` is not
// used in the visible lines.
168 float delta,
ui32 count)
171 ui32 shift = 31 - K_max;
// INT_MAX has every bit but the sign bit set; used to isolate the magnitude.
172 __m256i m1 = _mm256_set1_epi32(INT_MAX);
// Processes eight values per iteration; when count is not a multiple of 8
// the last iteration reads and writes past `count` elements.
// NOTE(review): this appears to rely on padded buffers -- confirm.
174 for (
ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
// Aligned load: `sp` must be 32-byte aligned.
176 __m256i v = _mm256_load_si256((__m256i*)sp);
177 __m256i val = _mm256_and_si256(v, m1);
178 val = _mm256_srli_epi32(val, (
int)shift);
// Negates the magnitude in lanes where the source word is negative.
179 val = _mm256_sign_epi32(val, v);
180 _mm256_storeu_si256((__m256i*)p, val);
// Fragment: converts 32-bit sign-magnitude values from `sp` (sign in
// bit 31) to floats scaled by `delta`, written to `dp`.
// NOTE(review): the start of the function header and the braces are not
// visible in this chunk -- presumably avx2_irv_tx_from_cb32; confirm.
// `K_max` is not used in the visible lines.
186 float delta,
ui32 count)
// INT_MAX has every bit but the sign bit set; used to isolate the magnitude.
189 __m256i m1 = _mm256_set1_epi32(INT_MAX);
190 __m256 d = _mm256_set1_ps(delta);
191 float *p = (
float*)dp;
// Processes eight values per iteration; when count is not a multiple of 8
// the last iteration reads and writes past `count` elements.
// NOTE(review): this appears to rely on padded buffers -- confirm.
192 for (
ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
// Aligned load: `sp` must be 32-byte aligned.
194 __m256i v = _mm256_load_si256((__m256i*)sp);
195 __m256i vali = _mm256_and_si256(v, m1);
196 __m256 valf = _mm256_cvtepi32_ps(vali);
197 valf = _mm256_mul_ps(valf, d);
// ~m1 & v keeps only the sign bit, which is then OR-ed into the float's
// sign bit to restore the sign.
198 __m256i sign = _mm256_andnot_si256(m1, v);
199 valf = _mm256_or_ps(valf, _mm256_castsi256_ps(sign));
200 _mm256_storeu_ps(p, valf);
// Fragment: converts 64-bit two's-complement samples from `sp` to a
// sign-magnitude layout in `dp` (sign in bit 63, magnitude shifted left by
// 63 - K_max) and ORs every magnitude into a 4-lane accumulator that is
// read from and written back to `max_val`.
// NOTE(review): the start of the function header, the braces and the guard
// around the tail section are not visible in this chunk -- presumably
// avx2_rev_tx_to_cb64; confirm. `delta_inv` is not used in the visible lines.
206 float delta_inv,
ui32 count,
ui64* max_val)
211 ui32 shift = 63 - K_max;
// LLONG_MIN has only the sign bit set.
212 __m256i m0 = _mm256_set1_epi64x(LLONG_MIN);
213 __m256i zero = _mm256_setzero_si256();
214 __m256i one = _mm256_set1_epi64x(1);
// max_val is accessed as 4 consecutive ui64 lanes (32 bytes).
215 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
216 __m256i *p = (__m256i*)sp;
// Main loop: four samples per iteration while at least four remain.
217 for ( ; count >= 4; count -= 4, p += 1, dp += 4)
219 __m256i v = _mm256_loadu_si256(p);
// Absolute value built by hand: `sign` is all ones in negative lanes, so
// (v ^ sign) + (sign & 1) negates exactly those lanes.
220 __m256i sign = _mm256_cmpgt_epi64(zero, v);
221 __m256i val = _mm256_xor_si256(v, sign);
222 __m256i ones = _mm256_and_si256(sign, one);
223 val = _mm256_add_epi64(val, ones);
// Reduce the all-ones mask to just the sign bit.
224 sign = _mm256_and_si256(sign, m0);
225 val = _mm256_slli_epi64(val, (
int)shift);
// Accumulate the magnitude only; the sign bit is merged in afterwards.
226 tmax = _mm256_or_si256(tmax, val);
227 val = _mm256_or_si256(val, sign);
228 _mm256_storeu_si256((__m256i*)dp, val);
// Tail: same conversion for the remaining (fewer than four) samples.
// NOTE(review): a full 32-byte load and store is still performed here, so
// this appears to rely on padded buffers -- confirm with the callers.
232 __m256i v = _mm256_loadu_si256(p);
233 __m256i sign = _mm256_cmpgt_epi64(zero, v);
234 __m256i val = _mm256_xor_si256(v, sign);
235 __m256i ones = _mm256_and_si256(sign, one);
236 val = _mm256_add_epi64(val, ones);
237 sign = _mm256_and_si256(sign, m0);
238 val = _mm256_slli_epi64(val, (
int)shift);
// Lanes whose index is >= count are masked out of the accumulator update.
240 __m256i c = _mm256_set1_epi64x(count);
241 __m256i idx = _mm256_set_epi64x(3, 2, 1, 0);
242 __m256i mask = _mm256_cmpgt_epi64(c, idx);
243 c = _mm256_and_si256(val, mask);
244 tmax = _mm256_or_si256(tmax, c);
246 val = _mm256_or_si256(val, sign);
247 _mm256_storeu_si256((__m256i*)dp, val);
// Write the updated 4-lane accumulator back to the caller.
249 _mm256_storeu_si256((__m256i*)max_val, tmax);
// Fragment: converts 64-bit sign-magnitude values from `sp` (sign in
// bit 63, magnitude shifted left by 63 - K_max) back to two's-complement
// integers written through `p`.
// NOTE(review): the start of the function header, the braces and the
// declaration of `p` are not visible in this chunk -- presumably
// avx2_rev_tx_from_cb64 with `p` derived from `dp`; confirm. `delta` is not
// used in the visible lines.
254 float delta,
ui32 count)
258 ui32 shift = 63 - K_max;
// LLONG_MAX has every bit but the sign bit set; used to isolate the
// magnitude.
259 __m256i m1 = _mm256_set1_epi64x(LLONG_MAX);
260 __m256i zero = _mm256_setzero_si256();
261 __m256i one = _mm256_set1_epi64x(1);
// Processes four values per iteration; when count is not a multiple of 4
// the last iteration reads and writes past `count` elements.
// NOTE(review): this appears to rely on padded buffers -- confirm.
263 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
// Aligned load: `sp` must be 32-byte aligned.
265 __m256i v = _mm256_load_si256((__m256i*)sp);
266 __m256i val = _mm256_and_si256(v, m1);
267 val = _mm256_srli_epi64(val, (
int)shift);
// Conditional negate: `sign` is all ones in lanes whose source word is
// negative, so (val ^ sign) + (sign & 1) negates exactly those lanes.
268 __m256i sign = _mm256_cmpgt_epi64(zero, v);
269 val = _mm256_xor_si256(val, sign);
270 __m256i ones = _mm256_and_si256(sign, one);
271 val = _mm256_add_epi64(val, ones);
272 _mm256_storeu_si256((__m256i*)p, val);
// Signature listing: eight function signatures with no bodies and no
// terminating semicolons.
// NOTE(review): these look like an index of the routines whose body
// fragments appear earlier in this chunk (the parameter names K_max,
// delta, delta_inv, count and max_val match those fragments) -- confirm
// against the full file before relying on the pairing.
ui64 avx2_find_max_val64(ui64 *address)
void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count)
ui32 avx2_find_max_val32(ui32 *address)
void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64 *max_val)
void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)