39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
// Fragment: folds the four 32-bit lanes stored at `address` together with a
// bitwise OR (not a numeric maximum).
// NOTE(review): the enclosing function header, braces and any return statement
// are not visible here; this presumably belongs to sse2_find_max_val32 -- confirm.
51 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
// 0xEE selects lanes {2,3,2,3}: the upper two lanes are brought down over the lower two.
52 x1 = _mm_shuffle_epi32(x0, 0xEE);
53 x0 = _mm_or_si128(x0, x1);
// 0x55 broadcasts lane 1, so after this OR lane 0 holds the OR of all four lanes.
54 x1 = _mm_shuffle_epi32(x0, 0x55);
55 x0 = _mm_or_si128(x0, x1);
// The folded vector is written back in place: all 16 bytes at `address` are overwritten.
56 _mm_storeu_si128((__m128i*)address, x0);
// Fragment: folds the two 64-bit lanes stored at `address` together with a
// bitwise OR (not a numeric maximum).
// NOTE(review): the enclosing function header, braces and any return statement
// are not visible here; this presumably belongs to sse2_find_max_val64 -- confirm.
68 __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
// 0xEE selects 32-bit lanes {2,3,2,3}, i.e. copies the upper 64-bit lane over the lower one.
69 x1 = _mm_shuffle_epi32(x0, 0xEE);
// After this OR the lower 64-bit lane holds the OR of both 64-bit lanes.
70 x0 = _mm_or_si128(x0, x1);
// Written back in place: all 16 bytes at `address` are overwritten.
71 _mm_storeu_si128((__m128i*)address, x0);
// Fragment: tail of a parameter list followed by a body that converts signed
// 32-bit integers to a sign-bit + shifted-magnitude form and ORs the magnitudes
// into an accumulator.
// NOTE(review): the start of the signature (which supplies sp, dp and K_max),
// the braces, and whatever guards the second (tail) sequence are not visible;
// this presumably belongs to sse2_rev_tx_to_cb32 -- confirm.
// delta_inv is not referenced by any visible line.
82 float delta_inv,
ui32 count,
ui32* max_val)
// Magnitudes are shifted left by (31 - K_max) bits.
87 ui32 shift = 31 - K_max;
// m0 has only the most significant bit of each 32-bit lane set.
88 __m128i m0 = _mm_set1_epi32(INT_MIN);
89 __m128i zero = _mm_setzero_si128();
90 __m128i one = _mm_set1_epi32(1);
// The accumulator is read as a full 128-bit vector, so max_val must address 16 readable bytes.
91 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
92 __m128i *p = (__m128i*)sp;
// Main loop: four 32-bit samples per iteration while at least four remain.
93 for ( ; count >= 4; count -= 4, p += 1, dp += 4)
95 __m128i v = _mm_loadu_si128(p);
// sign is all-ones in lanes where v is negative, zero elsewhere.
96 __m128i sign = _mm_cmplt_epi32(v, zero);
// (v ^ sign) + (sign & 1) negates the negative lanes, leaving the magnitude.
97 __m128i val = _mm_xor_si128(v, sign);
98 __m128i ones = _mm_and_si128(sign, one);
99 val = _mm_add_epi32(val, ones);
// Reduce the sign mask to just the top bit.
100 sign = _mm_and_si128(sign, m0);
101 val = _mm_slli_epi32(val, (
int)shift);
// The accumulator collects the shifted magnitudes before the sign bit is merged in.
102 tmax = _mm_or_si128(tmax, val);
103 val = _mm_or_si128(val, sign);
104 _mm_storeu_si128((__m128i*)dp, val);
// Tail sequence: same conversion applied once more to the next 128 bits.
// NOTE(review): a full 16 bytes are loaded from p and stored to dp here regardless
// of count, so both buffers presumably need padding to a multiple of four samples -- confirm.
108 __m128i v = _mm_loadu_si128(p);
109 __m128i sign = _mm_cmplt_epi32(v, zero);
110 __m128i val = _mm_xor_si128(v, sign);
111 __m128i ones = _mm_and_si128(sign, one);
112 val = _mm_add_epi32(val, ones);
113 sign = _mm_and_si128(sign, m0);
114 val = _mm_slli_epi32(val, (
int)shift);
// mask is all-ones only in lanes whose index is below the remaining count, so
// lanes past the end do not contribute to the accumulator.
116 __m128i c = _mm_set1_epi32((
si32)count);
117 __m128i idx = _mm_set_epi32(3, 2, 1, 0);
118 __m128i mask = _mm_cmpgt_epi32(c, idx);
119 c = _mm_and_si128(val, mask);
120 tmax = _mm_or_si128(tmax, c);
122 val = _mm_or_si128(val, sign);
123 _mm_storeu_si128((__m128i*)dp, val);
// The per-lane accumulator is written back as 16 bytes; it is not reduced to a single value here.
125 _mm_storeu_si128((__m128i*)max_val, tmax);
// Fragment: tail of a parameter list followed by a body that scales floats by
// delta_inv, converts them to 32-bit integers, and stores them in a
// sign-bit + magnitude form while ORing the magnitudes into an accumulator.
// NOTE(review): the start of the signature (which supplies sp and dp), the
// braces, and whatever guards the second (tail) sequence are not visible; this
// presumably belongs to sse2_irv_tx_to_cb32 -- confirm.
130 float delta_inv,
ui32 count,
ui32* max_val)
// Broadcast the scale factor to all four float lanes.
136 __m128 d = _mm_set1_ps(delta_inv);
137 __m128i zero = _mm_setzero_si128();
138 __m128i one = _mm_set1_epi32(1);
// The accumulator is read as a full 128-bit vector, so max_val must address 16 readable bytes.
139 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
140 float *p = (
float*)sp;
// Main loop: four samples per iteration while at least four remain.
141 for ( ; count >= 4; count -= 4, p += 4, dp += 4)
143 __m128 vf = _mm_loadu_ps(p);
144 vf = _mm_mul_ps(vf, d);
// _mm_cvtps_epi32 rounds according to the current MXCSR rounding mode.
145 __m128i val = _mm_cvtps_epi32(vf);
// sign is all-ones in lanes where the converted integer is negative.
146 __m128i sign = _mm_cmplt_epi32(val, zero);
// (val ^ sign) + (sign & 1) negates the negative lanes, leaving the magnitude.
147 val = _mm_xor_si128(val, sign);
148 __m128i ones = _mm_and_si128(sign, one);
149 val = _mm_add_epi32(val, ones);
// The accumulator collects the magnitudes before the sign bit is merged in.
150 tmax = _mm_or_si128(tmax, val);
// Shifting the all-ones mask left by 31 leaves only the top bit set.
151 sign = _mm_slli_epi32(sign, 31);
152 val = _mm_or_si128(val, sign);
153 _mm_storeu_si128((__m128i*)dp, val);
// Tail sequence: same conversion applied once more to the next four floats.
// NOTE(review): a full 16 bytes are loaded from p and stored to dp here regardless
// of count, so both buffers presumably need padding to a multiple of four samples -- confirm.
157 __m128 vf = _mm_loadu_ps(p);
158 vf = _mm_mul_ps(vf, d);
159 __m128i val = _mm_cvtps_epi32(vf);
160 __m128i sign = _mm_cmplt_epi32(val, zero);
161 val = _mm_xor_si128(val, sign);
162 __m128i ones = _mm_and_si128(sign, one);
163 val = _mm_add_epi32(val, ones);
// mask is all-ones only in lanes whose index is below the remaining count, so
// lanes past the end do not contribute to the accumulator.
165 __m128i c = _mm_set1_epi32((
si32)count);
166 __m128i idx = _mm_set_epi32(3, 2, 1, 0);
167 __m128i mask = _mm_cmpgt_epi32(c, idx);
168 c = _mm_and_si128(val, mask);
169 tmax = _mm_or_si128(tmax, c);
171 sign = _mm_slli_epi32(sign, 31);
172 val = _mm_or_si128(val, sign);
173 _mm_storeu_si128((__m128i*)dp, val);
// The per-lane accumulator is written back as 16 bytes; it is not reduced to a single value here.
175 _mm_storeu_si128((__m128i*)max_val, tmax);
// Fragment: tail of a parameter list followed by a body that converts 32-bit
// sign-bit + shifted-magnitude words back to two's-complement integers.
// NOTE(review): the start of the signature (which supplies sp and K_max), the
// declaration of `p`, and the braces are not visible; this presumably belongs
// to sse2_rev_tx_from_cb32 -- confirm. delta is not referenced by any visible line.
180 float delta,
ui32 count)
// Magnitudes are shifted right by (31 - K_max) bits.
183 ui32 shift = 31 - K_max;
// m1 has every bit except the most significant one set; ANDing with it clears the sign bit.
184 __m128i m1 = _mm_set1_epi32(INT_MAX);
185 __m128i zero = _mm_setzero_si128();
186 __m128i one = _mm_set1_epi32(1);
// Steps four samples at a time while i < count, so when count is not a multiple
// of four the last iteration reads and writes past `count` samples.
188 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
// Aligned load: sp must be 16-byte aligned.
190 __m128i v = _mm_load_si128((__m128i*)sp);
191 __m128i val = _mm_and_si128(v, m1);
192 val = _mm_srli_epi32(val, (
int)shift);
// sign is all-ones in lanes whose top bit is set in the input word.
193 __m128i sign = _mm_cmplt_epi32(v, zero);
// (val ^ sign) + (sign & 1) negates the magnitude in those lanes.
194 val = _mm_xor_si128(val, sign);
195 __m128i ones = _mm_and_si128(sign, one);
196 val = _mm_add_epi32(val, ones);
197 _mm_storeu_si128((__m128i*)p, val);
// Fragment: tail of a parameter list followed by a body that converts 32-bit
// sign-bit + magnitude words to floats scaled by delta.
// NOTE(review): the start of the signature (which supplies sp and dp) and the
// braces are not visible; this presumably belongs to sse2_irv_tx_from_cb32 -- confirm.
203 float delta,
ui32 count)
// m1 has every bit except the most significant one set.
206 __m128i m1 = _mm_set1_epi32(INT_MAX);
// Broadcast the scale factor to all four float lanes.
207 __m128 d = _mm_set1_ps(delta);
208 float *p = (
float*)dp;
// Steps four samples at a time while i < count, so when count is not a multiple
// of four the last iteration reads and writes past `count` samples.
209 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
// Aligned load: sp must be 16-byte aligned.
211 __m128i v = _mm_load_si128((__m128i*)sp);
// Magnitude: the input word with its top bit cleared.
212 __m128i vali = _mm_and_si128(v, m1);
213 __m128 valf = _mm_cvtepi32_ps(vali);
214 valf = _mm_mul_ps(valf, d);
// (~m1 & v) isolates the top bit, which is then ORed into the float's sign bit.
215 __m128i sign = _mm_andnot_si128(m1, v);
216 valf = _mm_or_ps(valf, _mm_castsi128_ps(sign));
217 _mm_storeu_ps(p, valf);
// Fragment: tail of a parameter list followed by a body that converts signed
// 64-bit integers to a sign-bit + shifted-magnitude form and ORs the magnitudes
// into an accumulator.
// NOTE(review): the start of the signature (which supplies sp, dp and K_max),
// the braces, and whatever guards the second (tail) sequence are not visible;
// this presumably belongs to sse2_rev_tx_to_cb64 -- confirm.
// delta_inv is not referenced by any visible line.
223 float delta_inv,
ui32 count,
ui64* max_val)
// Magnitudes are shifted left by (63 - K_max) bits.
228 ui32 shift = 63 - K_max;
// m0 has only the most significant bit of each 64-bit lane set.
229 __m128i m0 = _mm_set1_epi64x(LLONG_MIN);
230 __m128i zero = _mm_setzero_si128();
231 __m128i one = _mm_set1_epi64x(1);
// The accumulator is read as a full 128-bit vector, so max_val must address 16 readable bytes.
232 __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
233 __m128i *p = (__m128i*)sp;
// Main loop: two 64-bit samples per iteration while at least two remain.
234 for ( ; count >= 2; count -= 2, p += 1, dp += 2)
236 __m128i v = _mm_loadu_si128(p);
// The comparison is done on 32-bit lanes; 0xF5 selects lanes {1,1,3,3}, copying
// each 64-bit element's high-half result across the whole element to form a
// 64-bit-wide sign mask.
237 __m128i sign = _mm_cmplt_epi32(v, zero);
238 sign = _mm_shuffle_epi32(sign, 0xF5);
// (v ^ sign) + (sign & 1), with a 64-bit add, negates the negative elements.
239 __m128i val = _mm_xor_si128(v, sign);
240 __m128i ones = _mm_and_si128(sign, one);
241 val = _mm_add_epi64(val, ones);
// Reduce the sign mask to just the top bit of each 64-bit lane.
242 sign = _mm_and_si128(sign, m0);
243 val = _mm_slli_epi64(val, (
int)shift);
// The accumulator collects the shifted magnitudes before the sign bit is merged in.
244 tmax = _mm_or_si128(tmax, val);
245 val = _mm_or_si128(val, sign);
246 _mm_storeu_si128((__m128i*)dp, val);
// Tail sequence: same conversion applied once more to the next 128 bits.
// NOTE(review): a full 16 bytes are loaded from p and stored to dp here regardless
// of count, so both buffers presumably need padding to an even number of samples -- confirm.
250 __m128i v = _mm_loadu_si128(p);
251 __m128i sign = _mm_cmplt_epi32(v, zero);
252 sign = _mm_shuffle_epi32(sign, 0xF5);
253 __m128i val = _mm_xor_si128(v, sign);
254 __m128i ones = _mm_and_si128(sign, one);
255 val = _mm_add_epi64(val, ones);
256 sign = _mm_and_si128(sign, m0);
257 val = _mm_slli_epi64(val, (
int)shift);
// c keeps only the lower 64-bit lane, so the upper element does not contribute
// to the accumulator.
259 __m128i c = _mm_set_epi32(0, 0, (
si32)0xFFFFFFFF, (
si32)0xFFFFFFFF);
260 c = _mm_and_si128(val, c);
261 tmax = _mm_or_si128(tmax, c);
263 val = _mm_or_si128(val, sign);
264 _mm_storeu_si128((__m128i*)dp, val);
// The per-lane accumulator is written back as 16 bytes; it is not reduced to a single value here.
266 _mm_storeu_si128((__m128i*)max_val, tmax);
// Fragment: tail of a parameter list followed by a body that converts 64-bit
// sign-bit + shifted-magnitude words back to two's-complement integers.
// NOTE(review): the start of the signature (which supplies sp and K_max), the
// declaration of `p`, and the braces are not visible; this presumably belongs
// to sse2_rev_tx_from_cb64 -- confirm. delta is not referenced by any visible line.
271 float delta,
ui32 count)
// Magnitudes are shifted right by (63 - K_max) bits.
274 ui32 shift = 63 - K_max;
// m1 has every bit except the most significant one of each 64-bit lane set.
275 __m128i m1 = _mm_set1_epi64x(LLONG_MAX);
276 __m128i zero = _mm_setzero_si128();
277 __m128i one = _mm_set1_epi64x(1);
// Steps two samples at a time while i < count, so when count is odd the last
// iteration reads and writes one sample past `count`.
279 for (
ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
// Aligned load: sp must be 16-byte aligned.
281 __m128i v = _mm_load_si128((__m128i*)sp);
282 __m128i val = _mm_and_si128(v, m1);
283 val = _mm_srli_epi64(val, (
int)shift);
// The comparison is done on 32-bit lanes; 0xF5 selects lanes {1,1,3,3}, copying
// each 64-bit element's high-half result across the whole element to form a
// 64-bit-wide sign mask.
284 __m128i sign = _mm_cmplt_epi32(v, zero);
285 sign = _mm_shuffle_epi32(sign, 0xF5);
// (val ^ sign) + (sign & 1), with a 64-bit add, negates the magnitude in those elements.
286 val = _mm_xor_si128(val, sign);
287 __m128i ones = _mm_and_si128(sign, one);
288 val = _mm_add_epi64(val, ones);
289 _mm_storeu_si128((__m128i*)p, val);
// Bare function signatures with no bodies or terminators.
// NOTE(review): these look like the headers of the body fragments earlier in
// this text, listed separately and in a different order; the pairing of each
// signature with its body is inferred from the parameter lists -- confirm
// against the original file.
// The "to_cb" variants take an untyped source, a typed destination and a
// max_val pointer; the "from_cb" variants take a typed source and an untyped
// destination.
void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
ui32 sse2_find_max_val32(ui32 *address)
ui64 sse2_find_max_val64(ui64 *address)
void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64 *max_val)
void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)