OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_sse2.cpp
//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
// File: ojph_transform_sse2.cpp
// Author: Aous Naman
// Date: 28 August 2019
//***************************************************************************/

#include "ojph_arch.h"
#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)

#include <cassert>
#include <climits>
#include <cstdio>

#include "ojph_defs.h"
#include "ojph_mem.h"
#include "ojph_params.h"

#include "ojph_transform.h"

#include <emmintrin.h>

namespace ojph {
  namespace local {

    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
    {
      // note that m must be obtained using
      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
      __m128i x = _mm_srli_epi64(a, amt);
      x = _mm_xor_si128(x, m);
      __m128i result = _mm_sub_epi64(x, m);
      return result;
    }
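    // Added reading aid: the three steps above emulate a 64-bit
    // arithmetic right shift, which SSE2 lacks. _mm_srli_epi64 moves each
    // lane's sign bit down to bit (63 - amt); XORing with
    // m = 1 << (63 - amt) and then subtracting m sign-extends that bit:
    // if it is 0, (x ^ m) - m = (x + m) - m = x; if it is 1,
    // (x ^ m) - m = (x - m) - m = x - 2m, which fills all bits above
    // (63 - amt) with ones. For example, with amt = 2, a lane holding -8
    // (0xFFFF...FFF8) becomes 0x3FFF...FFFE after the logical shift, and
    // the fix-up yields 0xFFFF...FFFE, i.e. -2, as required.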

    static inline
    void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
    {
      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
      {
        __m128 a = _mm_load_ps(sp);
        __m128 b = _mm_load_ps(sp + 4);
        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
        _mm_store_ps(dpl, c);
        _mm_store_ps(dph, d);
      }
    }
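    // Added reading aid: with the immediates used above,
    // _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)) gathers the
    // even-indexed lanes [a0 a2 b0 b2] and _MM_SHUFFLE(3, 1, 3, 1) the
    // odd-indexed lanes [a1 a3 b1 b3], so an interleaved line
    // [L0 H0 L1 H1 L2 H2 L3 H3] splits into [L0 L1 L2 L3] and
    // [H0 H1 H2 H3]. The reversible path below also calls this on si32
    // data through the f32 pointers; that is safe because these loads,
    // shuffles, and stores only move bit patterns and perform no float
    // arithmetic.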

    static inline
    void sse2_interleave32(float* dp, float* spl, float* sph, int width)
    {
      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
      {
        __m128 a = _mm_load_ps(spl);
        __m128 b = _mm_load_ps(sph);
        __m128 c = _mm_unpacklo_ps(a, b);
        __m128 d = _mm_unpackhi_ps(a, b);
        _mm_store_ps(dp, c);
        _mm_store_ps(dp + 4, d);
      }
    }
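    // Added reading aid: _mm_unpacklo_ps(a, b) = [a0 b0 a1 b1] and
    // _mm_unpackhi_ps(a, b) = [a2 b2 a3 b3], so the two stores emit
    // [L0 H0 L1 H1 L2 H2 L3 H3]; this is the exact inverse of
    // sse2_deinterleave32 above.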

    static inline
    void sse2_deinterleave64(double* dpl, double* dph, double* sp, int width)
    {
      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
      {
        __m128d a = _mm_load_pd(sp);
        __m128d b = _mm_load_pd(sp + 2);
        __m128d c = _mm_shuffle_pd(a, b, 0);
        __m128d d = _mm_shuffle_pd(a, b, 3);
        _mm_store_pd(dpl, c);
        _mm_store_pd(dph, d);
      }
    }

    static inline
    void sse2_interleave64(double* dp, double* spl, double* sph, int width)
    {
      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
      {
        __m128d a = _mm_load_pd(spl);
        __m128d b = _mm_load_pd(sph);
        __m128d c = _mm_unpacklo_pd(a, b);
        __m128d d = _mm_unpackhi_pd(a, b);
        _mm_store_pd(dp, c);
        _mm_store_pd(dp + 2, d);
      }
    }
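    // A caveat worth noting (added commentary): the four helpers above
    // round the trip count up to a whole vector, so they may read and
    // write up to one SIMD register beyond `width` elements. The line
    // buffers are therefore assumed to be 16-byte aligned and allocated
    // with enough tail padding, which is how the rest of this
    // translation unit uses them as well.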

    static
    void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
                              const line_buf* other, const line_buf* aug,
                              ui32 repeat, bool synthesis)
    {
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      __m128i vb = _mm_set1_epi32(b);

      si32* dst = aug->i32;
      const si32* src1 = sig->i32, * src2 = other->i32;
      // The general definition of the wavelet in Part 2 is slightly
      // different to that in Part 1, although the two are mathematically
      // equivalent; here, we identify the simpler forms from Part 1 and
      // employ them.
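      // All branches below compute the same scalar lifting step,
      //   dst[i] -/+= (b + a * (src1[i] + src2[i])) >> e,
      // with '-' for synthesis and '+' for analysis; the special cases
      // merely avoid the multiplication by a when a is +1 or -1.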
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i v = _mm_add_epi32(vb, t);
            __m128i w = _mm_srai_epi32(v, e);
            d = _mm_sub_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i v = _mm_add_epi32(vb, t);
            __m128i w = _mm_srai_epi32(v, e);
            d = _mm_add_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i w = _mm_srai_epi32(t, e);
            d = _mm_add_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i w = _mm_srai_epi32(t, e);
            d = _mm_sub_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else if (a == -1)
      { // any case with a == -1, which is not 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i v = _mm_sub_epi32(vb, t);
            __m128i w = _mm_srai_epi32(v, e);
            d = _mm_sub_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi32(s1, s2);
            __m128i v = _mm_sub_epi32(vb, t);
            __m128i w = _mm_srai_epi32(v, e);
            d = _mm_add_epi32(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else { // general case
        // 32bit multiplication is not supported in sse2; we need sse4.1,
        // where we can use _mm_mullo_epi32, which multiplies 32bit x 32bit,
        // keeping the LSBs
        if (synthesis)
          for (ui32 i = repeat; i > 0; --i)
            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
        else
          for (ui32 i = repeat; i > 0; --i)
            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
      }
    }

    static
    void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
                              const line_buf* other, const line_buf* aug,
                              ui32 repeat, bool synthesis)
    {
      const si64 a = s->rev.Aatk;
      const si64 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      __m128i vb = _mm_set1_epi64x(b);
      __m128i ve = _mm_set1_epi64x(1LL << (63 - e));

      si64* dst = aug->i64;
      const si64* src1 = sig->i64, * src2 = other->i64;
      // The general definition of the wavelet in Part 2 is slightly
      // different to that in Part 1, although the two are mathematically
      // equivalent; here, we identify the simpler forms from Part 1 and
      // employ them.
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i v = _mm_add_epi64(vb, t);
            __m128i w = sse2_mm_srai_epi64(v, e, ve);
            d = _mm_sub_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i v = _mm_add_epi64(vb, t);
            __m128i w = sse2_mm_srai_epi64(v, e, ve);
            d = _mm_add_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i w = sse2_mm_srai_epi64(t, e, ve);
            d = _mm_add_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i w = sse2_mm_srai_epi64(t, e, ve);
            d = _mm_sub_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else if (a == -1)
      { // any case with a == -1, which is not 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i v = _mm_sub_epi64(vb, t);
            __m128i w = sse2_mm_srai_epi64(v, e, ve);
            d = _mm_sub_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            __m128i s1 = _mm_load_si128((__m128i*)src1);
            __m128i s2 = _mm_load_si128((__m128i*)src2);
            __m128i d = _mm_load_si128((__m128i*)dst);
            __m128i t = _mm_add_epi64(s1, s2);
            __m128i v = _mm_sub_epi64(vb, t);
            __m128i w = sse2_mm_srai_epi64(v, e, ve);
            d = _mm_add_epi64(d, w);
            _mm_store_si128((__m128i*)dst, d);
          }
      }
      else { // general case
        // 64bit multiplication is not supported in sse2
        if (synthesis)
          for (ui32 i = repeat; i > 0; --i)
            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
        else
          for (ui32 i = repeat; i > 0; --i)
            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
      }
    }

    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis)
    {
      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
          ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
      {
        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
               (other == NULL || other->flags & line_buf::LFT_32BIT) &&
               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
        sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
      }
      else
      {
        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
               (other == NULL || other->flags & line_buf::LFT_64BIT) &&
               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
        sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
      }
    }
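    // Dispatch note (added commentary): a line is processed with the
    // 32-bit kernel if any of the three buffers is tagged LFT_32BIT, and
    // with the 64-bit kernel otherwise; the asserts then verify that all
    // supplied buffers carry the same width tag, since a single lifting
    // step cannot mix sample widths here.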

    static
    void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
                             const line_buf* hdst, const line_buf* src,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        // split src into ldst and hdst
        {
          float* dpl = even ? ldst->f32 : hdst->f32;
          float* dph = even ? hdst->f32 : ldst->f32;
          float* sp = src->f32;
          int w = (int)width;
          sse2_deinterleave32(dpl, dph, sp, w);
        }

        si32* hp = hdst->i32, * lp = ldst->i32;
        ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
        ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = num_steps; j > 0; --j)
        {
          // first lifting step
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi32(b);

          // extension
          lp[-1] = lp[0];
          lp[l_width] = lp[l_width - 1];
          // lifting step
          const si32* sp = lp;
          si32* dp = hp;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)h_width;
            if (even)
            {
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_add_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            }
            else
            {
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_add_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i w = _mm_srai_epi32(t, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i w = _mm_srai_epi32(t, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1, which is not 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_sub_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_sub_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else {
            // general case
            // 32bit multiplication is not supported in sse2; we need
            // sse4.1, where we can use _mm_mullo_epi32
            if (even)
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[0] + sp[1])) >> e;
            else
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[-1] + sp[0])) >> e;
          }

          // swap buffers
          si32* t = lp; lp = hp; hp = t;
          even = !even;
          ui32 w = l_width; l_width = h_width; h_width = w;
        }
      }
      else {
        if (even)
          ldst->i32[0] = src->i32[0];
        else
          hdst->i32[0] = src->i32[0] << 1;
      }
    }
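    // Structure of the analysis above (added commentary): the input line
    // is first deinterleaved into low-pass and high-pass halves; the
    // lifting steps are then applied from the last step to the first,
    // each updating the "high" buffer from its symmetrically extended
    // "low" neighbor (lp[-1] = lp[0], lp[l_width] = lp[l_width - 1]),
    // after which the two buffers swap roles. The 64-bit version below
    // follows the same pattern with two samples per vector instead of
    // four.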

    static
    void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
                             const line_buf* hdst, const line_buf* src,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        // split src into ldst and hdst
        {
          double* dpl = (double*)(even ? ldst->p : hdst->p);
          double* dph = (double*)(even ? hdst->p : ldst->p);
          double* sp = (double*)src->p;
          int w = (int)width;
          sse2_deinterleave64(dpl, dph, sp, w);
        }

        si64* hp = hdst->i64, * lp = ldst->i64;
        ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
        ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = num_steps; j > 0; --j)
        {
          // first lifting step
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi64x(b);
          __m128i ve = _mm_set1_epi64x(1LL << (63 - e));

          // extension
          lp[-1] = lp[0];
          lp[l_width] = lp[l_width - 1];
          // lifting step
          const si64* sp = lp;
          si64* dp = hp;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)h_width;
            if (even)
            {
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_add_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            }
            else
            {
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_add_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i w = sse2_mm_srai_epi64(t, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i w = sse2_mm_srai_epi64(t, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1, which is not 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_sub_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_sub_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else {
            // general case
            // 64bit multiplication is not supported in sse2
            if (even)
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[0] + sp[1])) >> e;
            else
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[-1] + sp[0])) >> e;
          }

          // swap buffers
          si64* t = lp; lp = hp; hp = t;
          even = !even;
          ui32 w = l_width; l_width = h_width; h_width = w;
        }
      }
      else {
        if (even)
          ldst->i64[0] = src->i64[0];
        else
          hdst->i64[0] = src->i64[0] << 1;
      }
    }

    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                           const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even)
    {
      if (src->flags & line_buf::LFT_32BIT)
      {
        assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
               (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
        sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
      }
      else
      {
        assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
               (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) &&
               (src == NULL || src->flags & line_buf::LFT_64BIT));
        sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
      }
    }

    void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
                             const line_buf* lsrc, const line_buf* hsrc,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        bool ev = even;
        si32* oth = hsrc->i32, * aug = lsrc->i32;
        ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
        ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = 0; j < num_steps; ++j)
        {
          const lifting_step* s = atk->get_step(j);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi32(b);

          // extension
          oth[-1] = oth[0];
          oth[oth_width] = oth[oth_width - 1];
          // lifting step
          const si32* sp = oth;
          si32* dp = aug;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)aug_width;
            if (ev)
            {
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_add_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            }
            else
            {
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_add_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i w = _mm_srai_epi32(t, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i w = _mm_srai_epi32(t, e);
                d = _mm_add_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1, which is not 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_sub_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi32(s1, s2);
                __m128i v = _mm_sub_epi32(vb, t);
                __m128i w = _mm_srai_epi32(v, e);
                d = _mm_sub_epi32(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else {
            // general case
            // 32bit multiplication is not supported in sse2; we need
            // sse4.1, where we can use _mm_mullo_epi32, which multiplies
            // 32bit x 32bit, keeping the LSBs
            if (ev)
              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
            else
              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
                *dp -= (b + a * (sp[0] + sp[1])) >> e;
          }

          // swap buffers
          si32* t = aug; aug = oth; oth = t;
          ev = !ev;
          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
        }

        // combine both lsrc and hsrc into dst
        {
          float* dp = dst->f32;
          float* spl = even ? lsrc->f32 : hsrc->f32;
          float* sph = even ? hsrc->f32 : lsrc->f32;
          int w = (int)width;
          sse2_interleave32(dp, spl, sph, w);
        }
      }
      else {
        if (even)
          dst->i32[0] = lsrc->i32[0];
        else
          dst->i32[0] = hsrc->i32[0] >> 1;
      }
    }
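    // Added commentary: the synthesis above mirrors sse2_rev_horz_ana32
    // exactly; steps run from first to last instead of last to first,
    // each update subtracts where analysis added, and the low/high
    // halves are re-interleaved into the output line only after all
    // steps complete.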

    void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
                             const line_buf* lsrc, const line_buf* hsrc,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        bool ev = even;
        si64* oth = hsrc->i64, * aug = lsrc->i64;
        ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
        ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = 0; j < num_steps; ++j)
        {
          const lifting_step* s = atk->get_step(j);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi64x(b);
          __m128i ve = _mm_set1_epi64x(1LL << (63 - e));

          // extension
          oth[-1] = oth[0];
          oth[oth_width] = oth[oth_width - 1];
          // lifting step
          const si64* sp = oth;
          si64* dp = aug;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)aug_width;
            if (ev)
            {
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_add_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            }
            else
            {
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_add_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i w = sse2_mm_srai_epi64(t, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i w = sse2_mm_srai_epi64(t, e, ve);
                d = _mm_add_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1, which is not 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_sub_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                __m128i s1 = _mm_load_si128((__m128i*)sp);
                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
                __m128i d = _mm_load_si128((__m128i*)dp);
                __m128i t = _mm_add_epi64(s1, s2);
                __m128i v = _mm_sub_epi64(vb, t);
                __m128i w = sse2_mm_srai_epi64(v, e, ve);
                d = _mm_sub_epi64(d, w);
                _mm_store_si128((__m128i*)dp, d);
              }
          }
          else {
            // general case
            // 64bit multiplication is not supported in sse2
            if (ev)
              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
            else
              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
                *dp -= (b + a * (sp[0] + sp[1])) >> e;
          }

          // swap buffers
          si64* t = aug; aug = oth; oth = t;
          ev = !ev;
          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
        }

        // combine both lsrc and hsrc into dst
        {
          double* dp = (double*)dst->p;
          double* spl = (double*)(even ? lsrc->p : hsrc->p);
          double* sph = (double*)(even ? hsrc->p : lsrc->p);
          int w = (int)width;
          sse2_interleave64(dp, spl, sph, w);
        }
      }
      else {
        if (even)
          dst->i64[0] = lsrc->i64[0];
        else
          dst->i64[0] = hsrc->i64[0] >> 1;
      }
    }

    void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
                           const line_buf* lsrc, const line_buf* hsrc,
                           ui32 width, bool even)
    {
      if (dst->flags & line_buf::LFT_32BIT)
      {
        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
        sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
      }
      else
      {
        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
        sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
      }
    }

  } // !local
} // !ojph

#endif