OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_avx2.cpp
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_avx2.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include "ojph_arch.h"
39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
40
41#include <climits>
42#include <cstdio>
43
44#include "ojph_defs.h"
45#include "ojph_mem.h"
46#include "ojph_params.h"
48
49#include "ojph_transform.h"
51
52#include <immintrin.h>
53
54namespace ojph {
55 namespace local {
56
58 // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
59 static inline
60 __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
61 {
62 // note that m must be obtained using
63 // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
64 __m256i x = _mm256_srli_epi64(a, amt);
65 x = _mm256_xor_si256(x, m);
66 __m256i result = _mm256_sub_epi64(x, m);
67 return result;
68 }
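// Illustration of the trick above: after the logical shift, the original
// sign bit of 'a' lands at bit (63 - amt), which is exactly the single set
// bit of 'm'. XOR-ing with m and then subtracting m leaves non-negative
// values unchanged and fills the vacated top bits with ones for negative
// values, i.e. it sign-extends the logically shifted result. For example,
// with a = -8 and amt = 2, the logical shift gives 0x3FFFFFFFFFFFFFFE, and
// with m = 1 << 61, (x ^ m) - m = 0xFFFFFFFFFFFFFFFE = -2 = (-8 >> 2).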
69
71 static inline
72 void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width)
73 {
74 for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
75 {
76 __m256 a = _mm256_load_ps(sp);
77 __m256 b = _mm256_load_ps(sp + 8);
78 __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
79 __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
80 __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
81 __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
82 _mm256_store_ps(dpl, e);
83 _mm256_store_ps(dph, f);
84 }
85 }
86
88 static inline
89 void avx2_interleave32(float* dp, float* spl, float* sph, int width)
90 {
91 for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
92 {
93 __m256 a = _mm256_load_ps(spl);
94 __m256 b = _mm256_load_ps(sph);
95 __m256 c = _mm256_unpacklo_ps(a, b);
96 __m256 d = _mm256_unpackhi_ps(a, b);
97 __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
98 __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
99 _mm256_store_ps(dp, e);
100 _mm256_store_ps(dp + 8, f);
101 }
102 }
103
105 static inline
106 void avx2_deinterleave64(double* dpl, double* dph, double* sp, int width)
107 {
108 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
109 {
110 __m256d a = _mm256_load_pd(sp);
111 __m256d b = _mm256_load_pd(sp + 4);
112 __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
113 __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
114 __m256d e = _mm256_shuffle_pd(c, d, 0x0);
115 __m256d f = _mm256_shuffle_pd(c, d, 0xF);
116 _mm256_store_pd(dpl, e);
117 _mm256_store_pd(dph, f);
118 }
119 }
120
122 static inline
123 void avx2_interleave64(double* dp, double* spl, double* sph, int width)
124 {
125 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
126 {
127 __m256d a = _mm256_load_pd(spl);
128 __m256d b = _mm256_load_pd(sph);
129 __m256d c = _mm256_unpacklo_pd(a, b);
130 __m256d d = _mm256_unpackhi_pd(a, b);
131 __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
132 __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
133 _mm256_store_pd(dp, e);
134 _mm256_store_pd(dp + 4, f);
135 }
136 }
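// Note on the helpers above: avx2_deinterleave32/64 split an interleaved
// line sp[0], sp[1], sp[2], ... into its even-indexed samples (written to
// dpl) and odd-indexed samples (written to dph); avx2_interleave32/64
// perform the inverse merge. They only shuffle lanes and never modify
// values, so the callers below reuse them for 32/64-bit integer samples by
// reinterpreting the buffers as float/double. The loops advance a full
// vector per iteration, which presumably relies on the line buffers being
// aligned and padded to a multiple of the vector width.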
137
139 static
140 void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
141 const line_buf* other, const line_buf* aug,
142 ui32 repeat, bool synthesis)
143 {
144 const si32 a = s->rev.Aatk;
145 const si32 b = s->rev.Batk;
146 const ui8 e = s->rev.Eatk;
147 __m256i va = _mm256_set1_epi32(a);
148 __m256i vb = _mm256_set1_epi32(b);
149
150 si32* dst = aug->i32;
151 const si32* src1 = sig->i32, * src2 = other->i32;
152// The general definition of the wavelet in Part 2 is slightly
153// different from that in Part 1, although they are mathematically
154// equivalent; here, we identify the simpler Part 1 forms and employ them
155 if (a == 1)
156 { // 5/3 update and any case with a == 1
157 int i = (int)repeat;
158 if (synthesis)
159 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
160 {
161 __m256i s1 = _mm256_load_si256((__m256i*)src1);
162 __m256i s2 = _mm256_load_si256((__m256i*)src2);
163 __m256i d = _mm256_load_si256((__m256i*)dst);
164 __m256i t = _mm256_add_epi32(s1, s2);
165 __m256i v = _mm256_add_epi32(vb, t);
166 __m256i w = _mm256_srai_epi32(v, e);
167 d = _mm256_sub_epi32(d, w);
168 _mm256_store_si256((__m256i*)dst, d);
169 }
170 else
171 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
172 {
173 __m256i s1 = _mm256_load_si256((__m256i*)src1);
174 __m256i s2 = _mm256_load_si256((__m256i*)src2);
175 __m256i d = _mm256_load_si256((__m256i*)dst);
176 __m256i t = _mm256_add_epi32(s1, s2);
177 __m256i v = _mm256_add_epi32(vb, t);
178 __m256i w = _mm256_srai_epi32(v, e);
179 d = _mm256_add_epi32(d, w);
180 _mm256_store_si256((__m256i*)dst, d);
181 }
182 }
183 else if (a == -1 && b == 1 && e == 1)
184 { // 5/3 predict
185 int i = (int)repeat;
186 if (synthesis)
187 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
188 {
189 __m256i s1 = _mm256_load_si256((__m256i*)src1);
190 __m256i s2 = _mm256_load_si256((__m256i*)src2);
191 __m256i d = _mm256_load_si256((__m256i*)dst);
192 __m256i t = _mm256_add_epi32(s1, s2);
193 __m256i w = _mm256_srai_epi32(t, e);
194 d = _mm256_add_epi32(d, w);
195 _mm256_store_si256((__m256i*)dst, d);
196 }
197 else
198 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
199 {
200 __m256i s1 = _mm256_load_si256((__m256i*)src1);
201 __m256i s2 = _mm256_load_si256((__m256i*)src2);
202 __m256i d = _mm256_load_si256((__m256i*)dst);
203 __m256i t = _mm256_add_epi32(s1, s2);
204 __m256i w = _mm256_srai_epi32(t, e);
205 d = _mm256_sub_epi32(d, w);
206 _mm256_store_si256((__m256i*)dst, d);
207 }
208 }
209 else if (a == -1)
210 { // any case with a == -1, which is not 5/3 predict
211 int i = (int)repeat;
212 if (synthesis)
213 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
214 {
215 __m256i s1 = _mm256_load_si256((__m256i*)src1);
216 __m256i s2 = _mm256_load_si256((__m256i*)src2);
217 __m256i d = _mm256_load_si256((__m256i*)dst);
218 __m256i t = _mm256_add_epi32(s1, s2);
219 __m256i v = _mm256_sub_epi32(vb, t);
220 __m256i w = _mm256_srai_epi32(v, e);
221 d = _mm256_sub_epi32(d, w);
222 _mm256_store_si256((__m256i*)dst, d);
223 }
224 else
225 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
226 {
227 __m256i s1 = _mm256_load_si256((__m256i*)src1);
228 __m256i s2 = _mm256_load_si256((__m256i*)src2);
229 __m256i d = _mm256_load_si256((__m256i*)dst);
230 __m256i t = _mm256_add_epi32(s1, s2);
231 __m256i v = _mm256_sub_epi32(vb, t);
232 __m256i w = _mm256_srai_epi32(v, e);
233 d = _mm256_add_epi32(d, w);
234 _mm256_store_si256((__m256i*)dst, d);
235 }
236 }
237 else { // general case
238 int i = (int)repeat;
239 if (synthesis)
240 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
241 {
242 __m256i s1 = _mm256_load_si256((__m256i*)src1);
243 __m256i s2 = _mm256_load_si256((__m256i*)src2);
244 __m256i d = _mm256_load_si256((__m256i*)dst);
245 __m256i t = _mm256_add_epi32(s1, s2);
246 __m256i u = _mm256_mullo_epi32(va, t);
247 __m256i v = _mm256_add_epi32(vb, u);
248 __m256i w = _mm256_srai_epi32(v, e);
249 d = _mm256_sub_epi32(d, w);
250 _mm256_store_si256((__m256i*)dst, d);
251 }
252 else
253 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
254 {
255 __m256i s1 = _mm256_load_si256((__m256i*)src1);
256 __m256i s2 = _mm256_load_si256((__m256i*)src2);
257 __m256i d = _mm256_load_si256((__m256i*)dst);
258 __m256i t = _mm256_add_epi32(s1, s2);
259 __m256i u = _mm256_mullo_epi32(va, t);
260 __m256i v = _mm256_add_epi32(vb, u);
261 __m256i w = _mm256_srai_epi32(v, e);
262 d = _mm256_add_epi32(d, w);
263 _mm256_store_si256((__m256i*)dst, d);
264 }
265 }
266 }
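// All four branches above vectorize the same per-sample lifting update;
// they differ only in the simplifications that the values of a, b, and e
// allow. A scalar sketch of what one call computes (mirroring the scalar
// fallback in the 64-bit version below):
//
//   for (ui32 i = 0; i < repeat; ++i)
//     if (synthesis) dst[i] -= (b + a * (src1[i] + src2[i])) >> e;
//     else           dst[i] += (b + a * (src1[i] + src2[i])) >> e;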
267
269 static
270 void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
271 const line_buf* other, const line_buf* aug,
272 ui32 repeat, bool synthesis)
273 {
274 const si32 a = s->rev.Aatk;
275 const si32 b = s->rev.Batk;
276 const ui8 e = s->rev.Eatk;
277 __m256i vb = _mm256_set1_epi64x(b);
278 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
279
280 si64* dst = aug->i64;
281 const si64* src1 = sig->i64, * src2 = other->i64;
282// The general definition of the wavelet in Part 2 is slightly
283// different from that in Part 1, although they are mathematically
284// equivalent; here, we identify the simpler Part 1 forms and employ them
285 if (a == 1)
286 { // 5/3 update and any case with a == 1
287 int i = (int)repeat;
288 if (synthesis)
289 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
290 {
291 __m256i s1 = _mm256_load_si256((__m256i*)src1);
292 __m256i s2 = _mm256_load_si256((__m256i*)src2);
293 __m256i d = _mm256_load_si256((__m256i*)dst);
294 __m256i t = _mm256_add_epi64(s1, s2);
295 __m256i v = _mm256_add_epi64(vb, t);
296 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
297 d = _mm256_sub_epi64(d, w);
298 _mm256_store_si256((__m256i*)dst, d);
299 }
300 else
301 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
302 {
303 __m256i s1 = _mm256_load_si256((__m256i*)src1);
304 __m256i s2 = _mm256_load_si256((__m256i*)src2);
305 __m256i d = _mm256_load_si256((__m256i*)dst);
306 __m256i t = _mm256_add_epi64(s1, s2);
307 __m256i v = _mm256_add_epi64(vb, t);
308 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
309 d = _mm256_add_epi64(d, w);
310 _mm256_store_si256((__m256i*)dst, d);
311 }
312 }
313 else if (a == -1 && b == 1 && e == 1)
314 { // 5/3 predict
315 int i = (int)repeat;
316 if (synthesis)
317 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
318 {
319 __m256i s1 = _mm256_load_si256((__m256i*)src1);
320 __m256i s2 = _mm256_load_si256((__m256i*)src2);
321 __m256i d = _mm256_load_si256((__m256i*)dst);
322 __m256i t = _mm256_add_epi64(s1, s2);
323 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
324 d = _mm256_add_epi64(d, w);
325 _mm256_store_si256((__m256i*)dst, d);
326 }
327 else
328 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
329 {
330 __m256i s1 = _mm256_load_si256((__m256i*)src1);
331 __m256i s2 = _mm256_load_si256((__m256i*)src2);
332 __m256i d = _mm256_load_si256((__m256i*)dst);
333 __m256i t = _mm256_add_epi64(s1, s2);
334 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
335 d = _mm256_sub_epi64(d, w);
336 _mm256_store_si256((__m256i*)dst, d);
337 }
338 }
339 else if (a == -1)
340 { // any case with a == -1, which is not 5/3 predict
341 int i = (int)repeat;
342 if (synthesis)
343 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
344 {
345 __m256i s1 = _mm256_load_si256((__m256i*)src1);
346 __m256i s2 = _mm256_load_si256((__m256i*)src2);
347 __m256i d = _mm256_load_si256((__m256i*)dst);
348 __m256i t = _mm256_add_epi64(s1, s2);
349 __m256i v = _mm256_sub_epi64(vb, t);
350 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
351 d = _mm256_sub_epi64(d, w);
352 _mm256_store_si256((__m256i*)dst, d);
353 }
354 else
355 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
356 {
357 __m256i s1 = _mm256_load_si256((__m256i*)src1);
358 __m256i s2 = _mm256_load_si256((__m256i*)src2);
359 __m256i d = _mm256_load_si256((__m256i*)dst);
360 __m256i t = _mm256_add_epi64(s1, s2);
361 __m256i v = _mm256_sub_epi64(vb, t);
362 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
363 d = _mm256_add_epi64(d, w);
364 _mm256_store_si256((__m256i*)dst, d);
365 }
366 }
367 else { // general case
368 // 64-bit multiplication (_mm256_mullo_epi64) is not available in
369 // AVX2 (it requires AVX-512), so this general case uses scalar code.
370 if (synthesis)
371 for (ui32 i = repeat; i > 0; --i)
372 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
373 else
374 for (ui32 i = repeat; i > 0; --i)
375 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
376 }
377 }
378
380 void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig,
381 const line_buf* other, const line_buf* aug,
382 ui32 repeat, bool synthesis)
383 {
384 if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
385 ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
386 ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
387 {
388 assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
389 (other == NULL || other->flags & line_buf::LFT_32BIT) &&
390 (aug == NULL || aug->flags & line_buf::LFT_32BIT));
391 avx2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
392 }
393 else
394 {
395 assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
396 (other == NULL || other->flags & line_buf::LFT_64BIT) &&
397 (aug == NULL || aug->flags & line_buf::LFT_64BIT));
398 avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
399 }
400 }
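// avx2_rev_vert_step, and the avx2_rev_horz_ana / avx2_rev_horz_syn
// wrappers further below, all dispatch the same way: they inspect the
// line_buf::LFT_32BIT / LFT_64BIT flags of the buffers they receive and
// forward to the matching 32- or 64-bit kernel, asserting that all buffers
// in a call share the same sample width.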
401
403 static
404 void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
405 const line_buf* hdst, const line_buf* src,
406 ui32 width, bool even)
407 {
408 if (width > 1)
409 {
410 // split src into ldst and hdst
411 {
412 float* dpl = even ? ldst->f32 : hdst->f32;
413 float* dph = even ? hdst->f32 : ldst->f32;
414 float* sp = src->f32;
415 int w = (int)width;
416 avx2_deinterleave32(dpl, dph, sp, w);
417 }
418
419 si32* hp = hdst->i32, * lp = ldst->i32;
420 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
421 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
422 ui32 num_steps = atk->get_num_steps();
423 for (ui32 j = num_steps; j > 0; --j)
424 {
425 // first lifting step
426 const lifting_step* s = atk->get_step(j - 1);
427 const si32 a = s->rev.Aatk;
428 const si32 b = s->rev.Batk;
429 const ui8 e = s->rev.Eatk;
430 __m256i va = _mm256_set1_epi32(a);
431 __m256i vb = _mm256_set1_epi32(b);
432
433 // extension
434 lp[-1] = lp[0];
435 lp[l_width] = lp[l_width - 1];
436 // lifting step
437 const si32* sp = lp;
438 si32* dp = hp;
439 if (a == 1)
440 { // 5/3 update and any case with a == 1
441 int i = (int)h_width;
442 if (even)
443 {
444 for (; i > 0; i -= 8, sp += 8, dp += 8)
445 {
446 __m256i s1 = _mm256_load_si256((__m256i*)sp);
447 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
448 __m256i d = _mm256_load_si256((__m256i*)dp);
449 __m256i t = _mm256_add_epi32(s1, s2);
450 __m256i v = _mm256_add_epi32(vb, t);
451 __m256i w = _mm256_srai_epi32(v, e);
452 d = _mm256_add_epi32(d, w);
453 _mm256_store_si256((__m256i*)dp, d);
454 }
455 }
456 else
457 {
458 for (; i > 0; i -= 8, sp += 8, dp += 8)
459 {
460 __m256i s1 = _mm256_load_si256((__m256i*)sp);
461 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
462 __m256i d = _mm256_load_si256((__m256i*)dp);
463 __m256i t = _mm256_add_epi32(s1, s2);
464 __m256i v = _mm256_add_epi32(vb, t);
465 __m256i w = _mm256_srai_epi32(v, e);
466 d = _mm256_add_epi32(d, w);
467 _mm256_store_si256((__m256i*)dp, d);
468 }
469 }
470 }
471 else if (a == -1 && b == 1 && e == 1)
472 { // 5/3 predict
473 int i = (int)h_width;
474 if (even)
475 for (; i > 0; i -= 8, sp += 8, dp += 8)
476 {
477 __m256i s1 = _mm256_load_si256((__m256i*)sp);
478 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
479 __m256i d = _mm256_load_si256((__m256i*)dp);
480 __m256i t = _mm256_add_epi32(s1, s2);
481 __m256i w = _mm256_srai_epi32(t, e);
482 d = _mm256_sub_epi32(d, w);
483 _mm256_store_si256((__m256i*)dp, d);
484 }
485 else
486 for (; i > 0; i -= 8, sp += 8, dp += 8)
487 {
488 __m256i s1 = _mm256_load_si256((__m256i*)sp);
489 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
490 __m256i d = _mm256_load_si256((__m256i*)dp);
491 __m256i t = _mm256_add_epi32(s1, s2);
492 __m256i w = _mm256_srai_epi32(t, e);
493 d = _mm256_sub_epi32(d, w);
494 _mm256_store_si256((__m256i*)dp, d);
495 }
496 }
497 else if (a == -1)
498 { // any case with a == -1, which is not 5/3 predict
499 int i = (int)h_width;
500 if (even)
501 for (; i > 0; i -= 8, sp += 8, dp += 8)
502 {
503 __m256i s1 = _mm256_load_si256((__m256i*)sp);
504 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
505 __m256i d = _mm256_load_si256((__m256i*)dp);
506 __m256i t = _mm256_add_epi32(s1, s2);
507 __m256i v = _mm256_sub_epi32(vb, t);
508 __m256i w = _mm256_srai_epi32(v, e);
509 d = _mm256_add_epi32(d, w);
510 _mm256_store_si256((__m256i*)dp, d);
511 }
512 else
513 for (; i > 0; i -= 8, sp += 8, dp += 8)
514 {
515 __m256i s1 = _mm256_load_si256((__m256i*)sp);
516 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
517 __m256i d = _mm256_load_si256((__m256i*)dp);
518 __m256i t = _mm256_add_epi32(s1, s2);
519 __m256i v = _mm256_sub_epi32(vb, t);
520 __m256i w = _mm256_srai_epi32(v, e);
521 d = _mm256_add_epi32(d, w);
522 _mm256_store_si256((__m256i*)dp, d);
523 }
524 }
525 else {
526 // general case
527 int i = (int)h_width;
528 if (even)
529 for (; i > 0; i -= 8, sp += 8, dp += 8)
530 {
531 __m256i s1 = _mm256_load_si256((__m256i*)sp);
532 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
533 __m256i d = _mm256_load_si256((__m256i*)dp);
534 __m256i t = _mm256_add_epi32(s1, s2);
535 __m256i u = _mm256_mullo_epi32(va, t);
536 __m256i v = _mm256_add_epi32(vb, u);
537 __m256i w = _mm256_srai_epi32(v, e);
538 d = _mm256_add_epi32(d, w);
539 _mm256_store_si256((__m256i*)dp, d);
540 }
541 else
542 for (; i > 0; i -= 8, sp += 8, dp += 8)
543 {
544 __m256i s1 = _mm256_load_si256((__m256i*)sp);
545 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
546 __m256i d = _mm256_load_si256((__m256i*)dp);
547 __m256i t = _mm256_add_epi32(s1, s2);
548 __m256i u = _mm256_mullo_epi32(va, t);
549 __m256i v = _mm256_add_epi32(vb, u);
550 __m256i w = _mm256_srai_epi32(v, e);
551 d = _mm256_add_epi32(d, w);
552 _mm256_store_si256((__m256i*)dp, d);
553 }
554 }
555
556 // swap buffers
557 si32* t = lp; lp = hp; hp = t;
558 even = !even;
559 ui32 w = l_width; l_width = h_width; h_width = w;
560 }
561 }
562 else {
563 if (even)
564 ldst->i32[0] = src->i32[0];
565 else
566 hdst->i32[0] = src->i32[0] << 1;
567 }
568 }
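// Outline of the horizontal analysis above: the input line is first
// de-interleaved into a low-pass and a high-pass half; when 'even' is set,
// the even-indexed samples feed the low-pass band. The lifting steps are
// then applied from the last step to the first, each time symmetrically
// extending the source band by one sample on each side and updating the
// other band from a sample and its left or right neighbour (the sp +/- 1
// unaligned loads); after every step the two bands swap roles and the
// parity flips. A single-column line is handled separately: the sample
// goes to the low band unchanged, or to the high band doubled.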
569
571 static
572 void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
573 const line_buf* hdst, const line_buf* src,
574 ui32 width, bool even)
575 {
576 if (width > 1)
577 {
578 // split src into ldst and hdst
579 {
580 double* dpl = (double*)(even ? ldst->p : hdst->p);
581 double* dph = (double*)(even ? hdst->p : ldst->p);
582 double* sp = (double*)src->p;
583 int w = (int)width;
584 avx2_deinterleave64(dpl, dph, sp, w);
585 }
586
587 si64* hp = hdst->i64, * lp = ldst->i64;
588 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
589 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
590 ui32 num_steps = atk->get_num_steps();
591 for (ui32 j = num_steps; j > 0; --j)
592 {
593 // first lifting step
594 const lifting_step* s = atk->get_step(j - 1);
595 const si32 a = s->rev.Aatk;
596 const si32 b = s->rev.Batk;
597 const ui8 e = s->rev.Eatk;
598 __m256i vb = _mm256_set1_epi64x(b);
599 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
600
601 // extension
602 lp[-1] = lp[0];
603 lp[l_width] = lp[l_width - 1];
604 // lifting step
605 const si64* sp = lp;
606 si64* dp = hp;
607 if (a == 1)
608 { // 5/3 update and any case with a == 1
609 int i = (int)h_width;
610 if (even)
611 {
612 for (; i > 0; i -= 4, sp += 4, dp += 4)
613 {
614 __m256i s1 = _mm256_load_si256((__m256i*)sp);
615 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
616 __m256i d = _mm256_load_si256((__m256i*)dp);
617 __m256i t = _mm256_add_epi64(s1, s2);
618 __m256i v = _mm256_add_epi64(vb, t);
619 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
620 d = _mm256_add_epi64(d, w);
621 _mm256_store_si256((__m256i*)dp, d);
622 }
623 }
624 else
625 {
626 for (; i > 0; i -= 4, sp += 4, dp += 4)
627 {
628 __m256i s1 = _mm256_load_si256((__m256i*)sp);
629 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
630 __m256i d = _mm256_load_si256((__m256i*)dp);
631 __m256i t = _mm256_add_epi64(s1, s2);
632 __m256i v = _mm256_add_epi64(vb, t);
633 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
634 d = _mm256_add_epi64(d, w);
635 _mm256_store_si256((__m256i*)dp, d);
636 }
637 }
638 }
639 else if (a == -1 && b == 1 && e == 1)
640 { // 5/3 predict
641 int i = (int)h_width;
642 if (even)
643 for (; i > 0; i -= 4, sp += 4, dp += 4)
644 {
645 __m256i s1 = _mm256_load_si256((__m256i*)sp);
646 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
647 __m256i d = _mm256_load_si256((__m256i*)dp);
648 __m256i t = _mm256_add_epi64(s1, s2);
649 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
650 d = _mm256_sub_epi64(d, w);
651 _mm256_store_si256((__m256i*)dp, d);
652 }
653 else
654 for (; i > 0; i -= 4, sp += 4, dp += 4)
655 {
656 __m256i s1 = _mm256_load_si256((__m256i*)sp);
657 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
658 __m256i d = _mm256_load_si256((__m256i*)dp);
659 __m256i t = _mm256_add_epi64(s1, s2);
660 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
661 d = _mm256_sub_epi64(d, w);
662 _mm256_store_si256((__m256i*)dp, d);
663 }
664 }
665 else if (a == -1)
666 { // any case with a == -1, which is not 5/3 predict
667 int i = (int)h_width;
668 if (even)
669 for (; i > 0; i -= 4, sp += 4, dp += 4)
670 {
671 __m256i s1 = _mm256_load_si256((__m256i*)sp);
672 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
673 __m256i d = _mm256_load_si256((__m256i*)dp);
674 __m256i t = _mm256_add_epi64(s1, s2);
675 __m256i v = _mm256_sub_epi64(vb, t);
676 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
677 d = _mm256_add_epi64(d, w);
678 _mm256_store_si256((__m256i*)dp, d);
679 }
680 else
681 for (; i > 0; i -= 4, sp += 4, dp += 4)
682 {
683 __m256i s1 = _mm256_load_si256((__m256i*)sp);
684 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
685 __m256i d = _mm256_load_si256((__m256i*)dp);
686 __m256i t = _mm256_add_epi64(s1, s2);
687 __m256i v = _mm256_sub_epi64(vb, t);
688 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
689 d = _mm256_add_epi64(d, w);
690 _mm256_store_si256((__m256i*)dp, d);
691 }
692 }
693 else {
694 // general case
695 // 64-bit multiplication (_mm256_mullo_epi64) is not available in
696 // AVX2 (it requires AVX-512), so this general case uses scalar code.
697 if (even)
698 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
699 *dp += (b + a * (sp[0] + sp[1])) >> e;
700 else
701 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
702 *dp += (b + a * (sp[-1] + sp[0])) >> e;
703 }
704
705 // swap buffers
706 si64* t = lp; lp = hp; hp = t;
707 even = !even;
708 ui32 w = l_width; l_width = h_width; h_width = w;
709 }
710 }
711 else {
712 if (even)
713 ldst->i64[0] = src->i64[0];
714 else
715 hdst->i64[0] = src->i64[0] << 1;
716 }
717 }
718
720 void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
721 const line_buf* hdst, const line_buf* src,
722 ui32 width, bool even)
723 {
724 if (src->flags & line_buf::LFT_32BIT)
725 {
726 assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
727 (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
728 avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
729 }
730 else
731 {
732 assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
733 (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) &&
734 (src == NULL || src->flags & line_buf::LFT_64BIT));
735 avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
736 }
737 }
738
740 static
741 void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
742 const line_buf* lsrc, const line_buf* hsrc,
743 ui32 width, bool even)
744 {
745 if (width > 1)
746 {
747 bool ev = even;
748 si32* oth = hsrc->i32, * aug = lsrc->i32;
749 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
750 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
751 ui32 num_steps = atk->get_num_steps();
752 for (ui32 j = 0; j < num_steps; ++j)
753 {
754 const lifting_step* s = atk->get_step(j);
755 const si32 a = s->rev.Aatk;
756 const si32 b = s->rev.Batk;
757 const ui8 e = s->rev.Eatk;
758 __m256i va = _mm256_set1_epi32(a);
759 __m256i vb = _mm256_set1_epi32(b);
760
761 // extension
762 oth[-1] = oth[0];
763 oth[oth_width] = oth[oth_width - 1];
764 // lifting step
765 const si32* sp = oth;
766 si32* dp = aug;
767 if (a == 1)
768 { // 5/3 update and any case with a == 1
769 int i = (int)aug_width;
770 if (ev)
771 {
772 for (; i > 0; i -= 8, sp += 8, dp += 8)
773 {
774 __m256i s1 = _mm256_load_si256((__m256i*)sp);
775 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
776 __m256i d = _mm256_load_si256((__m256i*)dp);
777 __m256i t = _mm256_add_epi32(s1, s2);
778 __m256i v = _mm256_add_epi32(vb, t);
779 __m256i w = _mm256_srai_epi32(v, e);
780 d = _mm256_sub_epi32(d, w);
781 _mm256_store_si256((__m256i*)dp, d);
782 }
783 }
784 else
785 {
786 for (; i > 0; i -= 8, sp += 8, dp += 8)
787 {
788 __m256i s1 = _mm256_load_si256((__m256i*)sp);
789 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
790 __m256i d = _mm256_load_si256((__m256i*)dp);
791 __m256i t = _mm256_add_epi32(s1, s2);
792 __m256i v = _mm256_add_epi32(vb, t);
793 __m256i w = _mm256_srai_epi32(v, e);
794 d = _mm256_sub_epi32(d, w);
795 _mm256_store_si256((__m256i*)dp, d);
796 }
797 }
798 }
799 else if (a == -1 && b == 1 && e == 1)
800 { // 5/3 predict
801 int i = (int)aug_width;
802 if (ev)
803 for (; i > 0; i -= 8, sp += 8, dp += 8)
804 {
805 __m256i s1 = _mm256_load_si256((__m256i*)sp);
806 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
807 __m256i d = _mm256_load_si256((__m256i*)dp);
808 __m256i t = _mm256_add_epi32(s1, s2);
809 __m256i w = _mm256_srai_epi32(t, e);
810 d = _mm256_add_epi32(d, w);
811 _mm256_store_si256((__m256i*)dp, d);
812 }
813 else
814 for (; i > 0; i -= 8, sp += 8, dp += 8)
815 {
816 __m256i s1 = _mm256_load_si256((__m256i*)sp);
817 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
818 __m256i d = _mm256_load_si256((__m256i*)dp);
819 __m256i t = _mm256_add_epi32(s1, s2);
820 __m256i w = _mm256_srai_epi32(t, e);
821 d = _mm256_add_epi32(d, w);
822 _mm256_store_si256((__m256i*)dp, d);
823 }
824 }
825 else if (a == -1)
826 { // any case with a == -1, which is not 5/3 predict
827 int i = (int)aug_width;
828 if (ev)
829 for (; i > 0; i -= 8, sp += 8, dp += 8)
830 {
831 __m256i s1 = _mm256_load_si256((__m256i*)sp);
832 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
833 __m256i d = _mm256_load_si256((__m256i*)dp);
834 __m256i t = _mm256_add_epi32(s1, s2);
835 __m256i v = _mm256_sub_epi32(vb, t);
836 __m256i w = _mm256_srai_epi32(v, e);
837 d = _mm256_sub_epi32(d, w);
838 _mm256_store_si256((__m256i*)dp, d);
839 }
840 else
841 for (; i > 0; i -= 8, sp += 8, dp += 8)
842 {
843 __m256i s1 = _mm256_load_si256((__m256i*)sp);
844 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
845 __m256i d = _mm256_load_si256((__m256i*)dp);
846 __m256i t = _mm256_add_epi32(s1, s2);
847 __m256i v = _mm256_sub_epi32(vb, t);
848 __m256i w = _mm256_srai_epi32(v, e);
849 d = _mm256_sub_epi32(d, w);
850 _mm256_store_si256((__m256i*)dp, d);
851 }
852 }
853 else {
854 // general case
855 int i = (int)aug_width;
856 if (ev)
857 for (; i > 0; i -= 8, sp += 8, dp += 8)
858 {
859 __m256i s1 = _mm256_load_si256((__m256i*)sp);
860 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
861 __m256i d = _mm256_load_si256((__m256i*)dp);
862 __m256i t = _mm256_add_epi32(s1, s2);
863 __m256i u = _mm256_mullo_epi32(va, t);
864 __m256i v = _mm256_add_epi32(vb, u);
865 __m256i w = _mm256_srai_epi32(v, e);
866 d = _mm256_sub_epi32(d, w);
867 _mm256_store_si256((__m256i*)dp, d);
868 }
869 else
870 for (; i > 0; i -= 8, sp += 8, dp += 8)
871 {
872 __m256i s1 = _mm256_load_si256((__m256i*)sp);
873 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
874 __m256i d = _mm256_load_si256((__m256i*)dp);
875 __m256i t = _mm256_add_epi32(s1, s2);
876 __m256i u = _mm256_mullo_epi32(va, t);
877 __m256i v = _mm256_add_epi32(vb, u);
878 __m256i w = _mm256_srai_epi32(v, e);
879 d = _mm256_sub_epi32(d, w);
880 _mm256_store_si256((__m256i*)dp, d);
881 }
882 }
883
884 // swap buffers
885 si32* t = aug; aug = oth; oth = t;
886 ev = !ev;
887 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
888 }
889
890 // combine both lsrc and hsrc into dst
891 {
892 float* dp = dst->f32;
893 float* spl = even ? lsrc->f32 : hsrc->f32;
894 float* sph = even ? hsrc->f32 : lsrc->f32;
895 int w = (int)width;
896 avx2_interleave32(dp, spl, sph, w);
897 }
898 }
899 else {
900 if (even)
901 dst->i32[0] = lsrc->i32[0];
902 else
903 dst->i32[0] = hsrc->i32[0] >> 1;
904 }
905 }
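// Synthesis mirrors the analysis above: the lifting steps run in forward
// order (j = 0 .. num_steps - 1) and each update is applied with the
// opposite sign to the one used during analysis, so the reversible
// transform is undone exactly; once all steps are applied, the low- and
// high-pass bands are re-interleaved into dst. The single-column case
// inverts the doubling used during analysis with a right shift by one.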
906
908 static
909 void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
910 const line_buf* lsrc, const line_buf* hsrc,
911 ui32 width, bool even)
912 {
913 if (width > 1)
914 {
915 bool ev = even;
916 si64* oth = hsrc->i64, * aug = lsrc->i64;
917 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
918 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
919 ui32 num_steps = atk->get_num_steps();
920 for (ui32 j = 0; j < num_steps; ++j)
921 {
922 const lifting_step* s = atk->get_step(j);
923 const si32 a = s->rev.Aatk;
924 const si32 b = s->rev.Batk;
925 const ui8 e = s->rev.Eatk;
926 __m256i vb = _mm256_set1_epi64x(b);
927 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
928
929 // extension
930 oth[-1] = oth[0];
931 oth[oth_width] = oth[oth_width - 1];
932 // lifting step
933 const si64* sp = oth;
934 si64* dp = aug;
935 if (a == 1)
936 { // 5/3 update and any case with a == 1
937 int i = (int)aug_width;
938 if (ev)
939 {
940 for (; i > 0; i -= 4, sp += 4, dp += 4)
941 {
942 __m256i s1 = _mm256_load_si256((__m256i*)sp);
943 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
944 __m256i d = _mm256_load_si256((__m256i*)dp);
945 __m256i t = _mm256_add_epi64(s1, s2);
946 __m256i v = _mm256_add_epi64(vb, t);
947 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
948 d = _mm256_sub_epi64(d, w);
949 _mm256_store_si256((__m256i*)dp, d);
950 }
951 }
952 else
953 {
954 for (; i > 0; i -= 4, sp += 4, dp += 4)
955 {
956 __m256i s1 = _mm256_load_si256((__m256i*)sp);
957 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
958 __m256i d = _mm256_load_si256((__m256i*)dp);
959 __m256i t = _mm256_add_epi64(s1, s2);
960 __m256i v = _mm256_add_epi64(vb, t);
961 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
962 d = _mm256_sub_epi64(d, w);
963 _mm256_store_si256((__m256i*)dp, d);
964 }
965 }
966 }
967 else if (a == -1 && b == 1 && e == 1)
968 { // 5/3 predict
969 int i = (int)aug_width;
970 if (ev)
971 for (; i > 0; i -= 4, sp += 4, dp += 4)
972 {
973 __m256i s1 = _mm256_load_si256((__m256i*)sp);
974 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
975 __m256i d = _mm256_load_si256((__m256i*)dp);
976 __m256i t = _mm256_add_epi64(s1, s2);
977 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
978 d = _mm256_add_epi64(d, w);
979 _mm256_store_si256((__m256i*)dp, d);
980 }
981 else
982 for (; i > 0; i -= 4, sp += 4, dp += 4)
983 {
984 __m256i s1 = _mm256_load_si256((__m256i*)sp);
985 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
986 __m256i d = _mm256_load_si256((__m256i*)dp);
987 __m256i t = _mm256_add_epi64(s1, s2);
988 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
989 d = _mm256_add_epi64(d, w);
990 _mm256_store_si256((__m256i*)dp, d);
991 }
992 }
993 else if (a == -1)
994 { // any case with a == -1, which is not 5/3 predict
995 int i = (int)aug_width;
996 if (ev)
997 for (; i > 0; i -= 4, sp += 4, dp += 4)
998 {
999 __m256i s1 = _mm256_load_si256((__m256i*)sp);
1000 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
1001 __m256i d = _mm256_load_si256((__m256i*)dp);
1002 __m256i t = _mm256_add_epi64(s1, s2);
1003 __m256i v = _mm256_sub_epi64(vb, t);
1004 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
1005 d = _mm256_sub_epi64(d, w);
1006 _mm256_store_si256((__m256i*)dp, d);
1007 }
1008 else
1009 for (; i > 0; i -= 4, sp += 4, dp += 4)
1010 {
1011 __m256i s1 = _mm256_load_si256((__m256i*)sp);
1012 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
1013 __m256i d = _mm256_load_si256((__m256i*)dp);
1014 __m256i t = _mm256_add_epi64(s1, s2);
1015 __m256i v = _mm256_sub_epi64(vb, t);
1016 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
1017 d = _mm256_sub_epi64(d, w);
1018 _mm256_store_si256((__m256i*)dp, d);
1019 }
1020 }
1021 else {
1022 // general case
1023 // 64-bit multiplication (_mm256_mullo_epi64) is not available in
1024 // AVX2 (it requires AVX-512), so this general case uses scalar code.
1025 if (ev)
1026 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
1027 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
1028 else
1029 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
1030 *dp -= (b + a * (sp[0] + sp[1])) >> e;
1031 }
1032
1033 // swap buffers
1034 si64* t = aug; aug = oth; oth = t;
1035 ev = !ev;
1036 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1037 }
1038
1039 // combine both lsrc and hsrc into dst
1040 {
1041 double* dp = (double*)dst->p;
1042 double* spl = (double*)(even ? lsrc->p : hsrc->p);
1043 double* sph = (double*)(even ? hsrc->p : lsrc->p);
1044 int w = (int)width;
1045 avx2_interleave64(dp, spl, sph, w);
1046 }
1047 }
1048 else {
1049 if (even)
1050 dst->i64[0] = lsrc->i64[0];
1051 else
1052 dst->i64[0] = hsrc->i64[0] >> 1;
1053 }
1054 }
1055
1057 void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
1058 const line_buf* lsrc, const line_buf* hsrc,
1059 ui32 width, bool even)
1060 {
1061 if (dst->flags & line_buf::LFT_32BIT)
1062 {
1063 assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
1064 (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
1065 avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
1066 }
1067 else
1068 {
1069 assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
1070 (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
1071 (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
1072 avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
1073 }
1074 }
1075
1076 } // !local
1077} // !ojph
1078
1079#endif
Referenced typedefs (defined in ojph_defs.h): si64 = int64_t (line 57), si32 = int32_t (line 55), ui32 = uint32_t (line 54), ui8 = uint8_t (line 50).