#if defined(OJPH_ARCH_X86_64)
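
// AVX-512 paths for the lifting-based wavelet transform: helpers that
// split a line into even/odd samples (and merge them back), plus the
// irreversible (float) and reversible (integer) vertical and horizontal
// lifting steps. The intrinsics used below come from <immintrin.h> and
// require an AVX-512F capable target.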
static void avx512_deinterleave32(float* dpl, float* dph, float* sp,
                                  int width)
{
  // idx1 selects the even-indexed floats of the register pair a:b,
  // idx2 the odd-indexed ones (indices 0x10..0x1F refer to lanes of b)
  __m512i idx1 = _mm512_set_epi32(
    0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10,
    0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00);
  __m512i idx2 = _mm512_set_epi32(
    0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11,
    0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01);
  for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16)
  {
    __m512 a = _mm512_load_ps(sp);
    __m512 b = _mm512_load_ps(sp + 16);
    __m512 c = _mm512_permutex2var_ps(a, idx1, b);
    __m512 d = _mm512_permutex2var_ps(a, idx2, b);
    _mm512_store_ps(dpl, c);
    _mm512_store_ps(dph, d);
  }
  // remainder: deinterleave one 16-float block using AVX registers
  for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
  {
    __m256 a = _mm256_load_ps(sp);
    __m256 b = _mm256_load_ps(sp + 8);
    __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
    __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
    __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
    __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
    _mm256_store_ps(dpl, e);
    _mm256_store_ps(dph, f);
  }
}
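
// Scalar sketch of the routine above (for reference, not compiled):
//   for (int i = 0; 2 * i < width; ++i) {
//     dpl[i] = sp[2 * i];      // even samples -> one band
//     dph[i] = sp[2 * i + 1];  // odd samples  -> the other band
//   }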
static void avx512_interleave32(float* dp, float* spl, float* sph,
                                int width)
{
  // idx1 interleaves the low 8 lanes of the register pair a:b, idx2 the
  // high 8 lanes (indices 0x10..0x1F refer to lanes of b)
  __m512i idx1 = _mm512_set_epi32(
    0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4,
    0x13, 0x3, 0x12, 0x2, 0x11, 0x1, 0x10, 0x0);
  __m512i idx2 = _mm512_set_epi32(
    0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC,
    0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8);
  for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16)
  {
    __m512 a = _mm512_load_ps(spl);
    __m512 b = _mm512_load_ps(sph);
    __m512 c = _mm512_permutex2var_ps(a, idx1, b);
    __m512 d = _mm512_permutex2var_ps(a, idx2, b);
    _mm512_store_ps(dp, c);
    _mm512_store_ps(dp + 16, d);
  }
  // remainder: interleave one 16-float block using AVX registers
  for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
  {
    __m256 a = _mm256_load_ps(spl);
    __m256 b = _mm256_load_ps(sph);
    __m256 c = _mm256_unpacklo_ps(a, b);
    __m256 d = _mm256_unpackhi_ps(a, b);
    __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
    __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
    _mm256_store_ps(dp, e);
    _mm256_store_ps(dp + 8, f);
  }
}
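
// Scalar sketch of the routine above (for reference, not compiled):
//   for (int i = 0; 2 * i < width; ++i) {
//     dp[2 * i]     = spl[i];
//     dp[2 * i + 1] = sph[i];
//   }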
static void avx512_deinterleave64(double* dpl, double* dph, double* sp,
                                  int width)
{
  __m512i idx1 = _mm512_set_epi64(
    0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00);
  __m512i idx2 = _mm512_set_epi64(
    0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01);
  for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)
  {
    __m512d a = _mm512_load_pd(sp);
    __m512d b = _mm512_load_pd(sp + 8);  // 8 doubles per 512-bit register
    __m512d c = _mm512_permutex2var_pd(a, idx1, b);
    __m512d d = _mm512_permutex2var_pd(a, idx2, b);
    _mm512_store_pd(dpl, c);
    _mm512_store_pd(dph, d);
  }
  // remainder: deinterleave one 8-double block using AVX registers
  for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
  {
    __m256d a = _mm256_load_pd(sp);
    __m256d b = _mm256_load_pd(sp + 4);
    __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
    __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
    __m256d e = _mm256_shuffle_pd(c, d, 0x0);
    __m256d f = _mm256_shuffle_pd(c, d, 0xF);
    _mm256_store_pd(dpl, e);
    _mm256_store_pd(dph, f);
  }
}
static void avx512_interleave64(double* dp, double* spl, double* sph,
                                int width)
{
  __m512i idx1 = _mm512_set_epi64(
    0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0);
  __m512i idx2 = _mm512_set_epi64(
    0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4);
  for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)
  {
    __m512d a = _mm512_load_pd(spl);
    __m512d b = _mm512_load_pd(sph);
    __m512d c = _mm512_permutex2var_pd(a, idx1, b);
    __m512d d = _mm512_permutex2var_pd(a, idx2, b);
    _mm512_store_pd(dp, c);
    _mm512_store_pd(dp + 8, d);  // 8 doubles per 512-bit register
  }
  // remainder: interleave one 8-double block using AVX registers
  for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
  {
    __m256d a = _mm256_load_pd(spl);
    __m256d b = _mm256_load_pd(sph);
    __m256d c = _mm256_unpacklo_pd(a, b);
    __m256d d = _mm256_unpackhi_pd(a, b);
    __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
    __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
    _mm256_store_pd(dp, e);
    _mm256_store_pd(dp + 4, f);
  }
}
static inline void avx512_multiply_const(float* p, float f, int width)
{
  __m512 factor = _mm512_set1_ps(f);
  for (; width > 0; width -= 16, p += 16)
  {
    __m512 s = _mm512_load_ps(p);
    _mm512_store_ps(p, _mm512_mul_ps(factor, s));
  }
}
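
// Note: this loop, like the other SIMD loops in this file, may process up
// to one register's worth of samples past `width`; that is safe only
// because line buffers are 64-byte aligned and padded to a register
// multiple, which callers must guarantee.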
void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig,
                          const line_buf* other, const line_buf* aug,
                          ui32 repeat, bool synthesis)
{
  float a = s->irv.Aatk;
  if (synthesis)  // synthesis runs the lifting step in reverse
    a = -a;

  __m512 factor = _mm512_set1_ps(a);

  float* dst = aug->f32;
  const float* src1 = sig->f32, * src2 = other->f32;
  int i = (int)repeat;
  for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
  {
    __m512 s1 = _mm512_load_ps(src1);
    __m512 s2 = _mm512_load_ps(src2);
    __m512 d = _mm512_load_ps(dst);
    d = _mm512_add_ps(d, _mm512_mul_ps(factor, _mm512_add_ps(s1, s2)));
    _mm512_store_ps(dst, d);
  }
}
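
// Scalar form of the step above: aug[i] += a * (sig[i] + other[i]),
// with a negated during synthesis so the same loop undoes the analysis.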
void avx512_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
{
  avx512_multiply_const(aug->f32, K, (int)repeat);
}
void avx512_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
                         const line_buf* hdst, const line_buf* src,
                         ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even samples (one band) and odd samples (the other)
    {
      float* dpl = even ? ldst->f32 : hdst->f32;
      float* dph = even ? hdst->f32 : ldst->f32;
      float* sp = src->f32;
      int w = (int)width;
      avx512_deinterleave32(dpl, dph, sp, w);
    }

    // run the lifting steps in place
    float* hp = hdst->f32, * lp = ldst->f32;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = num_steps; j > 0; --j)
    {
      const lifting_step* s = atk->get_step(j - 1);
      const float a = s->irv.Aatk;

      // symmetric extension at the line boundaries
      lp[-1] = lp[0];
      lp[l_width] = lp[l_width - 1];

      const float* sp = lp;
      float* dp = hp;
      int i = (int)h_width;
      __m512 f = _mm512_set1_ps(a);
      if (even)
      {
        for (; i > 0; i -= 16, sp += 16, dp += 16)
        {
          __m512 m = _mm512_load_ps(sp);
          __m512 n = _mm512_loadu_ps(sp + 1);
          __m512 p = _mm512_load_ps(dp);
          p = _mm512_add_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
          _mm512_store_ps(dp, p);
        }
      }
      else
      {
        for (; i > 0; i -= 16, sp += 16, dp += 16)
        {
          __m512 m = _mm512_load_ps(sp);
          __m512 n = _mm512_loadu_ps(sp - 1);
          __m512 p = _mm512_load_ps(dp);
          p = _mm512_add_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
          _mm512_store_ps(dp, p);
        }
      }

      // swap the buffer roles and parity for the next step
      float* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }

    // scale: low-pass by 1/K, high-pass by K
    float K = atk->get_K();
    float K_inv = 1.0f / K;
    avx512_multiply_const(lp, K_inv, (int)l_width);
    avx512_multiply_const(hp, K, (int)h_width);
  }
  else  // width == 1: nothing to transform
  {
    if (even)
      ldst->f32[0] = src->f32[0];
    else
      hdst->f32[0] = src->f32[0] * 2.0f;
  }
}
void avx512_irv_horz_syn(const param_atk* atk, const line_buf* dst,
                         const line_buf* lsrc, const line_buf* hsrc,
                         ui32 width, bool even)
{
  if (width > 1)
  {
    bool ev = even;
    float* oth = hsrc->f32, * aug = lsrc->f32;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;

    // undo the scaling of the analysis side
    float K = atk->get_K();
    float K_inv = 1.0f / K;
    avx512_multiply_const(aug, K, (int)aug_width);
    avx512_multiply_const(oth, K_inv, (int)oth_width);

    // undo the lifting steps, in the reverse order of analysis
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = 0; j < num_steps; ++j)
    {
      const lifting_step* s = atk->get_step(j);
      const float a = s->irv.Aatk;

      // symmetric extension at the line boundaries
      oth[-1] = oth[0];
      oth[oth_width] = oth[oth_width - 1];

      const float* sp = oth;
      float* dp = aug;
      int i = (int)aug_width;
      __m512 f = _mm512_set1_ps(a);
      if (ev)
      {
        for (; i > 0; i -= 16, sp += 16, dp += 16)
        {
          __m512 m = _mm512_load_ps(sp);
          __m512 n = _mm512_loadu_ps(sp - 1);
          __m512 p = _mm512_load_ps(dp);
          p = _mm512_sub_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
          _mm512_store_ps(dp, p);
        }
      }
      else
      {
        for (; i > 0; i -= 16, sp += 16, dp += 16)
        {
          __m512 m = _mm512_load_ps(sp);
          __m512 n = _mm512_loadu_ps(sp + 1);
          __m512 p = _mm512_load_ps(dp);
          p = _mm512_sub_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
          _mm512_store_ps(dp, p);
        }
      }

      // swap the buffer roles and parity for the next step
      float* t = aug; aug = oth; oth = t;
      ev = !ev;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // interleave both bands back into a single line
    float* dp = dst->f32;
    float* spl = even ? lsrc->f32 : hsrc->f32;
    float* sph = even ? hsrc->f32 : lsrc->f32;
    int w = (int)width;
    avx512_interleave32(dp, spl, sph, w);
  }
  else  // width == 1
  {
    if (even)
      dst->f32[0] = lsrc->f32[0];
    else
      dst->f32[0] = hsrc->f32[0] * 0.5f;
  }
}
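
// avx512_irv_horz_syn mirrors avx512_irv_horz_ana step for step: the K
// scaling is undone first, then the lifting steps run in the opposite
// order with subtraction in place of addition, and the two bands are
// re-interleaved into a single line.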
void avx512_rev_vert_step32(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis)
{
  const si32 a = s->rev.Aatk;
  const si32 b = s->rev.Batk;
  const ui8 e = s->rev.Eatk;
  __m512i va = _mm512_set1_epi32(a);
  __m512i vb = _mm512_set1_epi32(b);

  si32* dst = aug->i32;
  const si32* src1 = sig->i32, * src2 = other->i32;
  // Every branch computes dst[i] -=/+= (b + a * (src1[i] + src2[i])) >> e;
  // analysis adds the update and synthesis subtracts it. The common
  // coefficient sets a == 1 and a == -1 get multiplication-free paths.
  if (a == 1)
  { // 5/3 update and any case with a == 1
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i v = _mm512_add_epi32(vb, t);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_sub_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i v = _mm512_add_epi32(vb, t);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_add_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else if (a == -1 && b == 1 && e == 1)
  { // 5/3 predict
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i w = _mm512_srai_epi32(t, e);
        d = _mm512_add_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i w = _mm512_srai_epi32(t, e);
        d = _mm512_sub_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else if (a == -1)
  { // any other case with a == -1
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i v = _mm512_sub_epi32(vb, t);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_sub_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i v = _mm512_sub_epi32(vb, t);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_add_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else
  { // general case
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i u = _mm512_mullo_epi32(va, t);
        __m512i v = _mm512_add_epi32(vb, u);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_sub_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i u = _mm512_mullo_epi32(va, t);
        __m512i v = _mm512_add_epi32(vb, u);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_add_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
}
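
// The 5/3 predict branch above exploits the identity
//   (1 - t) >> 1 == -(t >> 1)   for any integer t (arithmetic shift),
// so the b == 1, e == 1 case needs neither the multiply nor the addition
// of b; the sign of the update is folded into the add/sub choice instead.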
void avx512_rev_vert_step64(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis)
{
  const si32 a = s->rev.Aatk;
  const si32 b = s->rev.Batk;
  const ui8 e = s->rev.Eatk;
  __m512i vb = _mm512_set1_epi64(b);

  si64* dst = aug->i64;
  const si64* src1 = sig->i64, * src2 = other->i64;
  if (a == 1)
  { // 5/3 update and any case with a == 1
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i v = _mm512_add_epi64(vb, t);
        __m512i w = _mm512_srai_epi64(v, e);
        d = _mm512_sub_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i v = _mm512_add_epi64(vb, t);
        __m512i w = _mm512_srai_epi64(v, e);
        d = _mm512_add_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else if (a == -1 && b == 1 && e == 1)
  { // 5/3 predict
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i w = _mm512_srai_epi64(t, e);
        d = _mm512_add_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i w = _mm512_srai_epi64(t, e);
        d = _mm512_sub_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else if (a == -1)
  { // any other case with a == -1
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i v = _mm512_sub_epi64(vb, t);
        __m512i w = _mm512_srai_epi64(v, e);
        d = _mm512_sub_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i v = _mm512_sub_epi64(vb, t);
        __m512i w = _mm512_srai_epi64(v, e);
        d = _mm512_add_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else
  { // general case: kept scalar, since 64-bit lane multiplication
    // (_mm512_mullo_epi64) requires AVX512DQ rather than baseline AVX-512F
    if (synthesis)
      for (ui32 i = repeat; i > 0; --i)
        *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
    else
      for (ui32 i = repeat; i > 0; --i)
        *dst++ += (b + a * (*src1++ + *src2++)) >> e;
  }
}
void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig,
                          const line_buf* other, const line_buf* aug,
                          ui32 repeat, bool synthesis)
{
  // dispatch on the sample precision of the line buffers
  if (aug->flags & line_buf::LFT_32BIT)
    avx512_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
  else
    avx512_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
}
void avx512_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
                           const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even and odd samples; the deinterleave only moves
    // bits, so the float routine handles the si32 data unchanged
    {
      float* dpl = even ? ldst->f32 : hdst->f32;
      float* dph = even ? hdst->f32 : ldst->f32;
      float* sp = src->f32;
      int w = (int)width;
      avx512_deinterleave32(dpl, dph, sp, w);
    }

    si32* hp = hdst->i32, * lp = ldst->i32;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = num_steps; j > 0; --j)
    {
      const lifting_step* s = atk->get_step(j - 1);
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      __m512i va = _mm512_set1_epi32(a);
      __m512i vb = _mm512_set1_epi32(b);

      // symmetric extension at the line boundaries
      lp[-1] = lp[0];
      lp[l_width] = lp[l_width - 1];

      const si32* sp = lp;
      si32* dp = hp;
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_add_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_add_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i w = _mm512_srai_epi32(t, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i w = _mm512_srai_epi32(t, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1)
      { // any other case with a == -1
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_sub_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_sub_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else
      { // general case
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i u = _mm512_mullo_epi32(va, t);
            __m512i v = _mm512_add_epi32(vb, u);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i u = _mm512_mullo_epi32(va, t);
            __m512i v = _mm512_add_epi32(vb, u);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }

      // swap the buffer roles and parity for the next step
      si32* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }
  }
  else  // width == 1: nothing to transform
  {
    if (even)
      ldst->i32[0] = src->i32[0];
    else
      hdst->i32[0] = src->i32[0] << 1;
  }
}
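
// In scalar form, each lifting pass above is (even parity shown):
//   for (ui32 i = 0; i < h_width; ++i)
//     hp[i] += (b + a * (lp[i] + lp[i + 1])) >> e;
// with the buffer roles and parity swapping after every step.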
void avx512_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
                           const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even and odd samples; the deinterleave only moves
    // bits, so the double routine handles the si64 data unchanged
    {
      double* dpl = (double*)(even ? ldst->p : hdst->p);
      double* dph = (double*)(even ? hdst->p : ldst->p);
      double* sp = (double*)(src->p);
      int w = (int)width;
      avx512_deinterleave64(dpl, dph, sp, w);
    }

    si64* hp = hdst->i64, * lp = ldst->i64;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = num_steps; j > 0; --j)
    {
      const lifting_step* s = atk->get_step(j - 1);
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      __m512i vb = _mm512_set1_epi64(b);

      // symmetric extension at the line boundaries
      lp[-1] = lp[0];
      lp[l_width] = lp[l_width - 1];

      const si64* sp = lp;
      si64* dp = hp;
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_add_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_add_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i w = _mm512_srai_epi64(t, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i w = _mm512_srai_epi64(t, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1)
      { // any other case with a == -1
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_sub_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_sub_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else
      { // general case: kept scalar (no 64-bit lane multiply in AVX-512F)
        if (even)
          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
            *dp += (b + a * (sp[0] + sp[1])) >> e;
        else
          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
            *dp += (b + a * (sp[-1] + sp[0])) >> e;
      }

      // swap the buffer roles and parity for the next step
      si64* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }
  }
  else  // width == 1: nothing to transform
  {
    if (even)
      ldst->i64[0] = src->i64[0];
    else
      hdst->i64[0] = src->i64[0] << 1;
  }
}
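
// The 64-bit routines mirror the 32-bit ones step for step; they exist
// for sample precisions where 32-bit integer lifting could overflow.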
void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                         const line_buf* hdst, const line_buf* src,
                         ui32 width, bool even)
{
  // dispatch on the sample precision of the line buffers
  if (src->flags & line_buf::LFT_32BIT)
    avx512_rev_horz_ana32(atk, ldst, hdst, src, width, even);
  else
    avx512_rev_horz_ana64(atk, ldst, hdst, src, width, even);
}
void avx512_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
                           const line_buf* lsrc, const line_buf* hsrc,
                           ui32 width, bool even)
{
  if (width > 1)
  {
    bool ev = even;
    si32* oth = hsrc->i32, * aug = lsrc->i32;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = 0; j < num_steps; ++j)
    {
      const lifting_step* s = atk->get_step(j);
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      __m512i va = _mm512_set1_epi32(a);
      __m512i vb = _mm512_set1_epi32(b);

      // symmetric extension at the line boundaries
      oth[-1] = oth[0];
      oth[oth_width] = oth[oth_width - 1];

      const si32* sp = oth;
      si32* dp = aug;
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)aug_width;
        if (ev)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_add_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_add_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)aug_width;
        if (ev)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i w = _mm512_srai_epi32(t, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i w = _mm512_srai_epi32(t, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1)
      { // any other case with a == -1
        int i = (int)aug_width;
        if (ev)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_sub_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_sub_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else
      { // general case
        int i = (int)aug_width;
        if (ev)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i u = _mm512_mullo_epi32(va, t);
            __m512i v = _mm512_add_epi32(vb, u);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i u = _mm512_mullo_epi32(va, t);
            __m512i v = _mm512_add_epi32(vb, u);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }

      // swap the buffer roles and parity for the next step
      si32* t = aug; aug = oth; oth = t;
      ev = !ev;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // interleave both bands back into dst; the interleave only moves
    // bits, so the float routine handles the si32 data
    float* dp = dst->f32;
    float* spl = even ? lsrc->f32 : hsrc->f32;
    float* sph = even ? hsrc->f32 : lsrc->f32;
    int w = (int)width;
    avx512_interleave32(dp, spl, sph, w);
  }
  else  // width == 1
  {
    if (even)
      dst->i32[0] = lsrc->i32[0];
    else
      dst->i32[0] = hsrc->i32[0] >> 1;
  }
}
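
// In scalar form, each synthesis lifting pass above is (even parity shown):
//   for (ui32 i = 0; i < aug_width; ++i)
//     aug[i] -= (b + a * (oth[i - 1] + oth[i])) >> e;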
void avx512_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
                           const line_buf* lsrc, const line_buf* hsrc,
                           ui32 width, bool even)
{
  if (width > 1)
  {
    bool ev = even;
    si64* oth = hsrc->i64, * aug = lsrc->i64;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = 0; j < num_steps; ++j)
    {
      const lifting_step* s = atk->get_step(j);
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      __m512i vb = _mm512_set1_epi64(b);

      // symmetric extension at the line boundaries
      oth[-1] = oth[0];
      oth[oth_width] = oth[oth_width - 1];

      const si64* sp = oth;
      si64* dp = aug;
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)aug_width;
        if (ev)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_add_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_add_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)aug_width;
        if (ev)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i w = _mm512_srai_epi64(t, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i w = _mm512_srai_epi64(t, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1)
      { // any other case with a == -1
        int i = (int)aug_width;
        if (ev)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_sub_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_sub_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else
      { // general case: kept scalar (no 64-bit lane multiply in AVX-512F)
        if (ev)
          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
            *dp -= (b + a * (sp[-1] + sp[0])) >> e;
        else
          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
            *dp -= (b + a * (sp[0] + sp[1])) >> e;
      }

      // swap the buffer roles and parity for the next step
      si64* t = aug; aug = oth; oth = t;
      ev = !ev;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // interleave both bands back into dst
    double* dp = (double*)(dst->p);
    double* spl = (double*)(even ? lsrc->p : hsrc->p);
    double* sph = (double*)(even ? hsrc->p : lsrc->p);
    int w = (int)width;
    avx512_interleave64(dp, spl, sph, w);
  }
  else  // width == 1
  {
    if (even)
      dst->i64[0] = lsrc->i64[0];
    else
      dst->i64[0] = hsrc->i64[0] >> 1;
  }
}
void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst,
                         const line_buf* lsrc, const line_buf* hsrc,
                         ui32 width, bool even)
{
  // dispatch on the sample precision of the line buffers
  if (dst->flags & line_buf::LFT_32BIT)
    avx512_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
  else
    avx512_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
}
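
// A sketch of how these entry points might be installed at run time,
// assuming a hypothetical has_avx512() detection helper and function
// pointers named after the operations (both illustrative only):
//   if (has_avx512()) {
//     rev_vert_step = avx512_rev_vert_step;
//     rev_horz_ana  = avx512_rev_horz_ana;
//     rev_horz_syn  = avx512_rev_horz_syn;
//     irv_vert_step = avx512_irv_vert_step;
//     irv_horz_ana  = avx512_irv_horz_ana;
//     irv_horz_syn  = avx512_irv_horz_syn;
//   }

#endif // defined(OJPH_ARCH_X86_64)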