39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
// Scales a run of floats by a constant, in place, eight lanes per iteration.
//   p     - first float to scale; it is read with _mm256_load_ps and written
//           with _mm256_store_ps, both of which require 32-byte alignment
//   f     - multiplier, broadcast to all eight lanes of an AVX register
//   width - number of floats to scale
// The loop runs while width > 0 and steps by 8, so a width that is not a
// multiple of 8 is effectively rounded up: the last iteration reads and
// writes floats past p[width - 1].
// NOTE(review): presumably callers provide aligned buffers padded to a
// multiple of 8 floats -- confirm against the allocation code.
56 static inline void avx_multiply_const(
float* p,
float f,
int width)
58 __m256 factor = _mm256_set1_ps(f);
59 for (; width > 0; width -= 8, p += 8)
61 __m256 s = _mm256_load_ps(p);
62 _mm256_store_ps(p, _mm256_mul_ps(factor, s));
// Splits an interleaved float sequence into its even-indexed and odd-indexed
// samples, 16 input floats per iteration.
//   dpl   - receives sp[0], sp[2], sp[4], ... (8 floats per iteration)
//   dph   - receives sp[1], sp[3], sp[5], ... (8 floats per iteration)
//   sp    - interleaved input
//   width - count of input floats; the loop steps by 16 while width > 0, so
//           the final iteration may read/write beyond the nominal length
// All loads and stores are the aligned variants (32-byte alignment required).
68 void avx_deinterleave32(
float* dpl,
float* dph,
float* sp,
int width)
70 for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
72 __m256 a = _mm256_load_ps(sp);
73 __m256 b = _mm256_load_ps(sp + 8);
// Regroup 128-bit halves so each half of c/d holds samples that end up in
// the same half of the outputs: c = [s0..s3 | s8..s11], d = [s4..s7 | s12..s15].
74 __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
75 __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
// Within each 128-bit half pick elements 0 and 2 of c then of d (even
// samples) for e, and elements 1 and 3 (odd samples) for f.
76 __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
77 __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
78 _mm256_store_ps(dpl, e);
79 _mm256_store_ps(dph, f);
// Merges two float sequences into one interleaved sequence, producing 16
// output floats per iteration: dp = spl[0], sph[0], spl[1], sph[1], ...
//   dp    - interleaved output
//   spl   - source for the even output positions (8 floats per iteration)
//   sph   - source for the odd output positions (8 floats per iteration)
//   width - count of output floats; the loop steps by 16 while width > 0, so
//           the final iteration may read/write beyond the nominal length
// All loads and stores are the aligned variants (32-byte alignment required).
85 void avx_interleave32(
float* dp,
float* spl,
float* sph,
int width)
87 for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
89 __m256 a = _mm256_load_ps(spl);
90 __m256 b = _mm256_load_ps(sph);
// unpacklo/unpackhi interleave within each 128-bit half:
// c = [a0 b0 a1 b1 | a4 b4 a5 b5], d = [a2 b2 a3 b3 | a6 b6 a7 b7].
91 __m256 c = _mm256_unpacklo_ps(a, b);
92 __m256 d = _mm256_unpackhi_ps(a, b);
// Reorder the 128-bit halves so the pairs come out in sequential order:
// e = pairs 0..3, f = pairs 4..7.
93 __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
94 __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
95 _mm256_store_ps(dp, e);
96 _mm256_store_ps(dp + 8, f);
// Tail of a parameter list followed by the body of a lifting update across
// three lines of samples: aug += a * (sig + other), eight floats at a time.
// NOTE(review): the start of the signature, the declaration of the loop
// counter `i`, and any use of `repeat` / `synthesis` are not visible in this
// excerpt -- confirm against the full file before relying on this summary.
102 const line_buf* other,
const line_buf* aug,
103 ui32 repeat,
bool synthesis)
// Lifting coefficient taken from the step descriptor.
105 float a = s->irv.Aatk;
109 __m256 factor = _mm256_set1_ps(a);
// dst is both read and written below; src1/src2 are read-only neighbours.
111 float* dst = aug->f32;
112 const float* src1 = sig->f32, * src2 = other->f32;
// Aligned loads/stores throughout; the loop steps by 8 while i > 0, so a
// count that is not a multiple of 8 is rounded up to whole vectors.
114 for ( ; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
116 __m256 s1 = _mm256_load_ps(src1);
117 __m256 s2 = _mm256_load_ps(src2);
118 __m256 d = _mm256_load_ps(dst);
119 d = _mm256_add_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
120 _mm256_store_ps(dst, d);
// Scales the float samples of `aug` by K in place, passing `repeat` (cast to
// int) as the sample count to avx_multiply_const.
// NOTE(review): the enclosing signature is not visible at this position.
127 avx_multiply_const(aug->f32, K, (
int)repeat);
// Tail of a parameter list followed by fragments of a horizontal lifting
// pass that splits `src` into two destination lines (`ldst`, `hdst`).
// NOTE(review): several lines of this body are not visible in this excerpt
// (declarations of `w`, `a`, `dp`, and the conditions that choose between the
// two inner loops and between the two single-sample assignments at the end)
// -- confirm against the full file.
132 const line_buf* hdst,
const line_buf* src,
133 ui32 width,
bool even)
// `even` decides which destination receives the even-indexed samples of src
// (dpl) and which receives the odd-indexed ones (dph).
139 float* dpl = even ? ldst->f32 : hdst->f32;
140 float* dph = even ? hdst->f32 : ldst->f32;
141 float* sp = src->f32;
143 avx_deinterleave32(dpl, dph, sp, w);
147 float* hp = hdst->f32, * lp = ldst->f32;
// width/2 rounded up for the low line when `even` is true, rounded down
// otherwise; the high line gets the complementary rounding.
148 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
149 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
150 ui32 num_steps = atk->get_num_steps();
// Steps are visited from num_steps down to 1.
151 for (
ui32 j = num_steps; j > 0; --j)
// Replicate the last sample one position past the end so the neighbour
// reads below have a defined value at the right boundary.
158 lp[l_width] = lp[l_width - 1];
160 const float* sp = lp;
162 int i = (int)h_width;
163 __m256 f = _mm256_set1_ps(a);
// Variant 1: dp[k] += a * (sp[k] + sp[k + 1]). The shifted neighbour is not
// 32-byte aligned, hence the unaligned load.
166 for (; i > 0; i -= 8, sp += 8, dp += 8)
168 __m256 m = _mm256_load_ps(sp);
169 __m256 n = _mm256_loadu_ps(sp + 1);
170 __m256 p = _mm256_load_ps(dp);
171 p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
172 _mm256_store_ps(dp, p);
// Variant 2: dp[k] += a * (sp[k] + sp[k - 1]).
177 for (; i > 0; i -= 8, sp += 8, dp += 8)
179 __m256 m = _mm256_load_ps(sp);
180 __m256 n = _mm256_loadu_ps(sp - 1);
181 __m256 p = _mm256_load_ps(dp);
182 p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
183 _mm256_store_ps(dp, p);
// Swap the two line pointers and their widths for the next step.
188 float* t = lp; lp = hp; hp = t;
190 ui32 w = l_width; l_width = h_width; h_width = w;
// Final scaling: lp by 1/K and hp by K.
194 float K = atk->get_K();
195 float K_inv = 1.0f / K;
196 avx_multiply_const(lp, K_inv, (
int)l_width);
197 avx_multiply_const(hp, K, (
int)h_width);
// Two alternative single-sample assignments: a plain copy into ldst, or the
// sample doubled into hdst. The selecting condition is not visible here.
202 ldst->f32[0] = src->f32[0];
204 hdst->f32[0] = src->f32[0] * 2.0f;
// Tail of a parameter list followed by fragments of a horizontal lifting
// pass that combines two source lines (`lsrc`, `hsrc`) into `dst`.
// NOTE(review): several lines of this body are not visible in this excerpt
// (declarations of `w`, `a`, `dp` inside the step loop, and the conditions
// that choose between the two inner loops and between the two single-sample
// assignments at the end) -- confirm against the full file.
210 const line_buf* lsrc,
const line_buf* hsrc,
211 ui32 width,
bool even)
216 float* oth = hsrc->f32, * aug = lsrc->f32;
// width/2 rounded up for `aug` when `even` is true, rounded down otherwise;
// `oth` gets the complementary rounding.
217 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
218 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
// Initial scaling, done in place on the source lines: aug by K, oth by 1/K.
221 float K = atk->get_K();
222 float K_inv = 1.0f / K;
223 avx_multiply_const(aug, K, (
int)aug_width);
224 avx_multiply_const(oth, K_inv, (
int)oth_width);
// Steps are visited in increasing order, 0 .. num_steps - 1.
228 ui32 num_steps = atk->get_num_steps();
229 for (
ui32 j = 0; j < num_steps; ++j)
// Replicate the last sample one position past the end so the neighbour
// reads below have a defined value at the right boundary.
236 oth[oth_width] = oth[oth_width - 1];
238 const float* sp = oth;
240 int i = (int)aug_width;
241 __m256 f = _mm256_set1_ps(a);
// Variant 1: dp[k] -= a * (sp[k] + sp[k - 1]). The shifted neighbour is not
// 32-byte aligned, hence the unaligned load.
244 for (; i > 0; i -= 8, sp += 8, dp += 8)
246 __m256 m = _mm256_load_ps(sp);
247 __m256 n = _mm256_loadu_ps(sp - 1);
248 __m256 p = _mm256_load_ps(dp);
249 p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
250 _mm256_store_ps(dp, p);
// Variant 2: dp[k] -= a * (sp[k] + sp[k + 1]).
255 for (; i > 0; i -= 8, sp += 8, dp += 8)
257 __m256 m = _mm256_load_ps(sp);
258 __m256 n = _mm256_loadu_ps(sp + 1);
259 __m256 p = _mm256_load_ps(dp);
260 p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
261 _mm256_store_ps(dp, p);
// Swap the two line pointers and their widths for the next step.
266 float* t = aug; aug = oth; oth = t;
268 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// `even` decides which source line supplies the even output positions (spl)
// and which supplies the odd ones (sph).
273 float* dp = dst->f32;
274 float* spl = even ? lsrc->f32 : hsrc->f32;
275 float* sph = even ? hsrc->f32 : lsrc->f32;
277 avx_interleave32(dp, spl, sph, w);
// Two alternative single-sample assignments: a plain copy from lsrc, or the
// hsrc sample halved. The selecting condition is not visible here.
282 dst->f32[0] = lsrc->f32[0];
284 dst->f32[0] = hsrc->f32[0] * 0.5f;
// NOTE(review): the four lines below are bare signatures with neither a body
// nor a terminating ';'. They name the entry points whose body fragments
// appear earlier in this excerpt; treat them as a listing, not as code that
// compiles on its own -- confirm their purpose against the full file.
void avx_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void avx_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)