39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
// Splits an interleaved line of floats at sp into its even-indexed and
// odd-indexed samples, consuming eight input floats per iteration.
//   dpl   - destination pointer advanced by 4 floats per iteration
//   dph   - destination pointer advanced by 4 floats per iteration
//   sp    - interleaved source, read with aligned 128-bit loads
//   width - number of input samples; the loop runs while width > 0 in steps
//           of 8, so up to 7 samples past width may be read
// NOTE(review): the stores to dpl/dph are not visible in this excerpt;
// presumably c goes to dpl and d to dph - confirm against the full file.
57 void sse_deinterleave32(
float* dpl,
float* dph,
float* sp,
int width)
59 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
// _mm_load_ps is the aligned load form, so sp must be 16-byte aligned.
61 __m128 a = _mm_load_ps(sp);
62 __m128 b = _mm_load_ps(sp + 4);
// c = { a[0], a[2], b[0], b[2] } : samples at even positions of the 8 inputs
63 __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
// d = { a[1], a[3], b[1], b[3] } : samples at odd positions of the 8 inputs
64 __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
// Merges two half-length float lines into one interleaved line, producing
// eight output floats per iteration.
//   dp    - interleaved destination, advanced by 8 floats per iteration
//   spl   - source of the samples placed at even output positions
//   sph   - source of the samples placed at odd output positions
//   width - number of output samples; the loop runs while width > 0 in steps
//           of 8, so up to 7 samples past width may be written
// NOTE(review): only the store of d to dp + 4 is visible in this excerpt;
// presumably c is stored to dp - confirm against the full file.
72 void sse_interleave32(
float* dp,
float* spl,
float* sph,
int width) \
74 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
76 __m128 a = _mm_load_ps(spl);
77 __m128 b = _mm_load_ps(sph);
// c = { a[0], b[0], a[1], b[1] }
78 __m128 c = _mm_unpacklo_ps(a, b);
// d = { a[2], b[2], a[3], b[3] }
79 __m128 d = _mm_unpackhi_ps(a, b);
// second half of the 8 interleaved outputs (aligned store)
81 _mm_store_ps(dp + 4, d);
// Multiplies the floats at p by the constant f, in place, four at a time.
//   p     - buffer updated in place; accessed with aligned 128-bit load/store
//   f     - scale factor
//   width - number of samples; the loop runs while width > 0 in steps of 4,
//           so up to 3 samples past width are also scaled
86 static inline void sse_multiply_const(
float* p,
float f,
int width)
// broadcast f into all four lanes once, outside the loop
88 __m128 factor = _mm_set1_ps(f);
89 for (; width > 0; width -= 4, p += 4)
91 __m128 s = _mm_load_ps(p);
92 _mm_store_ps(p, _mm_mul_ps(factor, s));
// Fragment of a lifting-step routine: for each group of four floats it
// computes aug += a * (sig + other), where a is read from s->irv.Aatk.
// NOTE(review): the head of the signature (function name and the s / sig
// parameters), the use of `synthesis`, and the initialisation of `i` are not
// visible in this excerpt; presumably `synthesis` changes the sign of `a` and
// `i` starts at `repeat` - confirm against the full file.
98 const line_buf* other,
const line_buf* aug,
99 ui32 repeat,
bool synthesis)
// lifting coefficient for this step
101 float a = s->irv.Aatk;
105 __m128 factor = _mm_set1_ps(a);
// aug is the line being updated; sig and other are the two read-only inputs
107 float* dst = aug->f32;
108 const float* src1 = sig->f32, * src2 = other->f32;
110 for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
// all three buffers are accessed with aligned loads/stores
112 __m128 s1 = _mm_load_ps(src1);
113 __m128 s2 = _mm_load_ps(src2);
114 __m128 d = _mm_load_ps(dst);
// d += factor * (s1 + s2)
115 d = _mm_add_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
116 _mm_store_ps(dst, d);
// Scales the floats of aug in place by K; repeat is narrowed from ui32 to
// int to match sse_multiply_const's width parameter.
// NOTE(review): the enclosing function header is not visible at this point
// in the excerpt.
123 sse_multiply_const(aug->f32, K, (
int)repeat);
// Fragment of a horizontal analysis routine: deinterleaves src into the
// ldst / hdst lines, runs the lifting steps reported by atk over the two
// lines, then scales them by 1/K and K.
// NOTE(review): the head of the signature, the branch conditions between
// the alternative code paths, and the definitions of `w`, `a` and `dp` are
// not visible in this excerpt; comments below describe only what is shown.
128 const line_buf* hdst,
const line_buf* src,
129 ui32 width,
bool even)
// `even` selects which destination receives which half of the samples
135 float* dpl = even ? ldst->f32 : hdst->f32;
136 float* dph = even ? hdst->f32 : ldst->f32;
137 float* sp = src->f32;
139 sse_deinterleave32(dpl, dph, sp, w);
143 float* hp = hdst->f32, * lp = ldst->f32;
// when width is odd, the extra sample belongs to the low line if `even` is
// true and to the high line otherwise
144 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
145 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
146 ui32 num_steps = atk->get_num_steps();
// steps are visited with j counting down from num_steps to 1
147 for (
ui32 j = num_steps; j > 0; --j)
// replicate the last sample one position past the end of lp, so a read of
// lp[l_width] returns a defined value
154 lp[l_width] = lp[l_width - 1];
156 const float* sp = lp;
158 int i = (int)h_width;
159 __m128 f = _mm_set1_ps(a);
// variant 1: each output combines sp[k] and its right neighbour sp[k + 1]
162 for (; i > 0; i -= 4, sp += 4, dp += 4)
164 __m128 m = _mm_load_ps(sp);
// sp + 1 is not 16-byte aligned, hence the unaligned load
165 __m128 n = _mm_loadu_ps(sp + 1);
166 __m128 p = _mm_load_ps(dp);
// p += f * (m + n)
167 p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
// variant 2: each output combines sp[k] and its left neighbour sp[k - 1]
173 for (; i > 0; i -= 4, sp += 4, dp += 4)
175 __m128 m = _mm_load_ps(sp);
176 __m128 n = _mm_loadu_ps(sp - 1);
177 __m128 p = _mm_load_ps(dp);
178 p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
// swap the roles (and lengths) of the two lines for the next step
184 float* t = lp; lp = hp; hp = t;
186 ui32 w = l_width; l_width = h_width; h_width = w;
// final scaling: lp by 1/K, hp by K
190 float K = atk->get_K();
191 float K_inv = 1.0f / K;
192 sse_multiply_const(lp, K_inv, (
int)l_width);
193 sse_multiply_const(hp, K, (
int)h_width);
// NOTE(review): the two assignments below look like alternative handling of
// a single-sample line (copy to ldst, or doubled into hdst); the selecting
// conditions are not visible here.
198 ldst->f32[0] = src->f32[0];
200 hdst->f32[0] = src->f32[0] * 2.0f;
// Fragment of a horizontal synthesis routine: scales the lsrc / hsrc lines
// by K and 1/K, runs the lifting steps reported by atk with subtraction,
// then interleaves the two lines into dst.
// NOTE(review): the head of the signature, the branch conditions between
// the alternative code paths, and the definitions of `w`, `a` and `dp`
// inside the step loop are not visible in this excerpt.
206 const line_buf* lsrc,
const line_buf* hsrc,
207 ui32 width,
bool even)
// aug starts as the low line and oth as the high line; they swap each step
212 float* oth = hsrc->f32, * aug = lsrc->f32;
// when width is odd, the extra sample belongs to aug if `even` is true and
// to oth otherwise
213 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
214 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
// scaling is applied before the lifting steps: aug by K, oth by 1/K
217 float K = atk->get_K();
218 float K_inv = 1.0f / K;
219 sse_multiply_const(aug, K, (
int)aug_width);
220 sse_multiply_const(oth, K_inv, (
int)oth_width);
224 ui32 num_steps = atk->get_num_steps();
// steps are visited with j counting up from 0 to num_steps - 1
225 for (
ui32 j = 0; j < num_steps; ++j)
// replicate the last sample one position past the end of oth, so a read of
// oth[oth_width] returns a defined value
232 oth[oth_width] = oth[oth_width - 1];
234 const float* sp = oth;
236 int i = (int)aug_width;
237 __m128 f = _mm_set1_ps(a);
// variant 1: each output combines sp[k] and its left neighbour sp[k - 1]
240 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
242 __m128 m = _mm_load_ps(sp);
// sp - 1 is not 16-byte aligned, hence the unaligned load
243 __m128 n = _mm_loadu_ps(sp - 1);
244 __m128 p = _mm_load_ps(dp);
// p -= f * (m + n)
245 p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
// variant 2: each output combines sp[k] and its right neighbour sp[k + 1]
251 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
253 __m128 m = _mm_load_ps(sp);
254 __m128 n = _mm_loadu_ps(sp + 1);
255 __m128 p = _mm_load_ps(dp);
256 p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
// swap the roles (and lengths) of the two lines for the next step
262 float* t = aug; aug = oth; oth = t;
264 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// `even` selects which source supplies which half of the interleaved output
269 float* dp = dst->f32;
270 float* spl = even ? lsrc->f32 : hsrc->f32;
271 float* sph = even ? hsrc->f32 : lsrc->f32;
273 sse_interleave32(dp, spl, sph, w);
// NOTE(review): the two assignments below look like alternative handling of
// a single-sample line (copy from lsrc, or halved from hsrc); the selecting
// conditions are not visible here.
278 dst->f32[0] = lsrc->f32[0];
280 dst->f32[0] = hsrc->f32[0] * 0.5f;
// NOTE(review): the four lines below are bare signatures with no bodies or
// terminators in this excerpt; the descriptions are based on the parameter
// lists only and should be confirmed against the full definitions.

// Takes a scale factor K, a line buffer aug and a sample count repeat.
void sse_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
// Takes a lifting step s, three line buffers (sig, other, aug), a sample
// count repeat and a synthesis flag.
void sse_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
// Takes kernel parameters atk, two destination lines (ldst, hdst), one
// source line src, the sample count width and an `even` flag.
void sse_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
// Takes kernel parameters atk, one destination line dst, two source lines
// (lsrc, hsrc), the sample count width and an `even` flag.
void sse_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)