39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
// NOTE(review): partial view. The start of the parameter list, the braces,
// and the set-up of alpha_rf / alpha_gf / alpha_bf / beta_cbf / beta_crf are
// not visible in this span. The parameter tail below matches the
// sse_ict_forward signature that appears elsewhere in the file; TODO confirm.
54 float *y,
float *cb,
float *cr,
ui32 repeat)
// The trip count is repeat rounded up to a multiple of four, divided by four.
// Each iteration handles four floats, so the final iteration still reads and
// writes a full group of four when repeat is not a multiple of 4.
// Presumably the buffers are padded for that; verify against the callers.
61 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i)
// _mm_load_ps / _mm_store_ps are the aligned forms, so every pointer used
// here has to be 16-byte aligned.
63 __m128 mr = _mm_load_ps(r);
64 __m128 mb = _mm_load_ps(b);
// my = alpha_rf * r + alpha_gf * g + alpha_bf * b, accumulated in three steps.
65 __m128 my = _mm_mul_ps(alpha_rf, mr);
66 my = _mm_add_ps(my, _mm_mul_ps(alpha_gf, _mm_load_ps(g)));
67 my = _mm_add_ps(my, _mm_mul_ps(alpha_bf, mb));
// NOTE(review): no store of my through y is visible in this view, although y
// is advanced below. Confirm the full file writes y before the two stores
// that follow.
// cb = beta_cbf * (b - my)
69 _mm_store_ps(cb, _mm_mul_ps(beta_cbf, _mm_sub_ps(mb, my)));
// cr = beta_crf * (r - my)
70 _mm_store_ps(cr, _mm_mul_ps(beta_crf, _mm_sub_ps(mr, my)));
// Step all six pointers past the four lanes just processed.
72 r += 4; g += 4; b += 4;
73 y += 4; cb += 4; cr += 4;
// NOTE(review): partial view. The start of the parameter list, the braces,
// and the set-up of gamma_cr2g / gamma_cb2g / gamma_cr2r / gamma_cb2b are not
// visible in this span. The parameter tail below matches the sse_ict_backward
// signature that appears elsewhere in the file; TODO confirm.
79 float *r,
float *g,
float *b,
ui32 repeat)
// The trip count is repeat rounded up to a multiple of four, divided by four.
// Each iteration handles four floats, so the final iteration still reads and
// writes a full group of four when repeat is not a multiple of 4.
// Presumably the buffers are padded for that; verify against the callers.
85 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i)
// Aligned loads: y, cb and cr have to be 16-byte aligned, as do the r, g, b
// destinations of the aligned stores below.
87 __m128 my = _mm_load_ps(y);
88 __m128 mcr = _mm_load_ps(cr);
89 __m128 mcb = _mm_load_ps(cb);
// g = y - gamma_cr2g * cr - gamma_cb2g * cb, computed as two subtractions.
90 __m128 mg = _mm_sub_ps(my, _mm_mul_ps(gamma_cr2g, mcr));
91 _mm_store_ps(g, _mm_sub_ps(mg, _mm_mul_ps(gamma_cb2g, mcb)));
// r = y + gamma_cr2r * cr
92 _mm_store_ps(r, _mm_add_ps(my, _mm_mul_ps(gamma_cr2r, mcr)));
// b = y + gamma_cb2b * cb
93 _mm_store_ps(b, _mm_add_ps(my, _mm_mul_ps(gamma_cb2b, mcb)));
// Step all six pointers past the four lanes just processed.
95 y += 4; cb += 4; cr += 4;
96 r += 4; g += 4; b += 4;
// Signature only; no body follows this line in the visible text.
// Per the const qualifiers, r, g and b are read-only inputs and y, cb and cr
// are writable outputs; repeat is an unsigned count.
// NOTE(review): a loop fragment elsewhere in the file with a matching
// parameter tail processes four floats per step with aligned SSE loads and
// stores. It presumably belongs to this function; TODO confirm.
void sse_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
// Signature only; no body follows this line in the visible text.
// Per the const qualifiers, y, cb and cr are read-only inputs and r, g and b
// are writable outputs; repeat is an unsigned count.
// NOTE(review): a loop fragment elsewhere in the file with a matching
// parameter tail processes four floats per step with aligned SSE loads and
// stores. It presumably belongs to this function; TODO confirm.
void sse_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
// NOTE(review): these nine declarations are cut off before any initializer
// or terminator in this view, so their values cannot be documented here.
// Lower-case identifiers with the same spellings (alpha_rf, alpha_gf,
// alpha_bf, beta_cbf, beta_crf, gamma_cr2r, gamma_cr2g, gamma_cb2g,
// gamma_cb2b) are used as vector coefficients in the transform loops.
// Presumably those are broadcast from these scalars; verify in the full file.
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF