OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_colour_avx2.cpp
//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
// File: ojph_colour_avx2.cpp
// Author: Aous Naman
// Date: 11 October 2019
//***************************************************************************/

#include "ojph_arch.h"
#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)

#include <climits>
#include <cmath>

#include "ojph_defs.h"
#include "ojph_mem.h"
#include "ojph_colour.h"

#include <immintrin.h>

namespace ojph {
  namespace local {

    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
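    // AVX2 lacks a 64-bit arithmetic right shift, so the helper below
    // builds one from the logical shift: _mm256_srli_epi64 shifts zeros
    // into the top bits, and xor-ing with, then subtracting,
    // m = 1ULL << (63 - amt) replicates the shifted sign bit across them.
    // For example, with amt = 2, a = -8 shifts logically to
    // 0x3FFFFFFFFFFFFFFE, and the xor/sub fix-up restores -2; see how
    // avx2_rct_forward/backward below pass in m for a shift by 2.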
    static inline
    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
    {
      // note that m must be obtained using
      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
      __m256i x = _mm256_srli_epi64(a, amt);
      x = _mm256_xor_si256(x, m);
      __m256i result = _mm256_sub_epi64(x, m);
      return result;
    }
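
    // Adds a fixed shift to every sample while converting between line
    // buffer widths.  Widening sign-extends each 128-bit half with
    // _mm256_cvtepi32_epi64; narrowing keeps the low 32 bits of every
    // 64-bit lane via _mm256_shuffle_epi32 plus masking, then reorders
    // the four 64-bit lanes with _mm256_permute4x64_epi64.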
    void avx2_rev_convert(const line_buf *src_line,
                          const ui32 src_line_offset,
                          line_buf *dst_line,
                          const ui32 dst_line_offset,
                          si64 shift, ui32 width)
    {
      if (src_line->flags & line_buf::LFT_32BIT)
      {
        if (dst_line->flags & line_buf::LFT_32BIT)
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si32 *dp = dst_line->i32 + dst_line_offset;
          __m256i sh = _mm256_set1_epi32((si32)shift);
          for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
          {
            __m256i s = _mm256_loadu_si256((__m256i*)sp);
            s = _mm256_add_epi32(s, sh);
            _mm256_storeu_si256((__m256i*)dp, s);
          }
        }
        else
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si64 *dp = dst_line->i64 + dst_line_offset;
          __m256i sh = _mm256_set1_epi64x(shift);
          for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
          {
            __m256i s, t;
            s = _mm256_loadu_si256((__m256i*)sp);

            t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0));
            t = _mm256_add_epi64(t, sh);
            _mm256_storeu_si256((__m256i*)dp, t);

            t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1));
            t = _mm256_add_epi64(t, sh);
            _mm256_storeu_si256((__m256i*)dp + 1, t);
          }
        }
      }
      else
      {
        assert(src_line->flags & line_buf::LFT_64BIT);
        assert(dst_line->flags & line_buf::LFT_32BIT);
        const si64 *sp = src_line->i64 + src_line_offset;
        si32 *dp = dst_line->i32 + dst_line_offset;
        __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                             0, (si64)ULLONG_MAX);
        __m256i sh = _mm256_set1_epi64x(shift);
        for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
        {
          __m256i s, t;
          s = _mm256_loadu_si256((__m256i*)sp);
          s = _mm256_add_epi64(s, sh);

          t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
          t = _mm256_and_si256(low_bits, t);

          s = _mm256_loadu_si256((__m256i*)sp + 1);
          s = _mm256_add_epi64(s, sh);

          s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
          s = _mm256_andnot_si256(low_bits, s);

          t = _mm256_or_si256(s, t);
          t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
          _mm256_storeu_si256((__m256i*)dp, t);
        }
      }
    }
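
    // Variant of the conversion above for the type-3 nonlinearity (NLT):
    // non-negative samples pass through unchanged, while a negative
    // sample v is remapped to (-shift - v), all in one branchless pass.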
    void avx2_rev_convert_nlt_type3(const line_buf *src_line,
                                    const ui32 src_line_offset,
                                    line_buf *dst_line,
                                    const ui32 dst_line_offset,
                                    si64 shift, ui32 width)
    {
      if (src_line->flags & line_buf::LFT_32BIT)
      {
        if (dst_line->flags & line_buf::LFT_32BIT)
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si32 *dp = dst_line->i32 + dst_line_offset;
          __m256i sh = _mm256_set1_epi32((si32)(-shift));
          __m256i zero = _mm256_setzero_si256();
          for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
          {
            __m256i s = _mm256_loadu_si256((__m256i*)sp);
            __m256i c = _mm256_cmpgt_epi32(zero, s);  // 0xFFFFFFFF for -ve val
            __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value
            v_m_sh = _mm256_and_si256(c, v_m_sh);     // keep only -shift-val
            s = _mm256_andnot_si256(c, s);            // keep only +ve or 0
            s = _mm256_or_si256(s, v_m_sh);           // combine
            _mm256_storeu_si256((__m256i*)dp, s);
          }
        }
        else
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si64 *dp = dst_line->i64 + dst_line_offset;
          __m256i sh = _mm256_set1_epi64x(-shift);
          __m256i zero = _mm256_setzero_si256();
          for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
          {
            __m256i s, t, u0, u1, c, v_m_sh;
            s = _mm256_loadu_si256((__m256i*)sp);

            t = _mm256_cmpgt_epi32(zero, s);  // find -ve 32bit -1
            u0 = _mm256_unpacklo_epi32(s, t); // correct 64bit data
            c = _mm256_unpacklo_epi32(t, t);  // 64bit -1 for -ve value

            v_m_sh = _mm256_sub_epi64(sh, u0);    // - shift - value
            v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value
            u0 = _mm256_andnot_si256(c, u0);      // keep only +ve or 0
            u0 = _mm256_or_si256(u0, v_m_sh);     // combine

            u1 = _mm256_unpackhi_epi32(s, t); // correct 64bit data
            c = _mm256_unpackhi_epi32(t, t);  // 64bit -1 for -ve value

            v_m_sh = _mm256_sub_epi64(sh, u1);    // - shift - value
            v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value
            u1 = _mm256_andnot_si256(c, u1);      // keep only +ve or 0
            u1 = _mm256_or_si256(u1, v_m_sh);     // combine

            t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0);
            _mm256_storeu_si256((__m256i*)dp, t);

            t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1);
            _mm256_storeu_si256((__m256i*)dp + 1, t);
          }
        }
      }
      else
      {
        assert(src_line->flags & line_buf::LFT_64BIT);
        assert(dst_line->flags & line_buf::LFT_32BIT);
        const si64 *sp = src_line->i64 + src_line_offset;
        si32 *dp = dst_line->i32 + dst_line_offset;
        __m256i sh = _mm256_set1_epi64x(-shift);
        __m256i zero = _mm256_setzero_si256();
        __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                              0, (si64)ULLONG_MAX);
        for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
        {
          // s for source, t for target, p for positive, n for negative,
          // m for mask, and tm for temp
          __m256i s, t, p, n, m, tm;
          s = _mm256_loadu_si256((__m256i*)sp);

          m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value
          tm = _mm256_sub_epi64(sh, s);    // - shift - value
          n = _mm256_and_si256(m, tm);     // -ve
          p = _mm256_andnot_si256(m, s);   // +ve
          tm = _mm256_or_si256(n, p);
          tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
          t = _mm256_and_si256(half_mask, tm);

          s = _mm256_loadu_si256((__m256i*)sp + 1);
          m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value
          tm = _mm256_sub_epi64(sh, s);    // - shift - value
          n = _mm256_and_si256(m, tm);     // -ve
          p = _mm256_andnot_si256(m, s);   // +ve
          tm = _mm256_or_si256(n, p);
          tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
          tm = _mm256_andnot_si256(half_mask, tm);

          t = _mm256_or_si256(t, tm);
          t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
          _mm256_storeu_si256((__m256i*)dp, t);
        }
      }
    }
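
    // The next two helpers implement a branchless select: the float
    // compare yields an all-ones or all-zeros mask per lane, which
    // and/andnot/or then use to blend between the two integer arguments.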
    static inline
    __m256i ojph_mm256_max_ge_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
    {
      // Any of _CMP_NLT_UQ, _CMP_GE_OQ, _CMP_GE_OS, or _CMP_NLT_US would
      // work here; it is not clear to me which is best
      __m256 ct = _mm256_cmp_ps(x, y, _CMP_NLT_UQ); // 0xFFFFFFFF for x >= y
      __m256i c = _mm256_castps_si256(ct);   // does not generate any code
      __m256i d = _mm256_and_si256(c, a);    // keep only a, where x >= y
      __m256i e = _mm256_andnot_si256(c, b); // keep only b, where x < y
      return _mm256_or_si256(d, e);          // combine
    }

    static inline
    __m256i ojph_mm256_min_lt_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
    {
      // Any of _CMP_LT_OQ, _CMP_NGE_UQ, _CMP_LT_OS, or _CMP_NGE_US would
      // work here; it is not clear to me which is best
      __m256 ct = _mm256_cmp_ps(x, y, _CMP_NGE_UQ); // 0xFFFFFFFF for x < y
      __m256i c = _mm256_castps_si256(ct);   // does not generate any code
      __m256i d = _mm256_and_si256(c, a);    // keep only a, where x < y
      __m256i e = _mm256_andnot_si256(c, b); // keep only b, where x >= y
      return _mm256_or_si256(d, e);          // combine
    }
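
    // Scales floats in [-0.5f, 0.5f) by 2^bit_depth and rounds to
    // integers, clamping values outside that interval to the limits the
    // bit depth can hold.  Unsigned data then gets 2^(bit_depth - 1)
    // added to undo the signed centering; for signed data compiled with
    // NLT_TYPE3, a negative result v is remapped to
    // (-2^(bit_depth - 1) - 1) - v.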
    template<bool NLT_TYPE3>
    static inline
    void local_avx2_irv_convert_to_integer(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      assert((src_line->flags & line_buf::LFT_32BIT) &&
             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
             (dst_line->flags & line_buf::LFT_32BIT) &&
             (dst_line->flags & line_buf::LFT_INTEGER));

      assert(bit_depth <= 32);
      const float* sp = src_line->f32;
      si32* dp = dst_line->i32 + dst_line_offset;
      // Converting to integer can exceed the dynamic range of a 32-bit
      // integer; therefore, care must be exercised.
      // We check whether the floating-point number lies outside the
      // half-open interval [-0.5f, 0.5f).  If so, we limit the resulting
      // integer to the maximum/minimum that the bit depth supports.
      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
      __m256 mul = _mm256_set1_ps((float)(1ull << bit_depth));
      __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit); // val < upper
      __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit); // val >= lower
      __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
      __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));

      if (is_signed)
      {
        __m256i zero = _mm256_setzero_si256();
        __m256i bias =
          _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
          __m256 t = _mm256_loadu_ps(sp);
          t = _mm256_mul_ps(t, mul);
          __m256i u = _mm256_cvtps_epi32(t);
          u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
          u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
          if (NLT_TYPE3)
          {
            __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val
            __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value
            neg = _mm256_and_si256(c, neg);          // keep only - bias - val
            u = _mm256_andnot_si256(c, u);           // keep only +ve or 0
            u = _mm256_or_si256(neg, u);             // combine
          }
          _mm256_storeu_si256((__m256i*)dp, u);
        }
      }
      else
      {
        __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
          __m256 t = _mm256_loadu_ps(sp);
          t = _mm256_mul_ps(t, mul);
          __m256i u = _mm256_cvtps_epi32(t);
          u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
          u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
          u = _mm256_add_epi32(u, half);
          _mm256_storeu_si256((__m256i*)dp, u);
        }
      }
    }
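
    // Thin entry points: each pins the NLT_TYPE3 template flag at compile
    // time, so the per-sample branch above folds away in the instantiation.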
    void avx2_irv_convert_to_integer(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_avx2_irv_convert_to_integer<false>(src_line, dst_line,
        dst_line_offset, bit_depth, is_signed, width);
    }

    void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_avx2_irv_convert_to_integer<true>(src_line, dst_line,
        dst_line_offset, bit_depth, is_signed, width);
    }
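
    // Inverse of convert_to_integer: multiplies integer samples by
    // 2^-bit_depth to land in [-0.5f, 0.5f).  Unsigned data is first
    // re-centered by subtracting 2^(bit_depth - 1); with NLT_TYPE3 set,
    // the negative-sample remapping (an involution) is undone before the
    // conversion.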
    template<bool NLT_TYPE3>
    static inline
    void local_avx2_irv_convert_to_float(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      assert((src_line->flags & line_buf::LFT_32BIT) &&
             (src_line->flags & line_buf::LFT_INTEGER) &&
             (dst_line->flags & line_buf::LFT_32BIT) &&
             (dst_line->flags & line_buf::LFT_INTEGER) == 0);

      assert(bit_depth <= 32);
      __m256 mul = _mm256_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));

      const si32* sp = src_line->i32 + src_line_offset;
      float* dp = dst_line->f32;
      if (is_signed)
      {
        __m256i zero = _mm256_setzero_si256();
        __m256i bias =
          _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
          __m256i t = _mm256_loadu_si256((__m256i*)sp);
          if (NLT_TYPE3)
          {
            __m256i c = _mm256_cmpgt_epi32(zero, t); // 0xFFFFFFFF for -ve val
            __m256i neg = _mm256_sub_epi32(bias, t); // - bias - value
            neg = _mm256_and_si256(c, neg);          // keep only - bias - val
            c = _mm256_andnot_si256(c, t);           // keep only +ve or 0
            t = _mm256_or_si256(neg, c);             // combine
          }
          __m256 v = _mm256_cvtepi32_ps(t);
          v = _mm256_mul_ps(v, mul);
          _mm256_storeu_ps(dp, v);
        }
      }
      else
      {
        __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
          __m256i t = _mm256_loadu_si256((__m256i*)sp);
          t = _mm256_sub_epi32(t, half);
          __m256 v = _mm256_cvtepi32_ps(t);
          v = _mm256_mul_ps(v, mul);
          _mm256_storeu_ps(dp, v);
        }
      }
    }

    void avx2_irv_convert_to_float(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_avx2_irv_convert_to_float<false>(src_line, src_line_offset,
        dst_line, bit_depth, is_signed, width);
    }

    void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_avx2_irv_convert_to_float<true>(src_line, src_line_offset,
        dst_line, bit_depth, is_signed, width);
    }
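
    // Forward reversible color transform (RCT):
    //   Y  = (R + 2G + B) >> 2,   Cb = B - G,   Cr = R - G
    // computed directly on 32-bit lines, or widened to 64-bit output
    // lines when extra headroom is needed.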
    void avx2_rct_forward(const line_buf *r,
                          const line_buf *g,
                          const line_buf *b,
                          line_buf *y, line_buf *cb, line_buf *cr,
                          ui32 repeat)
    {
      assert((y->flags & line_buf::LFT_INTEGER) &&
             (cb->flags & line_buf::LFT_INTEGER) &&
             (cr->flags & line_buf::LFT_INTEGER) &&
             (r->flags & line_buf::LFT_INTEGER) &&
             (g->flags & line_buf::LFT_INTEGER) &&
             (b->flags & line_buf::LFT_INTEGER));

      if (y->flags & line_buf::LFT_32BIT)
      {
        assert((y->flags & line_buf::LFT_32BIT) &&
               (cb->flags & line_buf::LFT_32BIT) &&
               (cr->flags & line_buf::LFT_32BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
        for (int i = (repeat + 7) >> 3; i > 0; --i)
        {
          __m256i mr = _mm256_load_si256((__m256i*)rp);
          __m256i mg = _mm256_load_si256((__m256i*)gp);
          __m256i mb = _mm256_load_si256((__m256i*)bp);
          __m256i t = _mm256_add_epi32(mr, mb);
          t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1));
          _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2));
          t = _mm256_sub_epi32(mb, mg);
          _mm256_store_si256((__m256i*)cbp, t);
          t = _mm256_sub_epi32(mr, mg);
          _mm256_store_si256((__m256i*)crp, t);

          rp += 8; gp += 8; bp += 8;
          yp += 8; cbp += 8; crp += 8;
        }
      }
      else
      {
        assert((y->flags & line_buf::LFT_64BIT) &&
               (cb->flags & line_buf::LFT_64BIT) &&
               (cr->flags & line_buf::LFT_64BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2));
        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
        for (int i = (repeat + 7) >> 3; i > 0; --i)
        {
          __m256i mr32 = _mm256_load_si256((__m256i*)rp);
          __m256i mg32 = _mm256_load_si256((__m256i*)gp);
          __m256i mb32 = _mm256_load_si256((__m256i*)bp);
          __m256i mr, mg, mb, t;
          mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0));
          mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0));
          mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0));

          t = _mm256_add_epi64(mr, mb);
          t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1));
          _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
          t = _mm256_sub_epi64(mb, mg);
          _mm256_store_si256((__m256i*)cbp, t);
          t = _mm256_sub_epi64(mr, mg);
          _mm256_store_si256((__m256i*)crp, t);

          yp += 4; cbp += 4; crp += 4;

          mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1));
          mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1));
          mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1));

          t = _mm256_add_epi64(mr, mb);
          t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1));
          _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
          t = _mm256_sub_epi64(mb, mg);
          _mm256_store_si256((__m256i*)cbp, t);
          t = _mm256_sub_epi64(mr, mg);
          _mm256_store_si256((__m256i*)crp, t);

          rp += 8; gp += 8; bp += 8;
          yp += 4; cbp += 4; crp += 4;
        }
      }
    }
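
    // Inverse reversible color transform:
    //   G = Y - ((Cb + Cr) >> 2),   B = Cb + G,   R = Cr + G
    // The 64-bit path narrows its results back to 32-bit lines using the
    // same shuffle/permute packing as avx2_rev_convert.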
    void avx2_rct_backward(const line_buf *y,
                           const line_buf *cb,
                           const line_buf *cr,
                           line_buf *r, line_buf *g, line_buf *b,
                           ui32 repeat)
    {
      assert((y->flags & line_buf::LFT_INTEGER) &&
             (cb->flags & line_buf::LFT_INTEGER) &&
             (cr->flags & line_buf::LFT_INTEGER) &&
             (r->flags & line_buf::LFT_INTEGER) &&
             (g->flags & line_buf::LFT_INTEGER) &&
             (b->flags & line_buf::LFT_INTEGER));

      if (y->flags & line_buf::LFT_32BIT)
      {
        assert((y->flags & line_buf::LFT_32BIT) &&
               (cb->flags & line_buf::LFT_32BIT) &&
               (cr->flags & line_buf::LFT_32BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        for (int i = (repeat + 7) >> 3; i > 0; --i)
        {
          __m256i my = _mm256_load_si256((__m256i*)yp);
          __m256i mcb = _mm256_load_si256((__m256i*)cbp);
          __m256i mcr = _mm256_load_si256((__m256i*)crp);

          __m256i t = _mm256_add_epi32(mcb, mcr);
          t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2));
          _mm256_store_si256((__m256i*)gp, t);
          __m256i u = _mm256_add_epi32(mcb, t);
          _mm256_store_si256((__m256i*)bp, u);
          u = _mm256_add_epi32(mcr, t);
          _mm256_store_si256((__m256i*)rp, u);

          yp += 8; cbp += 8; crp += 8;
          rp += 8; gp += 8; bp += 8;
        }
      }
      else
      {
        assert((y->flags & line_buf::LFT_64BIT) &&
               (cb->flags & line_buf::LFT_64BIT) &&
               (cr->flags & line_buf::LFT_64BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2));
        __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                             0, (si64)ULLONG_MAX);
        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        for (int i = (repeat + 7) >> 3; i > 0; --i)
        {
          __m256i my, mcb, mcr, tr, tg, tb;
          my = _mm256_load_si256((__m256i*)yp);
          mcb = _mm256_load_si256((__m256i*)cbp);
          mcr = _mm256_load_si256((__m256i*)crp);

          tg = _mm256_add_epi64(mcb, mcr);
          tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2));
          tb = _mm256_add_epi64(mcb, tg);
          tr = _mm256_add_epi64(mcr, tg);

          __m256i mr, mg, mb;
          mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
          mr = _mm256_and_si256(low_bits, mr);
          mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
          mg = _mm256_and_si256(low_bits, mg);
          mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
          mb = _mm256_and_si256(low_bits, mb);

          yp += 4; cbp += 4; crp += 4;

          my = _mm256_load_si256((__m256i*)yp);
          mcb = _mm256_load_si256((__m256i*)cbp);
          mcr = _mm256_load_si256((__m256i*)crp);

          tg = _mm256_add_epi64(mcb, mcr);
          tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2));
          tb = _mm256_add_epi64(mcb, tg);
          tr = _mm256_add_epi64(mcr, tg);

          tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
          tr = _mm256_andnot_si256(low_bits, tr);
          mr = _mm256_or_si256(mr, tr);
          mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0));

          tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
          tg = _mm256_andnot_si256(low_bits, tg);
          mg = _mm256_or_si256(mg, tg);
          mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0));

          tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
          tb = _mm256_andnot_si256(low_bits, tb);
          mb = _mm256_or_si256(mb, tb);
          mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0));

          _mm256_store_si256((__m256i*)rp, mr);
          _mm256_store_si256((__m256i*)gp, mg);
          _mm256_store_si256((__m256i*)bp, mb);

          yp += 4; cbp += 4; crp += 4;
          rp += 8; gp += 8; bp += 8;
        }
      }
    }

  }
}

#endif