OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_img_io_sse41.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_img_io_sse41.cpp
34// Author: Aous Naman
35// Date: 23 May 2022
36//***************************************************************************/
37
38#include "ojph_arch.h"
39#if defined(OJPH_ARCH_I386) \
40 || defined(OJPH_ARCH_X86_64) \
41 || defined(OJPH_ENABLE_WASM_SIMD)
42
43#include <cstdlib>
44#include <cstring>
45#include <immintrin.h>
46
47#include "ojph_file.h"
48#include "ojph_img_io.h"
49#include "ojph_mem.h"
50#include "ojph_message.h"
51
52namespace ojph {
53
55 static
56 ui16 be2le(const ui16 v)
57 {
58 return (ui16)((v<<8) | (v>>8));
59 }
60
62 void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1,
63 const line_buf *ln2, void *dp,
64 ui32 bit_depth, ui32 count)
65 {
66 ojph_unused(ln1);
67 ojph_unused(ln2);
68
69 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
70 __m128i zero = _mm_setzero_si128();
71 __m128i mask = _mm_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400);
72 const si32 *sp = ln0->i32;
73 ui8* p = (ui8 *)dp;
74
75 // 16 bytes or entries in each loop
76 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
77 {
78 __m128i a, t;
79 a = _mm_load_si128((__m128i*)sp);
80 a = _mm_max_epi32(a, zero);
81 t = _mm_min_epi32(a, max_val_vec);
82
83 a = _mm_load_si128((__m128i*)sp + 1);
84 a = _mm_max_epi32(a, zero);
85 a = _mm_min_epi32(a, max_val_vec);
86 a = _mm_slli_epi32(a, 8);
87 t = _mm_or_si128(t, a);
88
89 a = _mm_load_si128((__m128i*)sp + 2);
90 a = _mm_max_epi32(a, zero);
91 a = _mm_min_epi32(a, max_val_vec);
92 a = _mm_slli_epi32(a, 16);
93 t = _mm_or_si128(t, a);
94
95 a = _mm_load_si128((__m128i*)sp + 3);
96 a = _mm_max_epi32(a, zero);
97 a = _mm_min_epi32(a, max_val_vec);
98 a = _mm_slli_epi32(a, 24);
99 t = _mm_or_si128(t, a);
100
101 t = _mm_shuffle_epi8(t, mask);
102 _mm_storeu_si128((__m128i*)p, t);
103 }
104
105 int max_val = (1 << bit_depth) - 1;
106 for ( ; count > 0; --count)
107 {
108 int val = *sp++;
109 val = val >= 0 ? val : 0;
110 val = val <= max_val ? val : max_val;
111 *p++ = (ui8)val;
112 }
113 }
114
116 void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1,
117 const line_buf *ln2, void *dp,
118 ui32 bit_depth, ui32 count)
119 {
120 const si32 *sp0 = ln0->i32;
121 const si32 *sp1 = ln1->i32;
122 const si32 *sp2 = ln2->i32;
123 ui8* p = (ui8 *)dp;
124
125 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
126 __m128i zero = _mm_setzero_si128();
127 __m128i m0 = _mm_set_epi64x((si64)0xFFFFFFFF0E0D0C0A,
128 (si64)0x0908060504020100);
129
130 // 16 entries in each loop
131 for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
132 {
133 __m128i a, t, u, v, w;
134 a = _mm_load_si128((__m128i*)sp0);
135 a = _mm_max_epi32(a, zero);
136 t = _mm_min_epi32(a, max_val_vec);
137
138 a = _mm_load_si128((__m128i*)sp1);
139 a = _mm_max_epi32(a, zero);
140 a = _mm_min_epi32(a, max_val_vec);
141 a = _mm_slli_epi32(a, 8);
142 t = _mm_or_si128(t, a);
143
144 a = _mm_load_si128((__m128i*)sp2);
145 a = _mm_max_epi32(a, zero);
146 a = _mm_min_epi32(a, max_val_vec);
147 a = _mm_slli_epi32(a, 16);
148 t = _mm_or_si128(t, a);
149 t = _mm_shuffle_epi8(t, m0);
150
151 a = _mm_load_si128((__m128i*)sp0 + 1);
152 a = _mm_max_epi32(a, zero);
153 u = _mm_min_epi32(a, max_val_vec);
154
155 a = _mm_load_si128((__m128i*)sp1 + 1);
156 a = _mm_max_epi32(a, zero);
157 a = _mm_min_epi32(a, max_val_vec);
158 a = _mm_slli_epi32(a, 8);
159 u = _mm_or_si128(u, a);
160
161 a = _mm_load_si128((__m128i*)sp2 + 1);
162 a = _mm_max_epi32(a, zero);
163 a = _mm_min_epi32(a, max_val_vec);
164 a = _mm_slli_epi32(a, 16);
165 u = _mm_or_si128(u, a);
166 u = _mm_shuffle_epi8(u, m0);
167
168 a = _mm_load_si128((__m128i*)sp0 + 2);
169 a = _mm_max_epi32(a, zero);
170 v = _mm_min_epi32(a, max_val_vec);
171
172 a = _mm_load_si128((__m128i*)sp1 + 2);
173 a = _mm_max_epi32(a, zero);
174 a = _mm_min_epi32(a, max_val_vec);
175 a = _mm_slli_epi32(a, 8);
176 v = _mm_or_si128(v, a);
177
178 a = _mm_load_si128((__m128i*)sp2 + 2);
179 a = _mm_max_epi32(a, zero);
180 a = _mm_min_epi32(a, max_val_vec);
181 a = _mm_slli_epi32(a, 16);
182 v = _mm_or_si128(v, a);
183 v = _mm_shuffle_epi8(v, m0);
184
185 a = _mm_load_si128((__m128i*)sp0 + 3);
186 a = _mm_max_epi32(a, zero);
187 w = _mm_min_epi32(a, max_val_vec);
188
189 a = _mm_load_si128((__m128i*)sp1 + 3);
190 a = _mm_max_epi32(a, zero);
191 a = _mm_min_epi32(a, max_val_vec);
192 a = _mm_slli_epi32(a, 8);
193 w = _mm_or_si128(w, a);
194
195 a = _mm_load_si128((__m128i*)sp2 + 3);
196 a = _mm_max_epi32(a, zero);
197 a = _mm_min_epi32(a, max_val_vec);
198 a = _mm_slli_epi32(a, 16);
199 w = _mm_or_si128(w, a);
200 w = _mm_shuffle_epi8(w, m0);
201
202 t = _mm_or_si128(t, _mm_bslli_si128(u, 12));
203 u = _mm_or_si128(_mm_bsrli_si128(u, 4), _mm_bslli_si128(v, 8));
204 v = _mm_or_si128(_mm_bsrli_si128(v, 8), _mm_bslli_si128(w, 4));
205
206 _mm_storeu_si128((__m128i*)p + 0, t);
207 _mm_storeu_si128((__m128i*)p + 1, u);
208 _mm_storeu_si128((__m128i*)p + 2, v);
209 }
210
211 int max_val = (1<<bit_depth) - 1;
212 for ( ; count > 0; --count)
213 {
214 int val;
215 val = *sp0++;
216 val = val >= 0 ? val : 0;
217 val = val <= max_val ? val : max_val;
218 *p++ = (ui8) val;
219 val = *sp1++;
220 val = val >= 0 ? val : 0;
221 val = val <= max_val ? val : max_val;
222 *p++ = (ui8) val;
223 val = *sp2++;
224 val = val >= 0 ? val : 0;
225 val = val <= max_val ? val : max_val;
226 *p++ = (ui8) val;
227 }
228 }
229
231 void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
232 const line_buf *ln2, void *dp,
233 ui32 bit_depth, ui32 count)
234 {
235 ojph_unused(ln1);
236 ojph_unused(ln2);
237
238 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
239 __m128i zero = _mm_setzero_si128();
240 __m128i mask = _mm_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100);
241 const si32 *sp = ln0->i32;
242 ui16* p = (ui16 *)dp;
243
244 // 8 entries in each loop
245 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
246 {
247 __m128i a, t;
248 a = _mm_load_si128((__m128i*)sp);
249 a = _mm_max_epi32(a, zero);
250 t = _mm_min_epi32(a, max_val_vec);
251
252 a = _mm_load_si128((__m128i*)sp + 1);
253 a = _mm_max_epi32(a, zero);
254 a = _mm_min_epi32(a, max_val_vec);
255 a = _mm_slli_epi32(a, 16);
256 t = _mm_or_si128(t, a);
257
258 t = _mm_shuffle_epi8(t, mask);
259 _mm_storeu_si128((__m128i*)p, t);
260 }
261
262 int max_val = (1<<bit_depth) - 1;
263 for ( ; count > 0; --count)
264 {
265 int val = *sp++;
266 val = val >= 0 ? val : 0;
267 val = val <= max_val ? val : max_val;
268 *p++ = (ui16) val;
269 }
270 }
271
273 void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1,
274 const line_buf *ln2, void *dp,
275 ui32 bit_depth, ui32 count)
276 {
277 const si32 *sp0 = ln0->i32;
278 const si32 *sp1 = ln1->i32;
279 const si32 *sp2 = ln2->i32;
280 ui16* p = (ui16*)dp;
281
282 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
283 __m128i zero = _mm_setzero_si128();
284
285 __m128i m0 = _mm_set_epi64x((si64)0x0B0A0908FFFF0706,
286 (si64)0x0504FFFF03020100);
287 __m128i m1 = _mm_set_epi64x((si64)0xFFFFFFFF0504FFFF,
288 (si64)0xFFFF0100FFFFFFFF);
289 __m128i m2 = _mm_set_epi64x((si64)0xFFFFFFFFFFFFFFFF,
290 (si64)0xFFFF0F0E0D0CFFFF);
291 __m128i m3 = _mm_set_epi64x((si64)0x0706FFFFFFFF0302,
292 (si64)0x0D0CFFFFFFFF0908);
293 __m128i m4 = _mm_set_epi64x((si64)0xFFFF03020100FFFF,
294 (si64)0xFFFFFFFFFFFFFFFF);
295 __m128i m5 = _mm_set_epi64x((si64)0xFFFFFFFF0F0EFFFF,
296 (si64)0xFFFF0B0AFFFFFFFF);
297 __m128i m6 = _mm_set_epi64x((si64)0x0F0E0D0CFFFF0B0A,
298 (si64)0x0908FFFF07060504);
299
300 // 24 entries in each loop
301 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
302 {
303 __m128i a, b, t, u, v;
304 a = _mm_load_si128((__m128i*)sp0);
305 a = _mm_max_epi32(a, zero);
306 t = _mm_min_epi32(a, max_val_vec);
307
308 a = _mm_load_si128((__m128i*)sp1);
309 a = _mm_max_epi32(a, zero);
310 a = _mm_min_epi32(a, max_val_vec);
311 a = _mm_slli_epi32(a, 16);
312 t = _mm_or_si128(t, a);
313
314 a = _mm_load_si128((__m128i*)sp2);
315 a = _mm_max_epi32(a, zero);
316 u = _mm_min_epi32(a, max_val_vec);
317
318 a = _mm_load_si128((__m128i*)sp0 + 1);
319 a = _mm_max_epi32(a, zero);
320 a = _mm_min_epi32(a, max_val_vec);
321 a = _mm_slli_epi32(a, 16);
322 u = _mm_or_si128(u, a);
323
324 a = _mm_load_si128((__m128i*)sp1 + 1);
325 a = _mm_max_epi32(a, zero);
326 v = _mm_min_epi32(a, max_val_vec);
327
328 a = _mm_load_si128((__m128i*)sp2 + 1);
329 a = _mm_max_epi32(a, zero);
330 a = _mm_min_epi32(a, max_val_vec);
331 a = _mm_slli_epi32(a, 16);
332 v = _mm_or_si128(v, a);
333
334 a = _mm_shuffle_epi8(t, m0);
335 b = _mm_shuffle_epi8(u, m1);
336 a = _mm_or_si128(a, b);
337 _mm_storeu_si128((__m128i*)p, a);
338
339 a = _mm_shuffle_epi8(t, m2);
340 b = _mm_shuffle_epi8(u, m3);
341 a = _mm_or_si128(a, b);
342 b = _mm_shuffle_epi8(v, m4);
343 a = _mm_or_si128(a, b);
344 _mm_storeu_si128((__m128i*)p + 1, a);
345
346 a = _mm_shuffle_epi8(u, m5);
347 b = _mm_shuffle_epi8(v, m6);
348 a = _mm_or_si128(a, b);
349 _mm_storeu_si128((__m128i*)p + 2, a);
350 }
351
352 int max_val = (1<<bit_depth) - 1;
353 for ( ; count > 0; --count)
354 {
355 int val;
356 val = *sp0++;
357 val = val >= 0 ? val : 0;
358 val = val <= max_val ? val : max_val;
359 *p++ = be2le((ui16) val);
360 val = *sp1++;
361 val = val >= 0 ? val : 0;
362 val = val <= max_val ? val : max_val;
363 *p++ = be2le((ui16) val);
364 val = *sp2++;
365 val = val >= 0 ? val : 0;
366 val = val <= max_val ? val : max_val;
367 *p++ = (ui16) val;
368 }
369 }
370
372 void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
373 const line_buf *ln2, void *dp,
374 ui32 bit_depth, ui32 count)
375 {
376 ojph_unused(ln1);
377 ojph_unused(ln2);
378
379 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
380 __m128i zero = _mm_setzero_si128();
381 __m128i mask = _mm_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001);
382 const si32 *sp = ln0->i32;
383 ui16* p = (ui16 *)dp;
384
385 // 8 entries in each loop
386 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
387 {
388 __m128i a, t;
389 a = _mm_load_si128((__m128i*)sp);
390 a = _mm_max_epi32(a, zero);
391 t = _mm_min_epi32(a, max_val_vec);
392
393 a = _mm_load_si128((__m128i*)sp + 1);
394 a = _mm_max_epi32(a, zero);
395 a = _mm_min_epi32(a, max_val_vec);
396 a = _mm_slli_epi32(a, 16);
397 t = _mm_or_si128(t, a);
398
399 t = _mm_shuffle_epi8(t, mask);
400 _mm_storeu_si128((__m128i*)p, t);
401 }
402
403 int max_val = (1<<bit_depth) - 1;
404 for ( ; count > 0; --count)
405 {
406 int val = *sp++;
407 val = val >= 0 ? val : 0;
408 val = val <= max_val ? val : max_val;
409 *p++ = be2le((ui16) val);
410 }
411 }
412
414 void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1,
415 const line_buf *ln2, void *dp,
416 ui32 bit_depth, ui32 count)
417 {
418 const si32 *sp0 = ln0->i32;
419 const si32 *sp1 = ln1->i32;
420 const si32 *sp2 = ln2->i32;
421 ui16* p = (ui16*)dp;
422
423 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
424 __m128i zero = _mm_setzero_si128();
425
426 __m128i m0 = _mm_set_epi64x((si64)0x0A0B0809FFFF0607,
427 (si64)0x0405FFFF02030001);
428 __m128i m1 = _mm_set_epi64x((si64)0xFFFFFFFF0405FFFF,
429 (si64)0xFFFF0001FFFFFFFF);
430 __m128i m2 = _mm_set_epi64x((si64)0xFFFFFFFFFFFFFFFF,
431 (si64)0xFFFF0E0F0C0DFFFF);
432 __m128i m3 = _mm_set_epi64x((si64)0x0607FFFFFFFF0203,
433 (si64)0x0C0DFFFFFFFF0809);
434 __m128i m4 = _mm_set_epi64x((si64)0xFFFF02030001FFFF,
435 (si64)0xFFFFFFFFFFFFFFFF);
436 __m128i m5 = _mm_set_epi64x((si64)0xFFFFFFFF0E0FFFFF,
437 (si64)0xFFFF0A0BFFFFFFFF);
438 __m128i m6 = _mm_set_epi64x((si64)0x0E0F0C0DFFFF0A0B,
439 (si64)0x0809FFFF06070405);
440
441 // 24 entries in each loop
442 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
443 {
444 __m128i a, b, t, u, v;
445 a = _mm_load_si128((__m128i*)sp0);
446 a = _mm_max_epi32(a, zero);
447 t = _mm_min_epi32(a, max_val_vec);
448
449 a = _mm_load_si128((__m128i*)sp1);
450 a = _mm_max_epi32(a, zero);
451 a = _mm_min_epi32(a, max_val_vec);
452 a = _mm_slli_epi32(a, 16);
453 t = _mm_or_si128(t, a);
454
455 a = _mm_load_si128((__m128i*)sp2);
456 a = _mm_max_epi32(a, zero);
457 u = _mm_min_epi32(a, max_val_vec);
458
459 a = _mm_load_si128((__m128i*)sp0 + 1);
460 a = _mm_max_epi32(a, zero);
461 a = _mm_min_epi32(a, max_val_vec);
462 a = _mm_slli_epi32(a, 16);
463 u = _mm_or_si128(u, a);
464
465 a = _mm_load_si128((__m128i*)sp1 + 1);
466 a = _mm_max_epi32(a, zero);
467 v = _mm_min_epi32(a, max_val_vec);
468
469 a = _mm_load_si128((__m128i*)sp2 + 1);
470 a = _mm_max_epi32(a, zero);
471 a = _mm_min_epi32(a, max_val_vec);
472 a = _mm_slli_epi32(a, 16);
473 v = _mm_or_si128(v, a);
474
475 a = _mm_shuffle_epi8(t, m0);
476 b = _mm_shuffle_epi8(u, m1);
477 a = _mm_or_si128(a, b);
478 _mm_storeu_si128((__m128i*)p, a);
479
480 a = _mm_shuffle_epi8(t, m2);
481 b = _mm_shuffle_epi8(u, m3);
482 a = _mm_or_si128(a, b);
483 b = _mm_shuffle_epi8(v, m4);
484 a = _mm_or_si128(a, b);
485 _mm_storeu_si128((__m128i*)p + 1, a);
486
487 a = _mm_shuffle_epi8(u, m5);
488 b = _mm_shuffle_epi8(v, m6);
489 a = _mm_or_si128(a, b);
490 _mm_storeu_si128((__m128i*)p + 2, a);
491 }
492
493 int max_val = (1<<bit_depth) - 1;
494 for ( ; count > 0; --count)
495 {
496 int val;
497 val = *sp0++;
498 val = val >= 0 ? val : 0;
499 val = val <= max_val ? val : max_val;
500 *p++ = be2le((ui16) val);
501 val = *sp1++;
502 val = val >= 0 ? val : 0;
503 val = val <= max_val ? val : max_val;
504 *p++ = be2le((ui16) val);
505 val = *sp2++;
506 val = val >= 0 ? val : 0;
507 val = val <= max_val ? val : max_val;
508 *p++ = be2le((ui16) val);
509 }
510 }
511}
512
513#endif
void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
int64_t si64
Definition ojph_defs.h:57
uint16_t ui16
Definition ojph_defs.h:52
void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
static ui16 be2le(const ui16 v)
void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
int32_t si32
Definition ojph_defs.h:55
void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define ojph_unused(x)
Definition ojph_defs.h:78