OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_img_io_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_img_io_avx2.cpp
34// Author: Aous Naman
35// Date: 23 May 2022
36//***************************************************************************/
37
38#include "ojph_arch.h"
39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
40
41#include <cstdlib>
42#include <cstring>
43#include <immintrin.h>
44
45#include "ojph_file.h"
46#include "ojph_img_io.h"
47#include "ojph_mem.h"
48#include "ojph_message.h"
49
50namespace ojph {
51
53 static
54 ui16 be2le(const ui16 v)
55 {
56 return (ui16)((v<<8) | (v>>8));
57 }
58
60 void avx2_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1,
61 const line_buf *ln2, void *dp,
62 ui32 bit_depth, ui32 count)
63 {
64 ojph_unused(ln1);
65 ojph_unused(ln2);
66
67 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
68 __m256i zero = _mm256_setzero_si256();
69 __m256i mask = _mm256_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400,
70 0x0F0B07030E0A0602, 0x0D0905010C080400);
71 const si32 *sp = ln0->i32;
72 ui8* p = (ui8 *)dp;
73
74 // 32 bytes or entries in each loop
75 for ( ; count >= 32; count -= 32, sp += 32, p += 32)
76 {
77 __m256i a, t, u, v0, v1;
78 a = _mm256_load_si256((__m256i*)sp);
79 a = _mm256_max_epi32(a, zero);
80 t = _mm256_min_epi32(a, max_val_vec);
81
82 a = _mm256_load_si256((__m256i*)sp + 1);
83 a = _mm256_max_epi32(a, zero);
84 a = _mm256_min_epi32(a, max_val_vec);
85 a = _mm256_slli_epi32(a, 16);
86 t = _mm256_or_si256(t, a);
87
88 a = _mm256_load_si256((__m256i*)sp + 2);
89 a = _mm256_max_epi32(a, zero);
90 u = _mm256_min_epi32(a, max_val_vec);
91
92 a = _mm256_load_si256((__m256i*)sp + 3);
93 a = _mm256_max_epi32(a, zero);
94 a = _mm256_min_epi32(a, max_val_vec);
95 a = _mm256_slli_epi32(a, 16);
96 u = _mm256_or_si256(u, a);
97
98 v0 = _mm256_permute2x128_si256(t, u, 0x20);
99 v1 = _mm256_permute2x128_si256(t, u, 0x31);
100 v1 = _mm256_slli_epi32(v1, 8);
101 v0 = _mm256_or_si256(v0, v1);
102
103 v0 = _mm256_shuffle_epi8(v0, mask);
104 _mm256_storeu_si256((__m256i*)p, v0);
105 }
106
107 int max_val = (1 << bit_depth) - 1;
108 for ( ; count > 0; --count)
109 {
110 int val = *sp++;
111 val = val >= 0 ? val : 0;
112 val = val <= max_val ? val : max_val;
113 *p++ = (ui8)val;
114 }
115 }
116
118 void avx2_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1,
119 const line_buf *ln2, void *dp,
120 ui32 bit_depth, ui32 count)
121 {
122 int max_val = (1 << bit_depth) - 1;
123 __m256i max_val_vec = _mm256_set1_epi32(max_val);
124 __m256i zero = _mm256_setzero_si256();
125 __m256i m0 = _mm256_set_epi64x((si64)0xFFFFFFFF0E0D0C0A,
126 (si64)0x0908060504020100,
127 (si64)0xFFFFFFFF0E0D0C0A,
128 (si64)0x0908060504020100);
129
130 // 32 entries in each loop
131 const __m256i* sp0 = (__m256i*)ln0->i32;
132 const __m256i* sp1 = (__m256i*)ln1->i32;
133 const __m256i* sp2 = (__m256i*)ln2->i32;
134 ui8* p = (ui8*)dp;
135 for ( ; count >= 32; count -= 32, sp0 += 4, sp1 += 4, sp2 += 4, p += 96)
136 {
137 __m256i a, t, u, v, w;
138
139 a = _mm256_load_si256(sp0);
140 a = _mm256_max_epi32(a, zero);
141 t = _mm256_min_epi32(a, max_val_vec);
142
143 a = _mm256_load_si256(sp1);
144 a = _mm256_max_epi32(a, zero);
145 a = _mm256_min_epi32(a, max_val_vec);
146 a = _mm256_slli_epi32(a, 8);
147 t = _mm256_or_si256(t, a);
148
149 a = _mm256_load_si256(sp2);
150 a = _mm256_max_epi32(a, zero);
151 a = _mm256_min_epi32(a, max_val_vec);
152 a = _mm256_slli_epi32(a, 16);
153 t = _mm256_or_si256(t, a);
154 t = _mm256_shuffle_epi8(t, m0);
155
156
157 a = _mm256_load_si256(sp0 + 1);
158 a = _mm256_max_epi32(a, zero);
159 u = _mm256_min_epi32(a, max_val_vec);
160
161 a = _mm256_load_si256(sp1 + 1);
162 a = _mm256_max_epi32(a, zero);
163 a = _mm256_min_epi32(a, max_val_vec);
164 a = _mm256_slli_epi32(a, 8);
165 u = _mm256_or_si256(u, a);
166
167 a = _mm256_load_si256(sp2 + 1);
168 a = _mm256_max_epi32(a, zero);
169 a = _mm256_min_epi32(a, max_val_vec);
170 a = _mm256_slli_epi32(a, 16);
171 u = _mm256_or_si256(u, a);
172 u = _mm256_shuffle_epi8(u, m0);
173
174
175 a = _mm256_load_si256(sp0 + 2);
176 a = _mm256_max_epi32(a, zero);
177 v = _mm256_min_epi32(a, max_val_vec);
178
179 a = _mm256_load_si256(sp1 + 2);
180 a = _mm256_max_epi32(a, zero);
181 a = _mm256_min_epi32(a, max_val_vec);
182 a = _mm256_slli_epi32(a, 8);
183 v = _mm256_or_si256(v, a);
184
185 a = _mm256_load_si256(sp2 + 2);
186 a = _mm256_max_epi32(a, zero);
187 a = _mm256_min_epi32(a, max_val_vec);
188 a = _mm256_slli_epi32(a, 16);
189 v = _mm256_or_si256(v, a);
190 v = _mm256_shuffle_epi8(v, m0);
191
192
193 a = _mm256_load_si256(sp0 + 3);
194 a = _mm256_max_epi32(a, zero);
195 w = _mm256_min_epi32(a, max_val_vec);
196
197 a = _mm256_load_si256(sp1 + 3);
198 a = _mm256_max_epi32(a, zero);
199 a = _mm256_min_epi32(a, max_val_vec);
200 a = _mm256_slli_epi32(a, 8);
201 w = _mm256_or_si256(w, a);
202
203 a = _mm256_load_si256(sp2 + 3);
204 a = _mm256_max_epi32(a, zero);
205 a = _mm256_min_epi32(a, max_val_vec);
206 a = _mm256_slli_epi32(a, 16);
207 w = _mm256_or_si256(w, a);
208 w = _mm256_shuffle_epi8(w, m0);
209
210 _mm_storeu_si128((__m128i*)(p ), _mm256_castsi256_si128(t));
211 _mm_storeu_si128((__m128i*)(p + 12), _mm256_extracti128_si256(t,1));
212 _mm_storeu_si128((__m128i*)(p + 24), _mm256_castsi256_si128(u));
213 _mm_storeu_si128((__m128i*)(p + 36), _mm256_extracti128_si256(u,1));
214 _mm_storeu_si128((__m128i*)(p + 48), _mm256_castsi256_si128(v));
215 _mm_storeu_si128((__m128i*)(p + 60), _mm256_extracti128_si256(v,1));
216 _mm_storeu_si128((__m128i*)(p + 72), _mm256_castsi256_si128(w));
217#ifdef OJPH_ARCH_X86_64
218 *((si64*)(p + 84)) = _mm256_extract_epi64(w, 2);
219#elif (defined OJPH_ARCH_I386)
220 *((si32*)(p + 84)) = _mm256_extract_epi32(w, 4);
221 *((si32*)(p + 88)) = _mm256_extract_epi32(w, 5);
222#else
223 #error Error unsupport compiler
224#endif
225 *((si32*)(p + 92)) = _mm256_extract_epi32(w, 6);
226
227 // this is an alterative slower implementation
228 //__m256i tx, ux, vx, wx;
229 //tx = _mm256_permute2x128_si256(t, v, 0x20);
230 //ux = _mm256_permute2x128_si256(t, v, 0x31);
231 //vx = _mm256_permute2x128_si256(u, w, 0x20);
232 //wx = _mm256_permute2x128_si256(u, w, 0x31);
233
234 //tx = _mm256_or_si256(tx, _mm256_bslli_epi128(ux, 12));
235 //ux = _mm256_or_si256(_mm256_bsrli_epi128(ux, 4),
236 // _mm256_bslli_epi128(vx, 8));
237 //vx = _mm256_or_si256(_mm256_bsrli_epi128(vx, 8),
238 // _mm256_bslli_epi128(wx, 4));
239
240 //a = _mm256_permute2x128_si256(tx, ux, 0x20);
241 //_mm256_storeu_si256(p, a);
242 //a = _mm256_permute2x128_si256(vx, tx, 0x30);
243 //_mm256_storeu_si256(p + 1, a);
244 //a = _mm256_permute2x128_si256(ux, vx, 0x31);
245 //_mm256_storeu_si256(p + 2, a);
246 }
247
248 const si32* ssp0 = (si32*)sp0;
249 const si32* ssp1 = (si32*)sp1;
250 const si32* ssp2 = (si32*)sp2;
251 for ( ; count > 0; --count)
252 {
253 int val;
254 val = *ssp0++;
255 val = val >= 0 ? val : 0;
256 val = val <= max_val ? val : max_val;
257 *p++ = (ui8) val;
258 val = *ssp1++;
259 val = val >= 0 ? val : 0;
260 val = val <= max_val ? val : max_val;
261 *p++ = (ui8) val;
262 val = *ssp2++;
263 val = val >= 0 ? val : 0;
264 val = val <= max_val ? val : max_val;
265 *p++ = (ui8) val;
266 }
267 }
268
270 void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
271 const line_buf *ln2, void *dp,
272 ui32 bit_depth, ui32 count)
273 {
274 ojph_unused(ln1);
275 ojph_unused(ln2);
276
277 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
278 __m256i zero = _mm256_setzero_si256();
279 __m256i mask = _mm256_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100,
280 0x0F0E0B0A07060302, 0x0D0C090805040100);
281 const si32 *sp = ln0->i32;
282 ui16* p = (ui16 *)dp;
283
284 // 16 entries in each loop
285 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
286 {
287 __m256i a, t;
288 a = _mm256_load_si256((__m256i*)sp);
289 a = _mm256_max_epi32(a, zero);
290 t = _mm256_min_epi32(a, max_val_vec);
291
292 a = _mm256_load_si256((__m256i*)sp + 1);
293 a = _mm256_max_epi32(a, zero);
294 a = _mm256_min_epi32(a, max_val_vec);
295 a = _mm256_slli_epi32(a, 16);
296 t = _mm256_or_si256(t, a);
297
298 t = _mm256_shuffle_epi8(t, mask);
299 t = _mm256_permute4x64_epi64(t, 0xD8);
300 _mm256_storeu_si256((__m256i*)p, t);
301 }
302
303 int max_val = (1<<bit_depth) - 1;
304 for ( ; count > 0; --count)
305 {
306 int val = *sp++;
307 val = val >= 0 ? val : 0;
308 val = val <= max_val ? val : max_val;
309 *p++ = (ui16) val;
310 }
311 }
312
314 void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
315 const line_buf *ln2, void *dp,
316 ui32 bit_depth, ui32 count)
317 {
318 ojph_unused(ln1);
319 ojph_unused(ln2);
320
321 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
322 __m256i zero = _mm256_setzero_si256();
323 __m256i mask = _mm256_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001,
324 0x0E0F0A0B06070203, 0x0C0D080904050001);
325 const si32 *sp = ln0->i32;
326 ui16* p = (ui16 *)dp;
327
328 // 16 entries in each loop
329 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
330 {
331 __m256i a, t;
332 a = _mm256_load_si256((__m256i*)sp);
333 a = _mm256_max_epi32(a, zero);
334 t = _mm256_min_epi32(a, max_val_vec);
335
336 a = _mm256_load_si256((__m256i*)sp + 1);
337 a = _mm256_max_epi32(a, zero);
338 a = _mm256_min_epi32(a, max_val_vec);
339 a = _mm256_slli_epi32(a, 16);
340 t = _mm256_or_si256(t, a);
341
342 t = _mm256_shuffle_epi8(t, mask);
343 t = _mm256_permute4x64_epi64(t, 0xD8);
344 _mm256_storeu_si256((__m256i*)p, t);
345 }
346
347 int max_val = (1<<bit_depth) - 1;
348 for ( ; count > 0; --count)
349 {
350 int val = *sp++;
351 val = val >= 0 ? val : 0;
352 val = val <= max_val ? val : max_val;
353 *p++ = be2le((ui16) val);
354 }
355 }
356}
357
358#endif
int64_t si64
Definition ojph_defs.h:57
void avx2_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
uint16_t ui16
Definition ojph_defs.h:52
void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
void avx2_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
static ui16 be2le(const ui16 v)
void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, ui32 bit_depth, ui32 count)
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define ojph_unused(x)
Definition ojph_defs.h:78