OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_encoder_avx512.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2023, Intel Corporation
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//***************************************************************************/
33// This file is part of the OpenJPH software implementation.
34// File: ojph_block_encoder_avx512.cpp
35//***************************************************************************/
36
37#include "ojph_arch.h"
38#if defined(OJPH_ARCH_X86_64)
39
40#include <cassert>
41#include <cstring>
42#include <cstdint>
43#include <climits>
44#include <immintrin.h>
45
46#include "ojph_mem.h"
47#include "ojph_block_encoder.h"
48#include "ojph_message.h"
49
// Branch-prediction hints.  When OJPH_COMPILER_MSVC is defined the hints
// expand to the bare expression; otherwise they forward to __builtin_expect
// with the expected outcome (1 for likely, 0 for unlikely).
#if !defined(OJPH_COMPILER_MSVC)
  #define likely(x) __builtin_expect((x), 1)
  #define unlikely(x) __builtin_expect((x), 0)
#else
  #define likely(x) (x)
  #define unlikely(x) (x)
#endif
57
58namespace ojph {
59 namespace local {
60
62 // tables
64
65 // VLC encoding lookup tables, filled in by vlc_init_tables()
66 // index is (c_q << 8) + (rho << 4) + eps
67 // data is (cwd << 8) + (cwd_len << 4) + e_k
// (the low 4 bits hold the e_k field of the selected source-table entry;
//  entries for impossible index combinations are stored as 0)
68 // table 0 is for the initial line of quads
69 static ui32 vlc_tbl0[2048];
70 static ui32 vlc_tbl1[2048];
71
72 // UVLC encoding: prefix and suffix codewords with their bit lengths,
// indexed by the value being coded; filled in by uvlc_init_tables()
73 static ui32 ulvc_cwd_pre[33];
74 static int ulvc_cwd_pre_len[33];
75 static ui32 ulvc_cwd_suf[33];
76 static int ulvc_cwd_suf_len[33];
77
79 static bool vlc_init_tables()
80 {
81 struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
82 vlc_src_table tbl0[] = {
83 #include "table0.h"
84 };
85 size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);
86
87 si32 pattern_popcnt[16];
88 for (ui32 i = 0; i < 16; ++i)
89 pattern_popcnt[i] = (si32)population_count(i);
90
91 vlc_src_table* src_tbl = tbl0;
92 ui32 *tgt_tbl = vlc_tbl0;
93 size_t tbl_size = tbl0_size;
94 for (int i = 0; i < 2048; ++i)
95 {
96 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
97 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
98 tgt_tbl[i] = 0;
99 else
100 {
101 vlc_src_table *best_entry = NULL;
102 if (emb) // u_off = 1
103 {
104 int best_e_k = -1;
105 for (size_t j = 0; j < tbl_size; ++j)
106 {
107 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
108 if (src_tbl[j].u_off == 1)
109 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
110 {
111 //now we need to find the smallest cwd with the highest
112 // number of bits set in e_k
113 int ones_count = pattern_popcnt[src_tbl[j].e_k];
114 if (ones_count >= best_e_k)
115 {
116 best_entry = src_tbl + j;
117 best_e_k = ones_count;
118 }
119 }
120 }
121 }
122 else // u_off = 0
123 {
124 for (size_t j = 0; j < tbl_size; ++j)
125 {
126 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
127 if (src_tbl[j].u_off == 0)
128 {
129 best_entry = src_tbl + j;
130 break;
131 }
132 }
133 }
134 assert(best_entry);
135 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
136 + best_entry->e_k);
137 }
138 }
139
140 vlc_src_table tbl1[] = {
141 #include "table1.h"
142 };
143 size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);
144
145 src_tbl = tbl1;
146 tgt_tbl = vlc_tbl1;
147 tbl_size = tbl1_size;
148 for (int i = 0; i < 2048; ++i)
149 {
150 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
151 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
152 tgt_tbl[i] = 0;
153 else
154 {
155 vlc_src_table *best_entry = NULL;
156 if (emb) // u_off = 1
157 {
158 int best_e_k = -1;
159 for (size_t j = 0; j < tbl_size; ++j)
160 {
161 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
162 if (src_tbl[j].u_off == 1)
163 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
164 {
165 //now we need to find the smallest cwd with the highest
166 // number of bits set in e_k
167 int ones_count = pattern_popcnt[src_tbl[j].e_k];
168 if (ones_count >= best_e_k)
169 {
170 best_entry = src_tbl + j;
171 best_e_k = ones_count;
172 }
173 }
174 }
175 }
176 else // u_off = 0
177 {
178 for (size_t j = 0; j < tbl_size; ++j)
179 {
180 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
181 if (src_tbl[j].u_off == 0)
182 {
183 best_entry = src_tbl + j;
184 break;
185 }
186 }
187 }
188 assert(best_entry);
189 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
190 + best_entry->e_k);
191 }
192 }
193
194
195 return true;
196 }
197
199 static bool uvlc_init_tables()
200 {
201 //code goes from 0 to 31, extension and 32 are not supported here
202 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
203 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
204 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
205 ulvc_cwd_pre_len[2] = 2;
206 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
207 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
208 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
209 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
210 ulvc_cwd_suf_len[2] = 0;
211 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
212 for (int i = 5; i < 33; ++i)
213 {
214 ulvc_cwd_pre[i] = 0;
215 ulvc_cwd_pre_len[i] = 3;
216 ulvc_cwd_suf[i] = (ui32)(i-5);
217 ulvc_cwd_suf_len[i] = 5;
218 }
219 return true;
220 }
221
// Guard flag; it is tested before the VLC tables are zeroed below and is the
// value returned to the caller.
223 static bool tables_initialized = false;
224
// NOTE(review): this listing jumps from line 224 to 227 and from 229 to 232,
// so the enclosing function's signature and whatever assigns
// tables_initialized are not visible here -- presumably calls to
// vlc_init_tables() and uvlc_init_tables(); confirm against the real source.
227 if (!tables_initialized) {
// start from all-zero VLC tables
228 memset(vlc_tbl0, 0, 2048 * sizeof(ui32));
229 memset(vlc_tbl1, 0, 2048 * sizeof(ui32));
232 }
233 return tables_initialized;
234 }
235
237 //
// State of the MEL bit-stream writer; set up by mel_init() and advanced by
// mel_emit_bit() / mel_encode().
239 struct mel_struct {
240 //storage
241 ui8* buf; //pointer to data buffer
242 ui32 pos; //position of next writing within buf
243 ui32 buf_size; //size of buffer, which we must not exceed
244
245 // all these can be replaced by bytes
246 int remaining_bits; //number of empty bits in tmp (8, or 7 after a 0xFF byte)
247 int tmp; //temporary storage of coded bits; new bits enter at the LSB
248 int run; //number of 0 run
249 int k; //state; index into mel_exp[] in mel_encode(), kept within 0..12
250 int threshold; //threshold where one bit must be coded; 1 << mel_exp[k]
251 };
252
254 static inline void
255 mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
256 {
257 melp->buf = data;
258 melp->pos = 0;
259 melp->buf_size = buffer_size;
260 melp->remaining_bits = 8;
261 melp->tmp = 0;
262 melp->run = 0;
263 melp->k = 0;
264 melp->threshold = 1; // this is 1 << mel_exp[melp->k];
265 }
266
268 static inline void
269 mel_emit_bit(mel_struct* melp, int v)
270 {
271 melp->tmp = (melp->tmp << 1) + v;
272 melp->remaining_bits--;
273 if (melp->remaining_bits == 0) {
274 melp->buf[melp->pos++] = (ui8)melp->tmp;
275 melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8);
276 melp->tmp = 0;
277 }
278 }
279
281 static inline void
282 mel_encode(mel_struct* melp, bool bit)
283 {
284 //MEL exponent
285 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
286
287 if (bit == false) {
288 ++melp->run;
289 if (melp->run >= melp->threshold) {
290 mel_emit_bit(melp, 1);
291 melp->run = 0;
292 melp->k = ojph_min(12, melp->k + 1);
293 melp->threshold = 1 << mel_exp[melp->k];
294 }
295 } else {
296 mel_emit_bit(melp, 0);
297 int t = mel_exp[melp->k];
298 while (t > 0) {
299 mel_emit_bit(melp, (melp->run >> --t) & 1);
300 }
301 melp->run = 0;
302 melp->k = ojph_max(0, melp->k - 1);
303 melp->threshold = 1 << mel_exp[melp->k];
304 }
305 }
306
308 //
// State of the VLC bit-stream writer.  vlc_init() points buf at the LAST byte
// of the region and bytes are stored at (buf - pos), so the stream grows
// towards lower addresses.
310 struct vlc_struct_avx512 {
311 //storage
312 ui8* buf; //pointer to data buffer (its last byte, see vlc_init)
313 ui32 pos; //position of next writing within buf, counted backwards from buf
314 ui32 buf_size; //size of buffer, which we must not exceed
315
316 int used_bits; //number of occupied bits in tmp
317 ui64 tmp; //temporary storage of coded bits; new codewords enter above used_bits
318 bool last_greater_than_8F; //true if last byte is greater than 0x8F
319 };
320
322 static inline void
323 vlc_init(vlc_struct_avx512* vlcp, ui32 buffer_size, ui8* data)
324 {
325 vlcp->buf = data + buffer_size - 1; //points to last byte
326 vlcp->pos = 1; //locations will be all -pos
327 vlcp->buf_size = buffer_size;
328
329 vlcp->buf[0] = 0xFF;
330 vlcp->used_bits = 4;
331 vlcp->tmp = 0xF;
332 vlcp->last_greater_than_8F = true;
333 }
334
336 static inline void
337 vlc_encode(vlc_struct_avx512* vlcp, ui32 cwd, int cwd_len)
338 {
339 vlcp->tmp |= (ui64)cwd << vlcp->used_bits;
340 vlcp->used_bits += cwd_len;
341
342 while (vlcp->used_bits >= 8) {
343 ui8 tmp;
344
345 if (unlikely(vlcp->last_greater_than_8F)) {
346 tmp = vlcp->tmp & 0x7F;
347
348 if (likely(tmp != 0x7F)) {
349 tmp = vlcp->tmp & 0xFF;
350 *(vlcp->buf - vlcp->pos) = tmp;
351 vlcp->last_greater_than_8F = tmp > 0x8F;
352 vlcp->tmp >>= 8;
353 vlcp->used_bits -= 8;
354 } else {
355 *(vlcp->buf - vlcp->pos) = tmp;
356 vlcp->last_greater_than_8F = false;
357 vlcp->tmp >>= 7;
358 vlcp->used_bits -= 7;
359 }
360
361 } else {
362 tmp = vlcp->tmp & 0xFF;
363 *(vlcp->buf - vlcp->pos) = tmp;
364 vlcp->last_greater_than_8F = tmp > 0x8F;
365 vlcp->tmp >>= 8;
366 vlcp->used_bits -= 8;
367 }
368
369 vlcp->pos++;
370 }
371 }
372
374 //
// Flushes the pending bits of the MEL writer and of the VLC writer.  When the
// two partial bytes can share one byte without disturbing each other's bits,
// a single fused byte is written to the MEL buffer; otherwise each stream
// writes its own final byte.
376 static inline void
377 terminate_mel_vlc(mel_struct* melp, vlc_struct_avx512* vlcp)
378 {
// an unfinished run is closed with a single 1 bit
379 if (melp->run > 0)
380 mel_emit_bit(melp, 1);
381
// same 7-bit rule as in vlc_encode(): after a byte greater than 0x8F, seven
// pending 1 bits are written on their own as 0x7f
382 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
383 *(vlcp->buf - vlcp->pos) = 0x7f;
384 vlcp->pos++;
385 vlcp->tmp >>= 7;
386 vlcp->used_bits -= 7;
387 }
388
// left-align the partial MEL byte; mel_mask marks the (upper) bits that MEL
// occupies and vlc_mask the (lower) bits that VLC occupies in a final byte
389 melp->tmp = melp->tmp << melp->remaining_bits;
390 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
391 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
// nothing pending in either stream
392 if ((mel_mask | vlc_mask) == 0)
393 return; //last mel byte cannot be 0xFF, since then
394 //melp->remaining_bits would be < 8
395 if (melp->pos >= melp->buf_size)
396 OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
397 ui8 vlcp_tmp = (ui8)vlcp->tmp;
398 int fuse = melp->tmp | vlcp_tmp;
// fuse only if OR-ing leaves each stream's own bits unchanged, the result is
// not 0xFF, and the VLC writer has advanced past its initial position (pos is
// set to 1 by vlc_init)
399 if ( ( ((fuse ^ melp->tmp) & mel_mask)
400 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
401 && (fuse != 0xFF) && vlcp->pos > 1)
402 {
403 melp->buf[melp->pos++] = (ui8)fuse;
404 }
405 else
406 {
// no sharing possible: each stream stores its own last byte
407 if (vlcp->pos >= vlcp->buf_size)
408 OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
409 melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
410 *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
411 vlcp->pos++;
412 }
413 }
414
416//
// State of the magnitude-sign (MS) bit-stream writer; set up by ms_init()
// and advanced by ms_encode().
418 struct ms_struct {
419 //storage
420 ui8* buf; //pointer to data buffer
421 ui32 pos; //position of next writing within buf
422 ui32 buf_size; //size of buffer, which we must not exceed
423
424 int max_bits; //maximum number of bits that can be stored in tmp (8, or 7 after a 0xFF byte)
425 int used_bits; //number of occupied bits in tmp
426 ui32 tmp; //temporary storage of coded bits; new bits enter above used_bits
427 };
428
430 static inline void
431 ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
432 {
433 msp->buf = data;
434 msp->pos = 0;
435 msp->buf_size = buffer_size;
436 msp->max_bits = 8;
437 msp->used_bits = 0;
438 msp->tmp = 0;
439 }
440
442 static inline void
443 ms_encode(ms_struct* msp, ui64 cwd, int cwd_len)
444 {
445 while (cwd_len > 0)
446 {
447 if (msp->pos >= msp->buf_size)
448 OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
449 int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len);
450 msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
451 msp->used_bits += t;
452 cwd >>= t;
453 cwd_len -= t;
454 if (msp->used_bits >= msp->max_bits)
455 {
456 msp->buf[msp->pos++] = (ui8)msp->tmp;
457 msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
458 msp->tmp = 0;
459 msp->used_bits = 0;
460 }
461 }
462 }
463
// Finishes the magnitude-sign stream: pads a partially filled byte with 1
// bits and stores it unless the padded byte is 0xFF; when no bits are
// pending and the accumulator is in 7-bit mode (which ms_encode enters after
// writing a 0xFF byte), the last written byte is dropped by stepping pos
// back.
// NOTE(review): the declarator line (listing line 466) is missing from this
// listing; the body uses a pointer named msp, presumably an ms_struct* --
// confirm the name and signature against the real source.
465 static inline void
467 {
468 if (msp->used_bits)
469 {
470 int t = msp->max_bits - msp->used_bits; //unused bits
// fill the unused bits with ones
471 msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
472 msp->used_bits += t;
473 if (msp->tmp != 0xFF)
474 {
475 if (msp->pos >= msp->buf_size)
476 OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
477 msp->buf[msp->pos++] = (ui8)msp->tmp;
478 }
479 }
480 else if (msp->max_bits == 7)
481 msp->pos--;
482 }
483
// Shorthand vector constants: ZERO has every 32-bit lane equal to 0 and ONE
// has every 32-bit lane equal to 1.  Being macros, each use expands to its
// own intrinsic call.
484#define ZERO _mm512_setzero_epi32()
485#define ONE _mm512_set1_epi32(1)
486
#if 0
// Debug helper (compiled out): prints `msg` followed by the sixteen 32-bit
// lanes of `val` in hexadecimal.
static void print_epi32(const char *msg, __m512i &val)
{
  uint32_t A[16] = {0};

  // A is a plain stack array with no 64-byte alignment guarantee, so the
  // unaligned store must be used; the aligned store may fault here.
  _mm512_storeu_si512(A, val);

  printf("%s: ", msg);
  for (int i = 0; i < 16; ++i) {
    printf("%X ", A[i]);
  }
  printf("\n");
}
#endif
501
// Computes the per-sample quantities for a strip of 2 rows x 32 columns.
//
// src_vec[0..3] hold the input samples; per the layout comment further down,
// src_vec[0]/[2] are row 0 (columns 0-15 / 16-31) and src_vec[1]/[3] are
// row 1.  p is the right-shift applied after doubling each sample.
//
// Outputs:
//   eq_vec[0..3]  exponent values (32 - lzcnt of 2*mu_p - 1, 0 for samples
//                 whose shifted value is zero), regrouped by the permutation
//                 at the end of this function
//   s_vec[0..3]   2*(mu_p - 1) + sign bit per nonzero sample, same regrouping
//   rho_vec       per-lane 4-bit pattern; bit i is set when the sample taken
//                 from regrouped vector i is nonzero
//   e_qmax_vec    per-lane maximum over the four regrouped eq vectors
502static void proc_pixel(__m512i *src_vec, ui32 p,
503 __m512i *eq_vec, __m512i *s_vec,
504 __m512i &rho_vec, __m512i &e_qmax_vec)
505{
506 __m512i val_vec[4];
507 __m512i _eq_vec[4];
508 __m512i _s_vec[4];
509 __m512i _rho_vec[4];
510 ui16 val_mask[4];
511
512 for (ui32 i = 0; i < 4; ++i) {
513 /* val = t + t; //multiply by 2 and get rid of sign */
514 val_vec[i] = _mm512_add_epi32(src_vec[i], src_vec[i]);
515
516 /* val >>= p; // 2 \mu_p + x */
517 val_vec[i] = _mm512_srli_epi32(val_vec[i], p);
518
519 /* val &= ~1u; // 2 \mu_p */
520 val_vec[i] = _mm512_and_epi32(val_vec[i], _mm512_set1_epi32((int)~1u));
521
522 /* if (val) { */
// one mask bit per lane; all following masked operations leave the lanes
// with val == 0 at zero
523 val_mask[i] = _mm512_cmpneq_epi32_mask(val_vec[i], ZERO);
524
525 /* rho[i] = 1 << i;
526 * rho is processed below.
527 */
528
529 /* e_q[i] = 32 - (int)count_leading_zeros(--val); //2\mu_p - 1 */
530 val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
531 _eq_vec[i] = _mm512_mask_lzcnt_epi32(ZERO, val_mask[i], val_vec[i]);
532 _eq_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i],
533 _mm512_set1_epi32(32), _eq_vec[i]);
534
535 /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
536 * e_qmax is processed below
537 */
538
539 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
540 val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
541 _s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
542 _s_vec[i] =
543 _mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
544 /* } */
545 }
546
// val_vec is reused from here on as a 0/1 significance flag per sample
547 val_vec[0] = _mm512_mask_mov_epi32(ZERO, val_mask[0], ONE);
548 val_vec[1] = _mm512_mask_mov_epi32(ZERO, val_mask[1], ONE);
549 val_vec[2] = _mm512_mask_mov_epi32(ZERO, val_mask[2], ONE);
550 val_vec[3] = _mm512_mask_mov_epi32(ZERO, val_mask[3], ONE);
551 e_qmax_vec = ZERO;
552
// idx[0] selects the even lanes, idx[1] the odd lanes of a source vector
553 const __m512i idx[2] = {
554 _mm512_set_epi32(14, 12, 10, 8, 6, 4, 2, 0, 14, 12, 10, 8, 6, 4, 2, 0),
555 _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 15, 13, 11, 9, 7, 5, 3, 1),
556 };
557
558 /* Reorder from
559 * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5]...[0,14], [0,15]
560 * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5]...[1,14], [1,15]
561 * *_vec[2]:[0,16], [0,17], [0,18], [0,19], [0,20], [0,21]...[0,30], [0,31]
562 * *_vec[3]:[1,16], [1,17], [1,18], [1,19], [1,20], [1,21]...[1,30], [1,31]
563 * to
564 * *_vec[0]:[0, 0], [0, 2] ... [0,14], [0,16], [0,18] ... [0,30]
565 * *_vec[1]:[1, 0], [1, 2] ... [1,14], [1,16], [1,18] ... [1,30]
566 * *_vec[2]:[0, 1], [0, 3] ... [0,15], [0,17], [0,19] ... [0,31]
567 * *_vec[3]:[1, 1], [1, 3] ... [1,15], [1,17], [1,19] ... [1,31]
568 */
569 for (ui32 i = 0; i < 4; ++i) {
// e_idx picks even (0) or odd (1) columns, o_idx picks the row; the low 8
// lanes come from columns 0-15 and the high 8 lanes (mask 0xFF00) from
// columns 16-31
570 ui32 e_idx = i >> 1;
571 ui32 o_idx = i & 0x1;
572
573 eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
574 eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
575 idx[e_idx],
576 _eq_vec[o_idx + 2]);
577
578 s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
579 s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
580 idx[e_idx],
581 _s_vec[o_idx + 2]);
582
583 _rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
584 _rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
585 idx[e_idx],
586 val_vec[o_idx + 2]);
// move the 0/1 flag into bit position i of rho
587 _rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
588
589 e_qmax_vec = _mm512_max_epi32(e_qmax_vec, eq_vec[i]);
590 }
591
// combine the four single-bit vectors into the 4-bit rho pattern
592 rho_vec = _mm512_or_epi32(_rho_vec[0], _rho_vec[1]);
593 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[2]);
594 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[3]);
595}
596
597/* from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...]
598 * [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...]
599 * [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...]
600 * [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...]
601 *
602 * to [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31,
603 * 0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
604 *
605 * [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35,
606 * 0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
607 *
608 * [..]
609 */
610static void rotate_matrix(__m512i *matrix)
611{
612 __m512i _matrix[4];
613 _matrix[0] = _mm512_unpacklo_epi32(matrix[0], matrix[1]);
614 _matrix[1] = _mm512_unpackhi_epi32(matrix[0], matrix[1]);
615 _matrix[2] = _mm512_unpacklo_epi32(matrix[2], matrix[3]);
616 _matrix[3] = _mm512_unpackhi_epi32(matrix[2], matrix[3]);
617
618 matrix[0] = _mm512_unpacklo_epi64(_matrix[0], _matrix[2]);
619 matrix[1] = _mm512_unpackhi_epi64(_matrix[0], _matrix[2]);
620 matrix[2] = _mm512_unpacklo_epi64(_matrix[1], _matrix[3]);
621 matrix[3] = _mm512_unpackhi_epi64(_matrix[1], _matrix[3]);
622
623 _matrix[0] = _mm512_shuffle_i32x4(matrix[0], matrix[1], 0x88);
624 _matrix[1] = _mm512_shuffle_i32x4(matrix[2], matrix[3], 0x88);
625 _matrix[2] = _mm512_shuffle_i32x4(matrix[0], matrix[1], 0xDD);
626 _matrix[3] = _mm512_shuffle_i32x4(matrix[2], matrix[3], 0xDD);
627
628 matrix[0] = _mm512_shuffle_i32x4(_matrix[0], _matrix[1], 0x88);
629 matrix[1] = _mm512_shuffle_i32x4(_matrix[2], _matrix[3], 0x88);
630 matrix[2] = _mm512_shuffle_i32x4(_matrix[0], _matrix[1], 0xDD);
631 matrix[3] = _mm512_shuffle_i32x4(_matrix[2], _matrix[3], 0xDD);
632}
633
634static void proc_ms_encode(ms_struct *msp,
635 __m512i &tuple_vec,
636 __m512i &uq_vec,
637 __m512i &rho_vec,
638 __m512i *s_vec)
639{
640 __m512i m_vec[4];
641
642 /* Prepare parameters for ms_encode */
643 /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
644 auto tmp = _mm512_and_epi32(tuple_vec, ONE);
645 tmp = _mm512_sub_epi32(uq_vec, tmp);
646 auto tmp1 = _mm512_and_epi32(rho_vec, ONE);
647 auto mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
648 m_vec[0] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
649
650 /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
651 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(2));
652 tmp = _mm512_srli_epi32(tmp, 1);
653 tmp = _mm512_sub_epi32(uq_vec, tmp);
654 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
655 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
656 m_vec[1] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
657
658 /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
659 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(4));
660 tmp = _mm512_srli_epi32(tmp, 2);
661 tmp = _mm512_sub_epi32(uq_vec, tmp);
662 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
663 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
664 m_vec[2] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
665
666 /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
667 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(8));
668 tmp = _mm512_srli_epi32(tmp, 3);
669 tmp = _mm512_sub_epi32(uq_vec, tmp);
670 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
671 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
672 m_vec[3] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
673
674 rotate_matrix(m_vec);
675 /* s_vec from
676 * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30]
677 * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30]
678 * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31]
679 * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... [1,31]
680 * to
681 * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7]
682 * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15]
683 * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23]
684 * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31]
685 */
686 rotate_matrix(s_vec);
687
688 ui32 cwd[16];
689 int cwd_len[16];
690 ui64 _cwd = 0;
691 int _cwd_len = 0;
692
693 /* Each iteration process 8 bytes * 2 lines */
694 for (ui32 i = 0; i < 4; ++i) {
695 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
696 * cwd_len = m
697 */
698 _mm512_storeu_si512(cwd_len, m_vec[i]);
699 tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
700 tmp = _mm512_sub_epi32(tmp, ONE);
701 tmp = _mm512_and_epi32(tmp, s_vec[i]);
702 _mm512_storeu_si512(cwd, tmp);
703
704 for (ui32 j = 0; j < 8; ++j) {
705 ui32 idx = j * 2;
706 _cwd = cwd[idx];
707 _cwd_len = cwd_len[idx];
708 _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len;
709 _cwd_len += cwd_len[idx + 1];
710 ms_encode(msp, _cwd, _cwd_len);
711 }
712 }
713}
714
715static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
716 __m512i &e_qmax_vec)
717{
718 /* if (u_q[i] > 0) {
719 * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
720 * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
721 * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
722 * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
723 * }
724 */
725 auto u_q_mask = _mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
726
727 auto mask = _mm512_cmpeq_epi32_mask(eq_vec[0], e_qmax_vec);
728 auto tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
729 auto eps_vec = _mm512_mask_mov_epi32(ZERO, u_q_mask, tmp);
730
731 mask = _mm512_cmpeq_epi32_mask(eq_vec[1], e_qmax_vec);
732 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
733 tmp = _mm512_slli_epi32(tmp, 1);
734 eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
735
736 mask = _mm512_cmpeq_epi32_mask(eq_vec[2], e_qmax_vec);
737 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
738 tmp = _mm512_slli_epi32(tmp, 2);
739 eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
740
741 mask = _mm512_cmpeq_epi32_mask(eq_vec[3], e_qmax_vec);
742 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
743 tmp = _mm512_slli_epi32(tmp, 3);
744
745 return _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
746}
747
748static void update_lep(ui32 x, __m512i &prev_e_val_vec,
749 __m512i *eq_vec, __m512i *e_val_vec,
750 const __m512i left_shift)
751{
752 /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
753 * lep[0] = (ui8)e_q[3];
754 * Compare e_q[1] with e_q[3] of the prevous round.
755 */
756 auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
757 left_shift, eq_vec[3]);
758 prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
759 eq_vec[3]);
760 e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
761}
762
763
764static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec,
765 __m512i &rho_vec, __m512i *cx_val_vec,
766 const __m512i left_shift)
767{
768 /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
769 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
770 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
771 */
772 auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
773 left_shift, rho_vec);
774 prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
775 rho_vec);
776
777 tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
778 tmp = _mm512_srli_epi32(tmp, 3);
779
780 auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
781 tmp1 = _mm512_srli_epi32(tmp1, 1);
782 cx_val_vec[x] = _mm512_or_epi32(tmp, tmp1);
783}
784
785static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
786 __m512i &eps_vec, ui32 *vlc_tbl)
787{
788 /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
789 auto tmp = _mm512_slli_epi32(cq_vec, 8);
790 auto tmp1 = _mm512_slli_epi32(rho_vec, 4);
791 tmp = _mm512_add_epi32(tmp, tmp1);
792 tmp = _mm512_add_epi32(tmp, eps_vec);
793 return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
794}
795
796static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
797 const __m512i right_shift)
798{
799 ojph_unused(x);
800 ojph_unused(cx_val_vec);
801 ojph_unused(right_shift);
802
803 /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
804 auto tmp = _mm512_srli_epi32(rho_vec, 1);
805 auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(1));
806 return _mm512_or_epi32(tmp, tmp1);
807}
808
809static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
810 const __m512i right_shift)
811{
812 // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
813 // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
814 auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
815 auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
816 auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
817 tmp = _mm512_mask_permutexvar_epi32(tmp, 0xC000, right_shift, lcxp2_vec);
818 tmp = _mm512_slli_epi32(tmp, 2);
819 auto tmp1 = _mm512_mask_mov_epi32(lcxp1_vec, 0x8000, lcxp2_vec);
820 tmp = _mm512_add_epi32(tmp1, tmp);
821
822 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
823 tmp1 = _mm512_srli_epi32(tmp1, 1);
824 tmp = _mm512_or_epi32(tmp, tmp1);
825
826 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
827 tmp1 = _mm512_srli_epi32(tmp1, 2);
828
829 return _mm512_or_epi32(tmp, tmp1);
830}
831
// Function-pointer type matching the common signature of proc_cq1 and
// proc_cq2, so that one of the two can be selected at run time.
832using fn_proc_cq = __m512i (*)(ui32, __m512i *, __m512i &, const __m512i);
833
834static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
835 __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
836 const __m512i right_shift)
837{
838 /* Prepare mel_encode params */
839 /* if (c_q[i] == 0) { */
840 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
841 /* mel_encode(&mel, rho[i] != 0); */
842 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);
843 /* } */
844
845 /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
846 auto tmp = _mm512_permutexvar_epi32(right_shift, u_q_vec);
847 auto tmp1 = _mm512_min_epi32(u_q_vec, tmp);
848 auto mel_bit2 = (ui16)_mm512_cmpgt_epi32_mask(tmp1, _mm512_set1_epi32(2));
849
850 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
851 auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
852 mel_need_encode2 =
853 mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);
854
855 ui32 i_max = 16 - (ignore / 2);
856
857 for (ui32 i = 0; i < i_max; i += 2) {
858 auto mask = 1 << i;
859 if (0 != (mel_need_encode & mask)) {
860 mel_encode(melp, mel_bit & mask);
861 }
862
863 if (i + 1 < i_max) {
864 auto mask = 1 << (i + 1);
865 if (0 != (mel_need_encode & mask)) {
866 mel_encode(melp, mel_bit & mask);
867 }
868 }
869
870 if (0 != (mel_need_encode2 & mask)) {
871 mel_encode(melp, mel_bit2 & mask);
872 }
873 }
874}
875
876static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
877 __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
878 const __m512i right_shift)
879{
880 ojph_unused(u_q_vec);
881 ojph_unused(right_shift);
882
883 /* Prepare mel_encode params */
884 /* if (c_q[i] == 0) { */
885 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
886 /* mel_encode(&mel, rho[i] != 0); */
887 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);
888 /* } */
889
890 ui32 i_max = 16 - (ignore / 2);
891
892 for (ui32 i = 0; i < i_max; ++i) {
893 auto mask = 1 << i;
894 if (0 != (mel_need_encode & mask)) {
895 mel_encode(melp, mel_bit & mask);
896 }
897 }
898}
899
// Function-pointer type matching the common signature of proc_mel_encode1
// and proc_mel_encode2, so that one of the two can be selected at run time.
900using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
901 __m512i, ui32, const __m512i);
902
903static void proc_vlc_encode1(vlc_struct_avx512 *vlcp, ui32 *tuple,
904 ui32 *u_q, ui32 ignore)
905{
906 ui32 i_max = 16 - (ignore / 2);
907
908 for (ui32 i = 0; i < i_max; i += 2) {
909 /* 7 bits */
910 ui32 val = tuple[i + 0] >> 4;
911 int size = tuple[i + 0] & 7;
912
913 if (i + 1 < i_max) {
914 /* 7 bits */
915 val |= (tuple[i + 1] >> 4) << size;
916 size += tuple[i + 1] & 7;
917 }
918
919 if (u_q[i] > 2 && u_q[i + 1] > 2) {
920 /* 3 bits */
921 val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
922 size += ulvc_cwd_pre_len[u_q[i] - 2];
923
924 /* 3 bits */
925 val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
926 size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
927
928 /* 5 bits */
929 val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
930 size += ulvc_cwd_suf_len[u_q[i] - 2];
931
932 /* 5 bits */
933 val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
934 size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
935
936 } else if (u_q[i] > 2 && u_q[i + 1] > 0) {
937 /* 3 bits */
938 val |= (ulvc_cwd_pre[u_q[i]]) << size;
939 size += ulvc_cwd_pre_len[u_q[i]];
940
941 /* 1 bit */
942 val |= (u_q[i + 1] - 1) << size;
943 size += 1;
944
945 /* 5 bits */
946 val |= (ulvc_cwd_suf[u_q[i]]) << size;
947 size += ulvc_cwd_suf_len[u_q[i]];
948
949 } else {
950 /* 3 bits */
951 val |= (ulvc_cwd_pre[u_q[i]]) << size;
952 size += ulvc_cwd_pre_len[u_q[i]];
953
954 /* 3 bits */
955 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
956 size += ulvc_cwd_pre_len[u_q[i + 1]];
957
958 /* 5 bits */
959 val |= (ulvc_cwd_suf[u_q[i]]) << size;
960 size += ulvc_cwd_suf_len[u_q[i]];
961
962 /* 5 bits */
963 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
964 size += ulvc_cwd_suf_len[u_q[i + 1]];
965 }
966
967 vlc_encode(vlcp, val, size);
968 }
969}
970
971static void proc_vlc_encode2(vlc_struct_avx512 *vlcp, ui32 *tuple,
972 ui32 *u_q, ui32 ignore)
973{
974 ui32 i_max = 16 - (ignore / 2);
975
976 for (ui32 i = 0; i < i_max; i += 2) {
977 /* 7 bits */
978 ui32 val = tuple[i + 0] >> 4;
979 int size = tuple[i + 0] & 7;
980
981 if (i + 1 < i_max) {
982 /* 7 bits */
983 val |= (tuple[i + 1] >> 4) << size;
984 size += tuple[i + 1] & 7;
985 }
986
987 /* 3 bits */
988 val |= ulvc_cwd_pre[u_q[i]] << size;
989 size += ulvc_cwd_pre_len[u_q[i]];
990
991 /* 3 bits */
992 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
993 size += ulvc_cwd_pre_len[u_q[i + 1]];
994
995 /* 5 bits */
996 val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
997 size += ulvc_cwd_suf_len[u_q[i + 0]];
998
999 /* 5 bits */
1000 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
1001 size += ulvc_cwd_suf_len[u_q[i + 1]];
1002
1003 vlc_encode(vlcp, val, size);
1004 }
1005}
1006
// Function-pointer type matching the common signature of proc_vlc_encode1
// and proc_vlc_encode2, so that one of the two can be selected at run time.
1007using fn_proc_vlc_encode = void (*)(vlc_struct_avx512 *, ui32 *, ui32 *, ui32);
1008
// Encodes one codeblock with AVX-512 and stores the result in `coded`.
//
// The block is scanned two rows at a time, 32 columns (two 16-lane vectors
// per row) per inner iteration. Each iteration feeds three byte streams that
// live in local stack buffers: MagSgn (ms), MEL and VLC. After the scan the
// streams are terminated, copied into a buffer obtained from `elastic` in
// the order ms, mel, vlc, and the last two bytes are patched with
// mel.pos + vlc.pos (the "interface locator word").
//
//  buf          32-bit samples; row y starts at buf + y * stride
//  missing_msbs only used to derive p = 30 - missing_msbs for proc_pixel
//  num_passes   ignored
//  _width       number of valid columns; rounded up to a multiple of 32
//               internally. The 33-entry line buffers are indexed with
//               n_loop, which limits _width to 1024 columns.
//  height       number of rows; for an odd height the missing last row is
//               replaced by zeros
//  stride       distance between consecutive rows, in ui32 elements
//  lengths      lengths[0] receives the total number of bytes produced
//  elastic      allocator that supplies the output buffer
//  coded        handed to elastic->get_buffer(), then filled; its
//               avail_size is reduced by lengths[0]
1009void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
1010 ui32 num_passes, ui32 _width, ui32 height,
1011 ui32 stride, ui32* lengths,
1012 ojph::mem_elastic_allocator *elastic,
1013 ojph::coded_lists *& coded)
1014{
1015 ojph_unused(num_passes); //currently not used
1016
  // width is _width rounded up to a multiple of 32; ignore is the number of
  // padding columns in the last 32-column group (0..31).
1017 ui32 width = (_width + 31) & ~31u;
1018 ui32 ignore = width - _width;
1019 const int ms_size = (16384 * 16 + 14) / 15; //more than enough
1020 const int mel_vlc_size = 3072; //more than enough
1021 const int mel_size = 192;
1022 const int vlc_size = mel_vlc_size - mel_size;
1023
  // Both buffers are on the stack (17477 + 3072 bytes). MEL and VLC share
  // mel_vlc_buf: MEL gets the first mel_size bytes, VLC the remainder.
1024 ui8 ms_buf[ms_size];
1025 ui8 mel_vlc_buf[mel_vlc_size];
1026 ui8 *mel_buf = mel_vlc_buf;
1027 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1028
1029 mel_struct mel;
1030 mel_init(&mel, mel_size, mel_buf);
1031 vlc_struct_avx512 vlc;
1032 vlc_init(&vlc, vlc_size, vlc_buf);
1033 ms_struct ms;
1034 ms_init(&ms, ms_size, ms_buf);
1035
1036 ui32 p = 30 - missing_msbs;
1037
1038 //e_val: E values for a line (these are the highest set bit)
1039 //cx_val: is the context values
1040 //Each byte stores the info for the 2 sample. For E, it is maximum
1041 // of the two samples, while for cx, it is the OR of these two samples.
1042 //The maximum is between the pixel at the bottom left of one quad
1043 // and the bottom right of the earlier quad. The same is true for cx.
1044 //For a 1024 pixels, we need 512 bytes, the 2 extra,
1045 // one for the non-existing earlier quad, and one for beyond the
1046 // the end
  // NOTE(review): the sizes above talk about bytes, but in this function the
  // line state is held in __m512i arrays and handled with 32-bit lane
  // operations (33 vectors of 16 lanes) -- the wording looks inherited from
  // a scalar implementation; confirm.
  //
  // Lane-index tables for _mm512_permutexvar_epi32:
  //  right_shift: lane i reads lane i + 1 (lane 15 wraps to lane 0)
  //  left_shift:  lane i reads lane i - 1 (lane 0 wraps to lane 15)
1047 const __m512i right_shift = _mm512_set_epi32(
1048 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
1049 );
1050
1051 const __m512i left_shift = _mm512_set_epi32(
1052 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15
1053 );
1054
  // Only entries 0..31 are cleared here; entry n_loop is written at the top
  // of every row pair before it is read as e_val_vec[x + 1].
1055 __m512i e_val_vec[33];
1056 for (ui32 i = 0; i < 32; ++i) {
1057 e_val_vec[i] = ZERO;
1058 }
1059 __m512i prev_e_val_vec = ZERO;
1060
  // NOTE(review): cx_val_vec is not cleared; presumably proc_cq1, which is
  // used for the first row pair, does not read it -- confirm.
1061 __m512i cx_val_vec[33];
1062 __m512i prev_cx_val_vec = ZERO;
1063
1064 __m512i prev_cq_vec = ZERO;
1065
1066 __m512i tmp;
1067 __m512i tmp1;
1068
1069 __m512i eq_vec[4];
1070 __m512i s_vec[4];
1071 __m512i src_vec[4];
1072 __m512i rho_vec;
1073 __m512i e_qmax_vec;
1074 __m512i kappa_vec;
1075
  // width is already a multiple of 32, so this equals width / 32.
1076 ui32 n_loop = (width + 31) / 32;
1077
  // Table and helpers for the first row pair; they are replaced by the
  // second set (vlc_tbl1, proc_cq2, ...) at the bottom of the row loop.
1078 ui32 *vlc_tbl = vlc_tbl0;
1079 fn_proc_cq proc_cq = proc_cq1;
1080 fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
1081 fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
1082
1083 /* 2 lines per iteration */
1084 for (ui32 y = 0; y < height; y += 2)
1085 {
  // Slot n_loop is the entry just past the last 32-column group. It receives
  // the value carried out of the previous row pair's last group.
1086 e_val_vec[n_loop] = prev_e_val_vec;
1087 /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
1088 tmp = _mm512_and_epi32(prev_cx_val_vec, _mm512_set1_epi32(8));
1089 tmp = _mm512_srli_epi32(tmp, 3);
1090 cx_val_vec[n_loop] = tmp;
1091
1092 prev_e_val_vec = ZERO;
1093 prev_cx_val_vec = ZERO;
1094
1095 ui32 *sp = buf + y * stride;
1096
  // NOTE(review): each iteration consumes 32 ui32 samples per row (two
  // 16-lane loads, sp advances by 32 elements), not 32 bytes.
1097 /* 32 bytes per iteration */
1098 for (ui32 x = 0; x < n_loop; ++x) {
1099
  // entries is the number of columns of this group that lie at or beyond
  // _width. It is only positive for the last group, where it equals ignore
  // (< 32), so the shift count is always in range. The cleared top bits make
  // the masked loads return zero for those lanes.
1100 // mask to stop loading unnecessary data
1101 si32 true_x = (si32)x << 5;
1102 ui32 mask32 = 0xFFFFFFFFu;
1103 si32 entries = true_x + 32 - (si32)_width;
1104 mask32 >>= ((entries >= 0) ? entries : 0);
1105 __mmask16 load_mask0 = _cvtu32_mask16(mask32);
1106 __mmask16 load_mask1 = _cvtu32_mask16(mask32 >> 16);
1107
1108 /* t = sp[i]; */
1109 src_vec[0] = _mm512_maskz_loadu_epi32(load_mask0, sp);
1110 src_vec[2] = _mm512_maskz_loadu_epi32(load_mask1, sp + 16);
1111
  // Odd height: the second row of the last pair does not exist and is
  // replaced by zeros.
1112 if (y + 1 < height) {
1113 src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
1114 src_vec[3] =
1115 _mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
1116 } else {
1117 src_vec[1] = ZERO;
1118 src_vec[3] = ZERO;
1119 }
1120 sp += 32;
1121
1122 /* src_vec layout:
1123 * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5]...[0,15]
1124 * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5]...[1,15]
1125 * src_vec[2]:[0,16],[0,17],[0,18],[0,19],[0,20],[0,21]...[0,31]
1126 * src_vec[3]:[1,16],[1,17],[1,18],[1,19],[1,20],[1,21]...[1,31]
1127 */
1128 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1129
  // tmp holds, per lane, the right-hand neighbour of e_val_vec[x]; lane 15
  // takes lane 0 of e_val_vec[x + 1]. max_e_vec is the lane-wise maximum of
  // e_val_vec[x] and that neighbour, minus one.
1130 // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
1131 tmp = _mm512_permutexvar_epi32(right_shift, e_val_vec[x]);
1132 tmp = _mm512_mask_permutexvar_epi32(tmp, 0x8000, right_shift,
1133 e_val_vec[x + 1]);
1134 auto mask = _mm512_cmpgt_epi32_mask(e_val_vec[x], tmp);
1135 auto max_e_vec = _mm512_mask_mov_epi32(tmp, mask, e_val_vec[x]);
1136 max_e_vec = _mm512_sub_epi32(max_e_vec, ONE);
1137
  // rho & (rho - 1) is non-zero exactly when rho has more than one bit set;
  // only those lanes take max(1, max_e), every other lane gets 1.
1138 // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
1139 tmp = _mm512_max_epi32(max_e_vec, ONE);
1140 tmp1 = _mm512_sub_epi32(rho_vec, ONE);
1141 tmp1 = _mm512_and_epi32(rho_vec, tmp1);
1142 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
1143 kappa_vec = _mm512_mask_mov_epi32(ONE, mask, tmp);
1144
1145 /* cq[1 - 16] = cq_vec
1146 * cq[0] = prev_cq_vec[0]
1147 */
  // cq_vec: lanes 1..15 take lanes 0..14 of proc_cq's result and lane 0 takes
  // the value carried in prev_cq_vec. prev_cq_vec is then reloaded with lane
  // 15 of the result in its lane 0 (all other lanes zero) for the next group.
1148 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1149 auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
1150 left_shift, tmp);
1151 prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
1152 tmp);
1153
  // Refresh the line state (E values and contexts) of group x for the next
  // row pair. NOTE(review): this runs after max_e_vec and proc_cq have read
  // the previous row pair's values for this group; the order looks
  // significant -- keep it.
1154 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1155 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1156
1157 /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
1158 /* u_q[i] = Uq[i] - kappa[i]; */
1159 auto uq_vec = _mm512_max_epi32(kappa_vec, e_qmax_vec);
1160 auto u_q_vec = _mm512_sub_epi32(uq_vec, kappa_vec);
1161
1162 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1163 __m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
  // Only the last 32-column group can contain padding columns.
1164 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1165
1166 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1167 right_shift);
1168
1169 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1170
1171 // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7);
1172 // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7);
  // The VLC stage is scalar: the 16 lanes of tuple_vec (after the shift
  // below) and of u_q_vec are spilled to these arrays for proc_vlc_encode.
1173 ui32 u_q[16];
1174 ui32 tuple[16];
1175 /* The tuple is scaled by 4 due to:
1176 * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true);
1177 * So in the vlc_encode, the tuple will only be scaled by 2.
1178 */
1179 tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
1180 _mm512_storeu_si512(tuple, tuple_vec);
1181 _mm512_storeu_si512(u_q, u_q_vec);
1182 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
1183 }
1184
  // Seed prev_cq_vec for the next row pair: lane 0 becomes
  // cx_val_vec[0] lane 0 + (cx_val_vec[0] lane 1 << 2); all other lanes are
  // zeroed by the maskz form.
1185 tmp = _mm512_permutexvar_epi32(right_shift, cx_val_vec[0]);
1186 tmp = _mm512_slli_epi32(tmp, 2);
1187 prev_cq_vec = _mm512_maskz_add_epi32(0x1, tmp, cx_val_vec[0]);
1188
  // Every row pair after the first uses the second table/helper set.
1189 proc_cq = proc_cq2;
1190 vlc_tbl = vlc_tbl1;
1191 proc_mel_encode = proc_mel_encode2;
1192 proc_vlc_encode = proc_vlc_encode2;
1193 }
1194
1195 ms_terminate(&ms);
1196 terminate_mel_vlc(&mel, &vlc);
1197
  // Output layout: [ms bytes][mel bytes][vlc bytes].
  // NOTE(review): the VLC bytes are copied starting at vlc.buf - vlc.pos + 1,
  // which suggests that stream is written backwards from vlc.buf -- confirm
  // in vlc_init / vlc_encode.
1198 //copy to elastic
1199 lengths[0] = mel.pos + vlc.pos + ms.pos;
1200 elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1201 memcpy(coded->buf, ms.buf, ms.pos);
1202 memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
1203 memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1204
  // The last byte becomes num_bytes >> 4; the low nibble of the second-to-last
  // byte becomes num_bytes & 0xF while its high nibble is preserved.
1205 // put in the interface locator word
1206 ui32 num_bytes = mel.pos + vlc.pos;
1207 coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
1208 coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
1209 coded->buf[lengths[0]-2] =
1210 (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
1211
1212 coded->avail_size -= lengths[0];
1213}
1214
1215} /* namespace local */
1216} /* namespace ojph */
1217
1218#endif
void get_buffer(ui32 needed_bytes, coded_lists *&p)
Definition ojph_mem.cpp:113
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void ms_terminate(ms_struct *msp)
bool initialize_block_encoder_tables_avx512()
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read address to a multiple of 4.
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static void mel_encode(mel_struct *melp, bool bit)
static void mel_emit_bit(mel_struct *melp, int v)
static bool tables_initialized
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
void ojph_encode_codeblock_avx512(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
static ui32 population_count(ui32 val)
Definition ojph_arch.h:152
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define ojph_min(a, b)
Definition ojph_defs.h:76
#define ojph_unused(x)
Definition ojph_defs.h:78
#define OJPH_ERROR(t,...)