38#if defined(OJPH_ARCH_X86_64)
// Branch-prediction hint macros. The __builtin_expect forms tell the
// compiler the expected truth value (1 for likely, 0 for unlikely); the form
// under the OJPH_COMPILER_MSVC guard expands to the bare expression.
50#ifdef OJPH_COMPILER_MSVC
52 #define unlikely(x) (x)
54 #define likely(x) __builtin_expect((x), 1)
55 #define unlikely(x) __builtin_expect((x), 0)
// Four parallel 33-entry lookup tables: a prefix value, the prefix length,
// a suffix value and the suffix length. The encoder indexes them with u_q
// (or u_q - 2) and appends value/length pairs to the VLC bit stream.
73 static ui32 ulvc_cwd_pre[33];
74 static int ulvc_cwd_pre_len[33];
75 static ui32 ulvc_cwd_suf[33];
76 static int ulvc_cwd_suf_len[33];
// One row of a VLC source table, as consumed by the table-building loops:
//   c_q, rho     - compared with index bits 8 and up (c_q) and bits 4..7 (rho)
//   u_off        - selects which of the two search passes may pick this row
//   e_k, e_1     - row matches when (emb & e_k) == e_1
//   cwd, cwd_len - values packed into the generated table entry
81 struct vlc_src_table {
int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
82 vlc_src_table tbl0[] = {
// Number of rows in tbl0.
85 size_t tbl0_size =
sizeof(tbl0) /
sizeof(vlc_src_table);
// One entry per 4-bit pattern; read below as the number of set bits in e_k.
// NOTE(review): the loop body that fills it is not visible here.
87 si32 pattern_popcnt[16];
88 for (
ui32 i = 0; i < 16; ++i)
91 vlc_src_table* src_tbl = tbl0;
93 size_t tbl_size = tbl0_size;
// Visit all 2048 indices; each index packs c_q (bits 8..10), rho (bits 4..7)
// and emb (bits 0..3).
94 for (
int i = 0; i < 2048; ++i)
96 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
// True when emb has a bit that rho lacks, or when rho and c_q are both zero.
// NOTE(review): the statement executed for such indices is not visible here.
97 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
101 vlc_src_table *best_entry = NULL;
// Search pass over rows with u_off == 1: among rows matching c_q, rho and the
// masked emb, keep the one whose e_k has the most set bits (">=" lets a later
// row win a tie).
105 for (
size_t j = 0; j < tbl_size; ++j)
107 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
108 if (src_tbl[j].u_off == 1)
109 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
113 int ones_count = pattern_popcnt[src_tbl[j].e_k];
114 if (ones_count >= best_e_k)
116 best_entry = src_tbl + j;
117 best_e_k = ones_count;
// Search pass over rows with u_off == 0 matching c_q and rho.
// NOTE(review): the condition selecting between the two passes is not
// visible here.
124 for (
size_t j = 0; j < tbl_size; ++j)
126 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
127 if (src_tbl[j].u_off == 0)
129 best_entry = src_tbl + j;
// Pack the chosen row: codeword from bit 8 up, codeword length from bit 4 up.
// The remaining terms of the sum are not visible here.
135 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
140 vlc_src_table tbl1[] = {
// Number of rows in tbl1.
143 size_t tbl1_size =
sizeof(tbl1) /
sizeof(vlc_src_table);
147 tbl_size = tbl1_size;
// Same construction as for the first table, now with tbl1's row count:
// every index packs c_q (bits 8..10), rho (bits 4..7) and emb (bits 0..3).
148 for (
int i = 0; i < 2048; ++i)
150 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
// True when emb has a bit that rho lacks, or when rho and c_q are both zero.
// NOTE(review): the statement executed for such indices is not visible here.
151 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
155 vlc_src_table *best_entry = NULL;
// Pass over u_off == 1 rows: keep the matching row whose e_k has the most set
// bits; ">=" lets a later row win a tie.
159 for (
size_t j = 0; j < tbl_size; ++j)
161 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
162 if (src_tbl[j].u_off == 1)
163 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
167 int ones_count = pattern_popcnt[src_tbl[j].e_k];
168 if (ones_count >= best_e_k)
170 best_entry = src_tbl + j;
171 best_e_k = ones_count;
// Pass over u_off == 0 rows matching c_q and rho.
178 for (
size_t j = 0; j < tbl_size; ++j)
180 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
181 if (src_tbl[j].u_off == 0)
183 best_entry = src_tbl + j;
// Pack the chosen row: codeword from bit 8 up, codeword length from bit 4 up.
// The remaining terms of the sum are not visible here.
189 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
// Entries 0..4 of the four UVLC tables are written out by hand.
// Prefix values:
202 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
203 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
// Prefix lengths in bits:
204 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
205 ulvc_cwd_pre_len[2] = 2;
206 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
// Suffix values:
207 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
208 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
// Suffix lengths in bits:
209 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
210 ulvc_cwd_suf_len[2] = 0;
211 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
// Entries 5..32 all use a 3-bit prefix and a 5-bit suffix holding (i - 5).
// NOTE(review): the prefix value assigned for these entries is not visible
// here.
212 for (
int i = 5; i < 33; ++i)
215 ulvc_cwd_pre_len[i] = 3;
216 ulvc_cwd_suf[i] = (
ui32)(i-5);
217 ulvc_cwd_suf_len[i] = 5;
259 melp->buf_size = buffer_size;
260 melp->remaining_bits = 8;
// Shift the new bit v into the low end of the accumulator and count it.
271 melp->tmp = (melp->tmp << 1) + v;
272 melp->remaining_bits--;
// Current byte is complete: append it to the buffer.
273 if (melp->remaining_bits == 0) {
274 melp->buf[melp->pos++] = (
ui8)melp->tmp;
// The byte following an 0xFF accepts only 7 bits; otherwise 8.
275 melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8);
// Exponent for each of the 13 states k = 0..12; the run threshold in state k
// is 1 << mel_exp[k].
285 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
// Run length reached the threshold: move to the next state (capped at 12)
// and recompute the threshold.
289 if (melp->run >= melp->threshold) {
292 melp->k =
ojph_min(12, melp->k + 1);
293 melp->threshold = 1 << mel_exp[melp->k];
// Other path: t is the current state's exponent; the threshold is recomputed
// afterwards. NOTE(review): the statements between these two lines (how t is
// used and how k changes) are not visible here.
297 int t = mel_exp[melp->k];
303 melp->threshold = 1 << mel_exp[melp->k];
// State of the VLC bit writer used by this encoder. Only one member is
// visible in this excerpt.
310 struct vlc_struct_avx512 {
// Set to (byte > 0x8F) each time a full byte is written; consulted before
// the next byte is emitted.
318 bool last_greater_than_8F;
// buf points at the LAST byte of the supplied buffer; the writer stores
// bytes at (buf - pos).
325 vlcp->buf = data + buffer_size - 1;
327 vlcp->buf_size = buffer_size;
// Start in the "previous byte was > 0x8F" state.
332 vlcp->last_greater_than_8F =
true;
// Append the codeword above the bits already pending in the accumulator.
339 vlcp->tmp |= (
ui64)cwd << vlcp->used_bits;
340 vlcp->used_bits += cwd_len;
// Flush while at least one whole byte is pending.
342 while (vlcp->used_bits >= 8) {
// The previous byte was > 0x8F: inspect the low 7 pending bits first.
345 if (unlikely(vlcp->last_greater_than_8F)) {
346 tmp = vlcp->tmp & 0x7F;
// Low 7 bits are not all ones: emit a normal 8-bit byte.
348 if (likely(tmp != 0x7F)) {
349 tmp = vlcp->tmp & 0xFF;
350 *(vlcp->buf - vlcp->pos) = tmp;
351 vlcp->last_greater_than_8F = tmp > 0x8F;
353 vlcp->used_bits -= 8;
// Otherwise emit tmp and consume only 7 bits; the flag is cleared.
// NOTE(review): the else keyword and pos update lines are not visible here.
355 *(vlcp->buf - vlcp->pos) = tmp;
356 vlcp->last_greater_than_8F =
false;
358 vlcp->used_bits -= 7;
// Previous byte was not > 0x8F: emit a plain 8-bit byte.
362 tmp = vlcp->tmp & 0xFF;
363 *(vlcp->buf - vlcp->pos) = tmp;
364 vlcp->last_greater_than_8F = tmp > 0x8F;
366 vlcp->used_bits -= 8;
// If the previous VLC byte was > 0x8F and the low 7 pending bits are all
// ones, emit 0x7F and consume those 7 bits.
382 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
383 *(vlcp->buf - vlcp->pos) = 0x7f;
386 vlcp->used_bits -= 7;
// Left-align the pending MEL bits within their byte.
389 melp->tmp = melp->tmp << melp->remaining_bits;
// mel_mask: bit positions remaining_bits..7; vlc_mask: the low used_bits
// positions.
390 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
391 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
// Neither stream has pending bits.
// NOTE(review): the statement taken in this case is not visible here.
392 if ((mel_mask | vlc_mask) == 0)
395 if (melp->pos >= melp->buf_size)
396 OJPH_ERROR(0x00020003,
"mel encoder's buffer is full");
397 ui8 vlcp_tmp = (
ui8)vlcp->tmp;
// Try to share one byte: OR both partial bytes and accept the result only if
// no bit inside either stream's mask changed, the byte is not 0xFF, and
// vlcp->pos > 1.
398 int fuse = melp->tmp | vlcp_tmp;
399 if ( ( ((fuse ^ melp->tmp) & mel_mask)
400 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
401 && (fuse != 0xFF) && vlcp->pos > 1)
// Shared byte goes into the MEL buffer.
403 melp->buf[melp->pos++] = (
ui8)fuse;
// Fusion not possible: check VLC capacity, then write each partial byte to
// its own buffer.
407 if (vlcp->pos >= vlcp->buf_size)
408 OJPH_ERROR(0x00020004,
"vlc encoder's buffer is full");
409 melp->buf[melp->pos++] = (
ui8)melp->tmp;
410 *(vlcp->buf - vlcp->pos) = (
ui8)vlcp_tmp;
435 msp->buf_size = buffer_size;
// Refuse to write past the end of the magnitude-sign buffer.
447 if (msp->pos >= msp->buf_size)
448 OJPH_ERROR(0x00020005,
"magnitude sign encoder's buffer is full");
// t = how many bits go into the current byte: the smaller of the free bits
// and the bits left in the codeword.
449 int t =
ojph_min(msp->max_bits - msp->used_bits, cwd_len);
// Append the low t bits of cwd above the bits already used.
450 msp->tmp |= ((
ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
// Byte is full: store it; the next byte holds 7 bits if this one was 0xFF,
// else 8.
454 if (msp->used_bits >= msp->max_bits)
456 msp->buf[msp->pos++] = (
ui8)msp->tmp;
457 msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
// t = unused bit count of the current byte; fill those bits with ones.
470 int t = msp->max_bits - msp->used_bits;
471 msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
// Write the padded byte unless it came out as 0xFF.
473 if (msp->tmp != 0xFF)
475 if (msp->pos >= msp->buf_size)
476 OJPH_ERROR(0x00020006,
"magnitude sign encoder's buffer is full");
477 msp->buf[msp->pos++] = (
ui8)msp->tmp;
// Padded byte is 0xFF and the byte capacity is 7 bits.
// NOTE(review): the action taken here is not visible in this excerpt.
480 else if (msp->max_bits == 7)
// Shorthand for 512-bit vectors whose sixteen 32-bit lanes are all 0 / all 1.
484#define ZERO _mm512_setzero_epi32()
485#define ONE _mm512_set1_epi32(1)
// Debug helper: copies the sixteen 32-bit lanes of val into a local array
// and iterates over them (the output statements are not visible here).
//   msg : label supplied by the caller
//   val : vector to dump
488static void print_epi32(
const char *msg, __m512i &val)
490 uint32_t A[16] = {0};
// NOTE(review): _mm512_store_epi32 is the aligned store and expects a
// 64-byte-aligned destination; A has no visible alignment specifier.
// Confirm, or use an unaligned store such as _mm512_storeu_si512.
492 _mm512_store_epi32(A, val);
495 for (
int i = 0; i < 16; ++i) {
// Per-sample preprocessing of four source vectors.
//   src_vec    : four input vectors (only read in the visible code)
//   p          : right-shift amount applied after doubling each sample
//   eq_vec     : out, four regrouped vectors derived from _eq_vec
//   s_vec      : out, four regrouped vectors derived from _s_vec
//   rho_vec    : out, OR of four 0/1 flag vectors shifted left by 0..3
//   e_qmax_vec : updated with the lane-wise maximum over eq_vec[0..3]
502static void proc_pixel(__m512i *src_vec,
ui32 p,
503 __m512i *eq_vec, __m512i *s_vec,
504 __m512i &rho_vec, __m512i &e_qmax_vec)
512 for (
ui32 i = 0; i < 4; ++i) {
// Double the sample (moves bit 31 out), shift right by p, clear bit 0.
514 val_vec[i] = _mm512_add_epi32(src_vec[i], src_vec[i]);
517 val_vec[i] = _mm512_srli_epi32(val_vec[i], p);
520 val_vec[i] = _mm512_and_epi32(val_vec[i], _mm512_set1_epi32((
int)~1u));
// Lanes whose value is non-zero; all masked ops below zero the other lanes.
523 val_mask[i] = _mm512_cmpneq_epi32_mask(val_vec[i], ZERO);
// _eq_vec = 32 - lzcnt(val - 1) for the non-zero lanes.
530 val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
531 _eq_vec[i] = _mm512_mask_lzcnt_epi32(ZERO, val_mask[i], val_vec[i]);
532 _eq_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i],
533 _mm512_set1_epi32(32), _eq_vec[i]);
// Subtract one more, take bit 31 of the source sample, and add the two.
// NOTE(review): the destination of the final add is on a line not visible
// here.
540 val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
541 _s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
543 _mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
// Replace each val_vec by a 0/1 flag: 1 in lanes that were non-zero.
547 val_vec[0] = _mm512_mask_mov_epi32(ZERO, val_mask[0], ONE);
548 val_vec[1] = _mm512_mask_mov_epi32(ZERO, val_mask[1], ONE);
549 val_vec[2] = _mm512_mask_mov_epi32(ZERO, val_mask[2], ONE);
550 val_vec[3] = _mm512_mask_mov_epi32(ZERO, val_mask[3], ONE);
// Permutation indices: idx[0] selects even source lanes, idx[1] odd ones
// (each pattern repeated for the low and high halves).
553 const __m512i idx[2] = {
554 _mm512_set_epi32(14, 12, 10, 8, 6, 4, 2, 0, 14, 12, 10, 8, 6, 4, 2, 0),
555 _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 15, 13, 11, 9, 7, 5, 3, 1),
// Regroup: for each output i, permute one source vector, then overwrite the
// upper eight lanes (mask 0xFF00) with a second permute whose source is not
// visible here.
569 for (
ui32 i = 0; i < 4; ++i) {
571 ui32 o_idx = i & 0x1;
573 eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
574 eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
578 s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
579 s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
583 _rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
584 _rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
// Move flag i to bit position i.
587 _rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
589 e_qmax_vec = _mm512_max_epi32(e_qmax_vec, eq_vec[i]);
// Combine the four single-bit flags into one 4-bit pattern per lane.
592 rho_vec = _mm512_or_epi32(_rho_vec[0], _rho_vec[1]);
593 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[2]);
594 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[3]);
// Rearranges, in place, the 64 32-bit elements held in matrix[0..3] using
// four shuffle stages with _matrix as scratch (its declaration is not
// visible here).
// NOTE(review): the net effect looks like a transpose-style regrouping of a
// 4x16 matrix; confirm the exact output order against the callers.
610static void rotate_matrix(__m512i *matrix)
// Stage 1: interleave 32-bit elements of rows (0,1) and rows (2,3).
613 _matrix[0] = _mm512_unpacklo_epi32(matrix[0], matrix[1]);
614 _matrix[1] = _mm512_unpackhi_epi32(matrix[0], matrix[1]);
615 _matrix[2] = _mm512_unpacklo_epi32(matrix[2], matrix[3]);
616 _matrix[3] = _mm512_unpackhi_epi32(matrix[2], matrix[3]);
// Stage 2: interleave 64-bit pairs of the stage-1 results.
618 matrix[0] = _mm512_unpacklo_epi64(_matrix[0], _matrix[2]);
619 matrix[1] = _mm512_unpackhi_epi64(_matrix[0], _matrix[2]);
620 matrix[2] = _mm512_unpacklo_epi64(_matrix[1], _matrix[3]);
621 matrix[3] = _mm512_unpackhi_epi64(_matrix[1], _matrix[3]);
// Stage 3: 128-bit block shuffle; 0x88 picks blocks 0 and 2 of each operand,
// 0xDD picks blocks 1 and 3.
623 _matrix[0] = _mm512_shuffle_i32x4(matrix[0], matrix[1], 0x88);
624 _matrix[1] = _mm512_shuffle_i32x4(matrix[2], matrix[3], 0x88);
625 _matrix[2] = _mm512_shuffle_i32x4(matrix[0], matrix[1], 0xDD);
626 _matrix[3] = _mm512_shuffle_i32x4(matrix[2], matrix[3], 0xDD);
// Stage 4: same two selectors applied to the stage-3 results.
628 matrix[0] = _mm512_shuffle_i32x4(_matrix[0], _matrix[1], 0x88);
629 matrix[1] = _mm512_shuffle_i32x4(_matrix[2], _matrix[3], 0x88);
630 matrix[2] = _mm512_shuffle_i32x4(_matrix[0], _matrix[1], 0xDD);
631 matrix[3] = _mm512_shuffle_i32x4(_matrix[2], _matrix[3], 0xDD);
// Builds magnitude-sign codewords for one group of lanes.
// For each bit k of rho (k = 0..3) a length vector m_vec[k] is computed;
// lengths and the s_vec values are then rearranged with rotate_matrix and
// the resulting (codeword, length) pairs are combined two at a time.
// NOTE(review): the remaining parameters and the final write of each combined
// codeword are on lines not visible here.
634static void proc_ms_encode(
ms_struct *msp,
// k = 0: length = uq - (tuple bit 0) where rho bit 0 is set, else 0.
644 auto tmp = _mm512_and_epi32(tuple_vec, ONE);
645 tmp = _mm512_sub_epi32(uq_vec, tmp);
646 auto tmp1 = _mm512_and_epi32(rho_vec, ONE);
647 auto mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
648 m_vec[0] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
// k = 1: same with bit 1 of tuple and rho.
651 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(2));
652 tmp = _mm512_srli_epi32(tmp, 1);
653 tmp = _mm512_sub_epi32(uq_vec, tmp);
654 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
655 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
656 m_vec[1] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
// k = 2: same with bit 2.
659 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(4));
660 tmp = _mm512_srli_epi32(tmp, 2);
661 tmp = _mm512_sub_epi32(uq_vec, tmp);
662 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
663 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
664 m_vec[2] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
// k = 3: same with bit 3.
667 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(8));
668 tmp = _mm512_srli_epi32(tmp, 3);
669 tmp = _mm512_sub_epi32(uq_vec, tmp);
670 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
671 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
672 m_vec[3] = _mm512_mask_mov_epi32(ZERO, mask, tmp);
// Rearrange lengths and values with the same shuffle so they stay paired.
674 rotate_matrix(m_vec);
686 rotate_matrix(s_vec);
694 for (
ui32 i = 0; i < 4; ++i) {
// Spill the lengths, and the values masked to their low m bits, to arrays.
698 _mm512_storeu_si512(cwd_len, m_vec[i]);
699 tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
700 tmp = _mm512_sub_epi32(tmp, ONE);
701 tmp = _mm512_and_epi32(tmp, s_vec[i]);
702 _mm512_storeu_si512(cwd, tmp);
// Combine entries idx and idx + 1 into one 64-bit codeword: the second is
// placed above the first, and the lengths are summed.
704 for (
ui32 j = 0; j < 8; ++j) {
707 _cwd_len = cwd_len[idx];
708 _cwd |= ((
ui64)cwd[idx + 1]) << _cwd_len;
709 _cwd_len += cwd_len[idx + 1];
// Returns a 4-bit pattern per lane: bit k is set when eq_vec[k] equals
// e_qmax_vec in that lane. Lanes where u_q_vec is not greater than zero
// yield 0.
// NOTE(review): the last parameter line is not visible; e_qmax_vec is used
// as if it were that parameter.
715static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
// Lanes with u_q > 0; every masked op below zeroes the other lanes.
725 auto u_q_mask = _mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
// Bit 0.
727 auto mask = _mm512_cmpeq_epi32_mask(eq_vec[0], e_qmax_vec);
728 auto tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
729 auto eps_vec = _mm512_mask_mov_epi32(ZERO, u_q_mask, tmp);
// Bit 1.
731 mask = _mm512_cmpeq_epi32_mask(eq_vec[1], e_qmax_vec);
732 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
733 tmp = _mm512_slli_epi32(tmp, 1);
734 eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
// Bit 2.
736 mask = _mm512_cmpeq_epi32_mask(eq_vec[2], e_qmax_vec);
737 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
738 tmp = _mm512_slli_epi32(tmp, 2);
739 eps_vec = _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
// Bit 3, merged directly into the return value.
741 mask = _mm512_cmpeq_epi32_mask(eq_vec[3], e_qmax_vec);
742 tmp = _mm512_mask_mov_epi32(ZERO, mask, ONE);
743 tmp = _mm512_slli_epi32(tmp, 3);
745 return _mm512_mask_or_epi32(ZERO, u_q_mask, eps_vec, tmp);
// Stores into e_val_vec[x] the lane-wise maximum of eq_vec[1] and a permuted
// copy of eq_vec[3], and refreshes prev_e_val_vec for the next call.
//   x              : slot of e_val_vec to write
//   prev_e_val_vec : in/out; supplies lane 0 of the permuted vector
//   left_shift     : permutation index vector
748static void update_lep(
ui32 x, __m512i &prev_e_val_vec,
749 __m512i *eq_vec, __m512i *e_val_vec,
750 const __m512i left_shift)
// Lanes 1..15 come from eq_vec[3] permuted by left_shift; lane 0 keeps the
// value carried in prev_e_val_vec.
756 auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
757 left_shift, eq_vec[3]);
// New carry: only lane 0 is written, other lanes are zero.
// NOTE(review): the source operand of this permute is not visible here.
758 prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
760 e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
// Stores into cx_val_vec[x] a 0/1 flag per lane: bit 3 of a permuted copy of
// rho_vec OR-ed with bit 1 of rho_vec itself; also refreshes prev_cx_val_vec.
//   x               : slot of cx_val_vec to write
//   prev_cx_val_vec : in/out; supplies lane 0 of the permuted vector
//   left_shift      : permutation index vector
764static void update_lcxp(
ui32 x, __m512i &prev_cx_val_vec,
765 __m512i &rho_vec, __m512i *cx_val_vec,
766 const __m512i left_shift)
// Lanes 1..15 come from rho_vec permuted by left_shift; lane 0 keeps the
// value carried in prev_cx_val_vec.
772 auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
773 left_shift, rho_vec);
// New carry: only lane 0 is written, other lanes are zero.
// NOTE(review): the source operand of this permute is not visible here.
774 prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
// Bit 3 of the permuted value, moved to bit 0.
777 tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
778 tmp = _mm512_srli_epi32(tmp, 3);
// Bit 1 of the current rho, moved to bit 0.
780 auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
781 tmp1 = _mm512_srli_epi32(tmp1, 1);
782 cx_val_vec[x] = _mm512_or_epi32(tmp, tmp1);
// Looks up one 32-bit table entry per lane.
// The index is (cq << 8) + (rho << 4) + eps; entries are gathered from
// vlc_tbl with a byte scale of 4.
//   returns : vector of the gathered table entries
785static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
786 __m512i &eps_vec,
ui32 *vlc_tbl)
789 auto tmp = _mm512_slli_epi32(cq_vec, 8);
790 auto tmp1 = _mm512_slli_epi32(rho_vec, 4);
791 tmp = _mm512_add_epi32(tmp, tmp1);
792 tmp = _mm512_add_epi32(tmp, eps_vec);
793 return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
// Returns (rho >> 1) | (rho & 1) for every lane.
// x, cx_val_vec and right_shift are not used by the visible statements; the
// signature matches the fn_proc_cq function-pointer type.
796static __m512i proc_cq1(
ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
797 const __m512i right_shift)
804 auto tmp = _mm512_srli_epi32(rho_vec, 1);
805 auto tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(1));
806 return _mm512_or_epi32(tmp, tmp1);
// Builds a per-lane value from cx_val_vec[x], cx_val_vec[x + 1] and two bits
// of rho_vec:
//   (once-permuted cx) + (twice-permuted cx << 2)
//     | ((rho & 4) >> 1) | ((rho & 8) >> 2)
// where the top lanes of the permuted vectors are taken from
// cx_val_vec[x + 1].
809static __m512i proc_cq2(
ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
810 const __m512i right_shift)
// Both slots permuted once by right_shift.
814 auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
815 auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
// Second permute; lanes 14 and 15 (mask 0xC000) come from slot x + 1.
816 auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
817 tmp = _mm512_mask_permutexvar_epi32(tmp, 0xC000, right_shift, lcxp2_vec);
818 tmp = _mm512_slli_epi32(tmp, 2);
// Once-permuted vector with lane 15 (mask 0x8000) taken from slot x + 1.
819 auto tmp1 = _mm512_mask_mov_epi32(lcxp1_vec, 0x8000, lcxp2_vec);
820 tmp = _mm512_add_epi32(tmp1, tmp);
// rho bit 2 lands on bit 1.
822 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
823 tmp1 = _mm512_srli_epi32(tmp1, 1);
824 tmp = _mm512_or_epi32(tmp, tmp1);
// rho bit 3 lands on bit 1 as well.
826 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
827 tmp1 = _mm512_srli_epi32(tmp1, 2);
829 return _mm512_or_epi32(tmp, tmp1);
// Function-pointer type shared by proc_cq1 and proc_cq2 so the encoder can
// select one of them at run time.
832using fn_proc_cq = __m512i (*)(
ui32, __m512i *, __m512i &,
const __m512i);
// MEL-encoding driver that walks the lanes in pairs.
//   melp        : MEL coder state
//   cq_vec      : lanes equal to zero are the ones tested for encoding
//   rho_vec     : lanes different from zero set the corresponding mel_bit
//   u_q_vec     : used for the per-pair second test
//   ignore      : reduces the number of lanes processed by ignore / 2
//   right_shift : permutation index vector
// NOTE(review): the calls that actually emit MEL symbols are on lines not
// visible here.
834static void proc_mel_encode1(
mel_struct *melp, __m512i &cq_vec,
835 __m512i &rho_vec, __m512i u_q_vec,
ui32 ignore,
836 const __m512i right_shift)
// Bit i set: lane i has cq == 0.
840 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
// Bit i set: lane i has rho != 0.
842 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);
// Pair each lane with the lane selected by right_shift and test whether the
// smaller of the two u_q values exceeds 2.
846 auto tmp = _mm512_permutexvar_epi32(right_shift, u_q_vec);
847 auto tmp1 = _mm512_min_epi32(u_q_vec, tmp);
848 auto mel_bit2 = (
ui16)_mm512_cmpgt_epi32_mask(tmp1, _mm512_set1_epi32(2));
// Second test applies where both u_q values of the pair are greater than 0.
851 auto mel_need_encode2 = (
ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
853 mel_need_encode2 & (
ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);
// Number of lanes to process.
855 ui32 i_max = 16 - (ignore / 2);
857 for (
ui32 i = 0; i < i_max; i += 2) {
// First lane of the pair.
859 if (0 != (mel_need_encode & mask)) {
// Second lane of the pair.
864 auto mask = 1 << (i + 1);
865 if (0 != (mel_need_encode & mask)) {
// Per-pair second test.
870 if (0 != (mel_need_encode2 & mask)) {
// MEL-encoding driver that walks the lanes one at a time. Same signature as
// proc_mel_encode1; u_q_vec and right_shift are not used by the visible
// statements.
// NOTE(review): the call that emits the MEL symbol is on a line not visible
// here.
876static void proc_mel_encode2(
mel_struct *melp, __m512i &cq_vec,
877 __m512i &rho_vec, __m512i u_q_vec,
ui32 ignore,
878 const __m512i right_shift)
// Bit i set: lane i has cq == 0.
885 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec, ZERO);
// Bit i set: lane i has rho != 0.
887 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec, ZERO);
// Number of lanes to process.
890 ui32 i_max = 16 - (ignore / 2);
892 for (
ui32 i = 0; i < i_max; ++i) {
894 if (0 != (mel_need_encode & mask)) {
// Function-pointer type shared by proc_mel_encode1 and proc_mel_encode2.
900using fn_proc_mel_encode = void (*)(
mel_struct *, __m512i &, __m512i &,
901 __m512i,
ui32,
const __m512i);
// Assembles VLC bits for the lanes two at a time.
// For each pair it concatenates, low bits first: the codeword of tuple[i],
// the codeword of tuple[i + 1], then UVLC prefixes and suffixes selected by
// u_q[i] and u_q[i + 1]. Each tuple entry is read as: low 3 bits = length,
// bits 4 and up = codeword bits.
// NOTE(review): the remaining parameter lines, some branch headers and the
// call that writes val are on lines not visible here.
903static void proc_vlc_encode1(vlc_struct_avx512 *vlcp,
ui32 *tuple,
// Number of lanes to process.
906 ui32 i_max = 16 - (ignore / 2);
908 for (
ui32 i = 0; i < i_max; i += 2) {
// First codeword of the pair.
910 ui32 val = tuple[i + 0] >> 4;
911 int size = tuple[i + 0] & 7;
// Second codeword, appended above the first.
915 val |= (tuple[i + 1] >> 4) << size;
916 size += tuple[i + 1] & 7;
// Both u_q values exceed 2: tables are indexed with u_q - 2.
919 if (u_q[i] > 2 && u_q[i + 1] > 2) {
921 val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
922 size += ulvc_cwd_pre_len[u_q[i] - 2];
925 val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
926 size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
929 val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
930 size += ulvc_cwd_suf_len[u_q[i] - 2];
933 val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
934 size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
936 }
else if (u_q[i] > 2 && u_q[i + 1] > 0) {
// First u_q exceeds 2 and the second is non-zero: full prefix/suffix for the
// first, and (u_q[i + 1] - 1) inserted directly for the second.
938 val |= (ulvc_cwd_pre[u_q[i]]) << size;
939 size += ulvc_cwd_pre_len[u_q[i]];
942 val |= (u_q[i + 1] - 1) << size;
946 val |= (ulvc_cwd_suf[u_q[i]]) << size;
947 size += ulvc_cwd_suf_len[u_q[i]];
// Remaining case: both prefixes, then both suffixes, indexed by u_q directly.
951 val |= (ulvc_cwd_pre[u_q[i]]) << size;
952 size += ulvc_cwd_pre_len[u_q[i]];
955 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
956 size += ulvc_cwd_pre_len[u_q[i + 1]];
959 val |= (ulvc_cwd_suf[u_q[i]]) << size;
960 size += ulvc_cwd_suf_len[u_q[i]];
963 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
964 size += ulvc_cwd_suf_len[u_q[i + 1]];
// Assembles VLC bits for the lanes two at a time, without the u_q > 2
// special cases of proc_vlc_encode1: both codewords, then both UVLC
// prefixes, then both suffixes, all indexed by u_q directly.
// NOTE(review): the remaining parameter lines and the call that writes val
// are on lines not visible here.
971static void proc_vlc_encode2(vlc_struct_avx512 *vlcp,
ui32 *tuple,
// Number of lanes to process.
974 ui32 i_max = 16 - (ignore / 2);
976 for (
ui32 i = 0; i < i_max; i += 2) {
// First codeword: low 3 bits of the entry = length, bits 4 and up = bits.
978 ui32 val = tuple[i + 0] >> 4;
979 int size = tuple[i + 0] & 7;
// Second codeword, appended above the first.
983 val |= (tuple[i + 1] >> 4) << size;
984 size += tuple[i + 1] & 7;
// Prefixes for both lanes.
988 val |= ulvc_cwd_pre[u_q[i]] << size;
989 size += ulvc_cwd_pre_len[u_q[i]];
992 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
993 size += ulvc_cwd_pre_len[u_q[i + 1]];
// Suffixes for both lanes.
996 val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
997 size += ulvc_cwd_suf_len[u_q[i + 0]];
1000 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
1001 size += ulvc_cwd_suf_len[u_q[i + 1]];
// Function-pointer type shared by proc_vlc_encode1 and proc_vlc_encode2.
1007using fn_proc_vlc_encode = void (*)(vlc_struct_avx512 *,
ui32 *,
ui32 *,
ui32);
1012 ojph::mem_elastic_allocator *elastic,
1013 ojph::coded_lists *& coded)
// Round the width up to a multiple of 32; ignore = number of padding columns.
1017 ui32 width = (_width + 31) & ~31u;
1018 ui32 ignore = width - _width;
// Scratch buffer sizes in bytes. MEL and VLC share one 3072-byte array: MEL
// gets the first 192 bytes, VLC the rest.
1019 const int ms_size = (16384 * 16 + 14) / 15;
1020 const int mel_vlc_size = 3072;
1021 const int mel_size = 192;
1022 const int vlc_size = mel_vlc_size - mel_size;
1024 ui8 ms_buf[ms_size];
1025 ui8 mel_vlc_buf[mel_vlc_size];
1026 ui8 *mel_buf = mel_vlc_buf;
1027 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1031 vlc_struct_avx512 vlc;
1034 ms_init(&ms, ms_size, ms_buf);
// Shift amount handed to proc_pixel.
1036 ui32 p = 30 - missing_msbs;
// Permutation index vectors (arguments are listed from lane 15 down to
// lane 0). right_shift: lane i reads lane i + 1, lane 15 wraps to lane 0.
1047 const __m512i right_shift = _mm512_set_epi32(
1048 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
// left_shift: lane i reads lane i - 1, lane 0 wraps to lane 15.
1051 const __m512i left_shift = _mm512_set_epi32(
1052 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15
// Per-column-group state carried from one row pair to the next. Slots 0..31
// of e_val_vec are zeroed here; slot n_loop is written at the start of each
// row pair.
1055 __m512i e_val_vec[33];
1056 for (
ui32 i = 0; i < 32; ++i) {
1057 e_val_vec[i] = ZERO;
1059 __m512i prev_e_val_vec = ZERO;
1061 __m512i cx_val_vec[33];
1062 __m512i prev_cx_val_vec = ZERO;
1064 __m512i prev_cq_vec = ZERO;
// Number of 32-column groups per row.
1076 ui32 n_loop = (width + 31) / 32;
// Start with the "1" variants of the helpers; the MEL and VLC ones are
// switched to the "2" variants after the first row pair.
1079 fn_proc_cq proc_cq = proc_cq1;
1080 fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
1081 fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
// Process two rows per iteration.
1084 for (
ui32 y = 0; y < height; y += 2)
// Park the carried values in slot n_loop, then reset the carries.
1086 e_val_vec[n_loop] = prev_e_val_vec;
1088 tmp = _mm512_and_epi32(prev_cx_val_vec, _mm512_set1_epi32(8));
1089 tmp = _mm512_srli_epi32(tmp, 3);
1090 cx_val_vec[n_loop] = tmp;
1092 prev_e_val_vec = ZERO;
1093 prev_cx_val_vec = ZERO;
1095 ui32 *sp = buf + y * stride;
1098 for (
ui32 x = 0; x < n_loop; ++x) {
// Build load masks that switch off columns at or beyond _width: entries is
// how far this 32-column group reaches past _width (negative when it does
// not). The low 16 mask bits cover the first vector, the high 16 the second.
1102 ui32 mask32 = 0xFFFFFFFFu;
1103 si32 entries = true_x + 32 - (
si32)_width;
1104 mask32 >>= ((entries >= 0) ? entries : 0);
1105 __mmask16 load_mask0 = _cvtu32_mask16(mask32);
1106 __mmask16 load_mask1 = _cvtu32_mask16(mask32 >> 16);
// Row y goes to src_vec[0] and src_vec[2]; masked-off lanes read as zero.
1109 src_vec[0] = _mm512_maskz_loadu_epi32(load_mask0, sp);
1110 src_vec[2] = _mm512_maskz_loadu_epi32(load_mask1, sp + 16);
// Row y + 1 is loaded only when it exists.
1112 if (y + 1 < height) {
1113 src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
1115 _mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
1128 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
// max_e = max(e_val_vec[x], its right_shift-permuted neighbour) - 1.
1131 tmp = _mm512_permutexvar_epi32(right_shift, e_val_vec[x]);
1132 tmp = _mm512_mask_permutexvar_epi32(tmp, 0x8000, right_shift,
1134 auto mask = _mm512_cmpgt_epi32_mask(e_val_vec[x], tmp);
1135 auto max_e_vec = _mm512_mask_mov_epi32(tmp, mask, e_val_vec[x]);
1136 max_e_vec = _mm512_sub_epi32(max_e_vec, ONE);
// kappa = max(max_e, 1) where rho has more than one bit set
// (rho & (rho - 1) != 0), otherwise 1.
1139 tmp = _mm512_max_epi32(max_e_vec, ONE);
1140 tmp1 = _mm512_sub_epi32(rho_vec, ONE);
1141 tmp1 = _mm512_and_epi32(rho_vec, tmp1);
1142 mask = _mm512_cmpneq_epi32_mask(tmp1, ZERO);
1143 kappa_vec = _mm512_mask_mov_epi32(ONE, mask, tmp);
// cq for this group: lane 0 comes from the carried prev_cq_vec, the other
// lanes from a left_shift permute (operands not fully visible here).
1148 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1149 auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
1151 prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
// Update the state slots for column group x.
1154 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1155 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
// uq = max(kappa, e_qmax); u_q = uq - kappa (never negative).
1159 auto uq_vec = _mm512_max_epi32(kappa_vec, e_qmax_vec);
1160 auto u_q_vec = _mm512_sub_epi32(uq_vec, kappa_vec);
1162 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1163 __m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
// Padding lanes are skipped only in the last column group.
1164 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1166 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1169 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
// Drop the low 4 bits of each tuple, spill to arrays and VLC-encode.
1179 tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
1180 _mm512_storeu_si512(tuple, tuple_vec);
1181 _mm512_storeu_si512(u_q, u_q_vec);
1182 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
// Seed lane 0 of prev_cq_vec for the next row pair from cx_val_vec[0]:
// (permuted value << 2) + unpermuted value; other lanes are zero.
1185 tmp = _mm512_permutexvar_epi32(right_shift, cx_val_vec[0]);
1186 tmp = _mm512_slli_epi32(tmp, 2);
1187 prev_cq_vec = _mm512_maskz_add_epi32(0x1, tmp, cx_val_vec[0]);
1191 proc_mel_encode = proc_mel_encode2;
1192 proc_vlc_encode = proc_vlc_encode2;
// Total coded length, then copy the three segments in the order MS, MEL, VLC.
// The VLC bytes were written downwards from vlc.buf, so the segment starts
// at vlc.buf - vlc.pos + 1.
1199 lengths[0] = mel.pos + vlc.pos + ms.pos;
1200 elastic->
get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1201 memcpy(coded->
buf, ms.buf, ms.pos);
1202 memcpy(coded->
buf + ms.pos, mel.buf, mel.pos);
1203 memcpy(coded->
buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
// Record the MEL + VLC byte count in the last two bytes: bits 4 and up in
// the final byte, the low 4 bits in the low nibble of the byte before it
// (whose high nibble is preserved).
1206 ui32 num_bytes = mel.pos + vlc.pos;
1207 coded->
buf[lengths[0]-1] = (
ui8)(num_bytes >> 4);
1208 coded->
buf[lengths[0]-2] = coded->
buf[lengths[0]-2] & 0xF0;
1209 coded->
buf[lengths[0]-2] =
1210 (
ui8)(coded->
buf[lengths[0]-2] | (num_bytes & 0xF));
void get_buffer(ui32 needed_bytes, coded_lists *&p)
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void ms_terminate(ms_struct *msp)
bool initialize_block_encoder_tables_avx512()
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static void mel_encode(mel_struct *melp, bool bit)
static void mel_emit_bit(mel_struct *melp, int v)
static bool tables_initialized
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
void ojph_encode_codeblock_avx512(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static ui32 population_count(ui32 val)
#define OJPH_ERROR(t,...)