43 #if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
101 ui32 val = 0xFFFFFFFF;
102 if (melp->size > 4) {
103 val = *(ui32*)melp->data;
107 else if (melp->size > 0)
110 while (melp->size > 1) {
111 ui32 v = *melp->data++;
112 ui32 m = ~(0xFFu << i);
113 val = (val & m) | (v << i);
118 ui32 v = *melp->data++;
120 ui32 m = ~(0xFFu << i);
121 val = (val & m) | (v << i);
126 int bits = 32 - melp->unstuff;
133 bool unstuff = ((val & 0xFF) == 0xFF);
135 t = t << (8 - unstuff);
138 t |= (val>>8) & 0xFF;
139 unstuff = (((val >> 8) & 0xFF) == 0xFF);
141 t = t << (8 - unstuff);
143 t |= (val>>16) & 0xFF;
144 unstuff = (((val >> 16) & 0xFF) == 0xFF);
146 t = t << (8 - unstuff);
148 t |= (val>>24) & 0xFF;
149 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
153 melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
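/* The mel_decode fragment below converts unstuffed MEL bits into runs.  As
 * far as the visible code goes: mel_exp[k] gives the exponent eval for the
 * current state k; a leading 1 bit moves k up (capped at 12), while a
 * leading 0 bit is followed by eval literal bits of run, moves k down
 * (floored at 0), and stores the value as (run << 1) + 1.  Decoded runs are
 * packed into 'runs', 7 bits apiece, at most 8 at a time. */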
175 static const int mel_exp[13] = {
176 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
185 while (melp->bits >= 6 && melp->num_runs < 8)
187 int eval = mel_exp[melp->k];
189 if (melp->tmp & (1ull<<63))
193 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;
200 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
201 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0;
202 melp->tmp <<= eval + 1;
203 melp->bits -= eval + 1;
204 run = (run << 1) + 1;
206 eval = melp->num_runs * 7;
207 melp->runs &= ~((ui64)0x3F << eval);
208 melp->runs |= ((ui64)run) << eval;
226 melp->data = bbuf + lcup - scup;
229 melp->unstuff = false;
230 melp->size = scup - 1;
238 int num = 4 - (int)(intptr_t(melp->data) & 0x3);
239 for (int i = 0; i < num; ++i) {
240 assert(melp->unstuff == false || melp->data[0] <= 0x8F);
241 ui64 d = (melp->size > 0) ? *melp->data : 0xFF;
243 if (melp->size == 1) d |= 0xF;
245 melp->data += melp->size-- > 0;
246 int d_bits = 8 - melp->unstuff;
247 melp->tmp = (melp->tmp << d_bits) | d;
248 melp->bits += d_bits;
249 melp->unstuff = ((d & 0xFF) == 0xFF);
252 melp->tmp <<= (64 - melp->bits);
265 if (melp->num_runs == 0)
268 int t = melp->runs & 0x7F;
321 val = *(ui32*)(vlcp->data - 3);
325 else if (vlcp->size > 0)
328 while (vlcp->size > 0) {
329 ui32 v = *vlcp->data--;
336 __m128i tmp_vec = _mm_set1_epi32((int32_t)val);
337 tmp_vec = _mm_srlv_epi32(tmp_vec, _mm_setr_epi32(24, 16, 8, 0));
338 tmp_vec = _mm_and_si128(tmp_vec, _mm_set1_epi32(0xff));
340 __m128i unstuff_vec = _mm_cmpgt_epi32(tmp_vec, _mm_set1_epi32(0x8F));
341 bool unstuff_next = _mm_extract_epi32(unstuff_vec, 3);
342 unstuff_vec = _mm_slli_si128(unstuff_vec, 4);
343 unstuff_vec = _mm_insert_epi32(unstuff_vec, vlcp->unstuff * 0xffffffff, 0);
345 __m128i val_7f = _mm_set1_epi32(0x7F);
346 __m128i this_byte_7f = _mm_cmpeq_epi32(_mm_and_si128(tmp_vec, val_7f), val_7f);
347 unstuff_vec = _mm_and_si128(unstuff_vec, this_byte_7f);
348 unstuff_vec = _mm_srli_epi32(unstuff_vec, 31);
350 __m128i inc_sum = _mm_sub_epi32(_mm_set1_epi32(8), unstuff_vec);
351 inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
352 inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
353 ui32 total_bits = (ui32)_mm_extract_epi32(inc_sum, 3);
355 __m128i final_shift = _mm_slli_si128(inc_sum, 4);
356 tmp_vec = _mm_sllv_epi32(tmp_vec, final_shift);
357 tmp_vec = _mm_or_si128(tmp_vec, _mm_bsrli_si128(tmp_vec, 8));
359 ui64 tmp = (ui32)_mm_cvtsi128_si32(tmp_vec) | (ui32)_mm_extract_epi32(tmp_vec, 1);
361 vlcp->unstuff = unstuff_next;
362 vlcp->tmp |= tmp << vlcp->bits;
363 vlcp->bits += total_bits;
384 vlcp->data = data + lcup - 2;
387 vlcp->size = scup - 2;
389 ui32 d = *vlcp->data--;
391 vlcp->bits = 4 - ((vlcp->tmp & 7) == 7);
392 vlcp->unstuff = (d | 0xF) > 0x8F;
399 int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
400 int tnum = num < vlcp->size ? num : vlcp->size;
401 for (int i = 0; i < tnum; ++i) {
405 ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
406 vlcp->tmp |= d << vlcp->bits;
407 vlcp->bits += d_bits;
408 vlcp->unstuff = d > 0x8F;
430 return (ui32)vlcp->tmp;
442 assert(num_bits <= vlcp->bits);
443 vlcp->tmp >>= num_bits;
444 vlcp->bits -= num_bits;
445 return (ui32)vlcp->tmp;
468 val = *(ui32*)(mrp->data - 3);
472 else if (mrp->size > 0)
475 while (mrp->size > 0) {
476 ui32 v = *mrp->data--;
484 ui32 bits, tmp = val >> 24;
487 bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
488 bool unstuff = (val >> 24) > 0x8F;
491 tmp |= ((val >> 16) & 0xFF) << bits;
492 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
493 unstuff = ((val >> 16) & 0xFF) > 0x8F;
495 tmp |= ((val >> 8) & 0xFF) << bits;
496 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
497 unstuff = ((val >> 8) & 0xFF) > 0x8F;
499 tmp |= (val & 0xFF) << bits;
500 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
501 unstuff = (val & 0xFF) > 0x8F;
503 mrp->tmp |= (ui64)tmp << mrp->bits;
505 mrp->unstuff = unstuff;
526 mrp->data = data + lcup + len2 - 1;
536 int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
537 for (int i = 0; i < num; ++i) {
540 d = (mrp->size-- > 0) ? *mrp->data-- : 0;
542 ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
543 mrp->tmp |= d << mrp->bits;
545 mrp->unstuff = d > 0x8F;
566 return (ui32)mrp->tmp;
577 assert(num_bits <= mrp->bits);
578 mrp->tmp >>= num_bits;
579 mrp->bits -= num_bits;
580 return (ui32)mrp->tmp;
587 struct frwd_struct_avx2 {
617 assert(msp->bits <= 128);
619 __m128i offset, val, validity, all_xff;
620 val = _mm_loadu_si128((__m128i*)msp->data);
621 int bytes = msp->size >= 16 ? 16 : msp->size;
622 validity = _mm_set1_epi8((char)bytes);
626 offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100);
627 validity = _mm_cmpgt_epi8(validity, offset);
628 all_xff = _mm_set1_epi8(-1);
631 __m128i t = _mm_xor_si128(validity, all_xff);
632 val = _mm_or_si128(t, val);
635 val = _mm_and_si128(validity, val);
640 ff_bytes = _mm_cmpeq_epi8(val, all_xff);
641 ff_bytes = _mm_and_si128(ff_bytes, validity);
642 ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes);
644 ui32 next_unstuff = flags >> 16;
645 flags |= msp->unstuff;
657 t = _mm_set1_epi8((char)loc);
658 m = _mm_cmpgt_epi8(offset, t);
660 t = _mm_and_si128(m, val);
661 c = _mm_srli_epi64(t, 1);
662 t = _mm_srli_si128(t, 8);
663 t = _mm_slli_epi64(t, 63);
664 t = _mm_or_si128(t, c);
666 val = _mm_or_si128(t, _mm_andnot_si128(m, val));
670 assert(msp->bits >= 0 && msp->bits <= 128);
671 int cur_bytes = msp->bits >> 3;
672 int cur_bits = msp->bits & 7;
674 b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits));
675 b2 = _mm_slli_si128(val, 8);
676 b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits));
677 b1 = _mm_or_si128(b1, b2);
678 b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes));
679 b2 = _mm_or_si128(b1, b2);
680 _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2);
682 int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits;
683 cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3;
684 int upper = _mm_extract_epi16(val, 7);
685 upper >>= consumed_bits - 128 + 16;
686 msp->tmp[cur_bytes] = (ui8)upper;
688 msp->bits += (ui32)bits;
689 msp->unstuff = next_unstuff;
690 assert(msp->unstuff == 0 || msp->unstuff == 1);
704 void frwd_init(frwd_struct_avx2 *msp, const ui8* data, int size)
707 _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
708 _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
709 _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());
727 assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
728 msp->bits -= num_bits;
730 __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
733 __m128i v0, v1, c0, c1, t;
734 v0 = _mm_loadu_si128(p);
735 v1 = _mm_loadu_si128(p + 1);
738 c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
739 t = _mm_srli_si128(v0, 8);
740 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
741 c0 = _mm_or_si128(c0, t);
742 t = _mm_slli_si128(v1, 8);
743 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
744 c0 = _mm_or_si128(c0, t);
746 _mm_storeu_si128((__m128i*)msp->tmp, c0);
748 c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
749 t = _mm_srli_si128(v1, 8);
750 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
751 c1 = _mm_or_si128(c1, t);
753 _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
767 if (msp->bits <= 128)
770 if (msp->bits <= 128)
773 __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
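/* decode_two_quad32_avx2 below appears to handle two quads per call on the
 * 32-bit sample path: inf_u_q carries the significance flags and u values,
 * U_q the per-quad bit budgets, magsgn supplies MagSgn bits, p is the
 * bit-plane shift, and vn accumulates magnitude information that the caller
 * folds into v_n_scratch.  The returned 256-bit value holds the decoded
 * samples for the two rows of both quads. */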
787 static inline __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q, frwd_struct_avx2* magsgn, ui32 p, __m128i& vn) {
788 __m256i row = _mm256_setzero_si256();
791 __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110));
792 __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
794 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
796 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));
804 __m256i w0 = _mm256_srli_epi32(flags, 15);
805 m_n = _mm256_sub_epi32(U_q, w0);
806 m_n = _mm256_andnot_si256(insig, m_n);
810 __m256i inc_sum = m_n;
811 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
812 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
813 int total_mn1 = _mm256_extract_epi16(inc_sum, 6);
814 int total_mn2 = _mm256_extract_epi16(inc_sum, 14);
816 __m128i ms_vec0 = _mm_setzero_si128();
817 __m128i ms_vec1 = _mm_setzero_si128();
827 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
829 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4);
832 __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
833 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
834 byte_idx = _mm256_shuffle_epi8(byte_idx,
835 _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
836 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
837 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
838 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
839 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
842 bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));
844 __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1);
845 __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);
847 __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
848 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
849 d0 = _mm256_mullo_epi16(d0, bit_shift);
850 d0 = _mm256_srli_epi16(d0, 8);
851 d1 = _mm256_mullo_epi16(d1, bit_shift);
852 d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00));
853 d0 = _mm256_or_si256(d0, d1);
857 __m256i ones = _mm256_set1_epi32(1);
858 __m256i twos = _mm256_set1_epi32(2);
859 __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
860 U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
861 U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0);
862 w0 = _mm256_sub_epi32(twos, w0);
863 shift = _mm256_sllv_epi32(w0, U_q_m1);
864 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));
867 w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
868 w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
869 w0 = _mm256_andnot_si256(w0, shift);
870 ms_vec = _mm256_or_si256(ms_vec, w0);
871 w0 = _mm256_slli_epi32(ms_vec, 31);
872 ms_vec = _mm256_or_si256(ms_vec, ones);
873 __m256i tvn = ms_vec;
874 ms_vec = _mm256_add_epi32(ms_vec, twos);
875 ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1);
876 ms_vec = _mm256_or_si256(ms_vec, w0);
877 row = _mm256_andnot_si256(insig, ms_vec);
879 ms_vec = _mm256_andnot_si256(insig, tvn);
881 tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504));
883 vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
884 vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
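/* decode_four_quad16 below is the 16-bit-lane counterpart of the routine
 * above; it appears to process four quads per call and is used by the
 * caller's second path, where the v_n_scratch entries are ui16, presumably
 * when the working precision fits in 16 bits. */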
901 static inline __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct_avx2* magsgn, ui32 p, __m128i& vn) {
907 __m256i row = _mm256_setzero_si256();
908 __m128i ddd = _mm_shuffle_epi8(inf_u_q,
909 _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x908, 0x0504, 0x0504, 0x0100, 0x0100));
910 w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
911 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
913 flags = _mm256_and_si256(w0,
914 _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
915 (si16)0x8880, 0x4440, 0x2220, 0x1110,
916 (si16)0x8880, 0x4440, 0x2220, 0x1110,
917 (si16)0x8880, 0x4440, 0x2220, 0x1110));
918 insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
919 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
921 ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
922 __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
923 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
924 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
932 w0 = _mm256_srli_epi16(flags, 15);
933 m_n = _mm256_sub_epi16(U_q_avx, w0);
934 m_n = _mm256_andnot_si256(insig, m_n);
938 __m256i inc_sum = m_n;
939 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
940 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
941 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
942 int total_mn1 = _mm256_extract_epi16(inc_sum, 7);
943 int total_mn2 = _mm256_extract_epi16(inc_sum, 15);
944 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2);
946 __m128i ms_vec0 = _mm_setzero_si128();
947 __m128i ms_vec1 = _mm_setzero_si128();
957 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
960 __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
961 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
962 byte_idx = _mm256_shuffle_epi8(byte_idx,
963 _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
964 0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
965 0x0606, 0x0404, 0x0202, 0x0000));
966 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
967 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
968 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
969 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
972 __m256i bit_shift = _mm256_shuffle_epi8(
973 _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
974 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1,
975 1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
976 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
977 d0 = _mm256_mullo_epi16(d0, bit_shift);
978 d0 = _mm256_srli_epi16(d0, 8);
979 d1 = _mm256_mullo_epi16(d1, bit_shift);
980 d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00));
981 d0 = _mm256_or_si256(d0, d1);
984 __m256i shift, t0, t1, Uq0, Uq1;
985 __m256i ones = _mm256_set1_epi16(1);
986 __m256i twos = _mm256_set1_epi16(2);
987 __m256i U_q_m1 = _mm256_sub_epi32(U_q_avx, ones);
988 Uq0 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
989 Uq1 = _mm256_bsrli_epi128(U_q_m1, 14);
990 w0 = _mm256_sub_epi16(twos, w0);
991 t0 = _mm256_and_si256(w0, _mm256_set_epi64x(0, -1, 0, -1));
992 t1 = _mm256_and_si256(w0, _mm256_set_epi64x(-1, 0, -1, 0));
994 __m128i t_0_sse = _mm256_castsi256_si128(t0);
995 t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq0));
996 __m128i t_1_sse = _mm256_extracti128_si256(t0 , 0x1);
997 t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq0, 0x1));
998 t0 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1);
1000 t_0_sse = _mm256_castsi256_si128(t1);
1001 t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq1));
1002 t_1_sse = _mm256_extracti128_si256(t1, 0x1);
1003 t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq1, 0x1));
1004 t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1);
1006 shift = _mm256_or_si256(t0, t1);
1007 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));
1010 w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
1011 w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
1012 w0 = _mm256_andnot_si256(w0, shift);
1013 ms_vec = _mm256_or_si256(ms_vec, w0);
1014 w0 = _mm256_slli_epi16(ms_vec, 15);
1015 ms_vec = _mm256_or_si256(ms_vec, ones);
1016 __m256i tvn = ms_vec;
1017 ms_vec = _mm256_add_epi16(ms_vec, twos);
1018 ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1);
1019 ms_vec = _mm256_or_si256(ms_vec, w0);
1020 row = _mm256_andnot_si256(insig, ms_vec);
1022 ms_vec = _mm256_andnot_si256(insig, tvn);
1024 __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
1025 _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
1026 -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
1027 __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
1028 _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
1029 -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
1030 ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);
1032 vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
1033 vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
1039 inline __m256i avx2_lzcnt_epi32(__m256i v) {
1041 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);
1043 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));
1044 v = _mm256_srli_epi32(v, 23);
1045 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v);
1046 v = _mm256_min_epi16(v, _mm256_set1_epi32(32));
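/* A scalar sketch of the same leading-zero trick (hypothetical helper, for
 * illustration only; assumes the file's ui32 typedef and <cstring> for
 * memcpy): the andnot step keeps the int-to-float conversion from rounding
 * up to the next power of two, the IEEE-754 exponent field then encodes the
 * position of the highest set bit, and 158 - exponent is the leading-zero
 * count, with an input of 0 saturating to 32. */
static inline ui32 lzcnt32_sketch(ui32 v) {
  v &= ~(v >> 8);                    // prevent rounding up during conversion
  float f = (float)v;
  ui32 u;
  memcpy(&u, &f, sizeof(u));         // reinterpret the float's bit pattern
  ui32 e = u >> 23;                  // biased exponent: 127 + floor(log2(v))
  ui32 n = e < 158 ? 158 - e : 0;    // mirrors _mm256_subs_epu16(158, ...)
  return n > 32 ? 32 : n;            // v == 0 gives e == 0, clamp to 32
}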
1069 ui32 missing_msbs, ui32 num_passes,
1074 static bool insufficient_precision = false;
1075 static bool modify_code = false;
1076 static bool truncate_spp_mrp = false;
1078 if (num_passes > 1 && lengths2 == 0)
1080 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1081 "one coding pass, but zero length for "
1082 "2nd and potential 3rd pass.");
1088 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1089 "This codeblock has %d passes.",
1094 if (missing_msbs > 30)
1096 if (insufficient_precision == false)
1098 insufficient_precision = true;
1099 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1100 "codeblock. This message will not be "
1101 "displayed again.");
1105 else if (missing_msbs == 30)
1107 if (modify_code == false) {
1109 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1110 "pass. The code can be modified to support "
1111 "this case. This message will not be "
1112 "displayed again.");
1116 else if (missing_msbs == 29)
1118 if (num_passes > 1) {
1120 if (truncate_spp_mrp == false) {
1121 truncate_spp_mrp = true;
1122 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1123 "nor MagRef passes; both will be skipped. "
1124 "This message will not be displayed "
1129 ui32 p = 30 - missing_msbs;
1135 OJPH_WARN(0x00010006, "Wrong codeblock length.");
1141 lcup = (int)lengths1;
1143 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1144 if (scup < 2 || scup > lcup || scup > 4079)
1162 ui16 scratch[8 * 513] = {0};
1170 ui32 sstr = ((width + 2u) + 7u) & ~7u;
1172 assert((stride & 0x3) == 0);
1174 ui32 mmsbp2 = missing_msbs + 2;
1186 mel_init(&mel, coded_data, lcup, scup);
1188 rev_init(&vlc, coded_data, lcup, scup);
1198 for (ui32 x = 0; x < width; sp += 4)
1217 t0 = (run == -1) ? t0 : 0;
1231 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1240 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1243 if (c_q == 0 && x < width)
1248 t1 = (run == -1) ? t1 : 0;
1253 t1 = x < width ? t1 : 0;
1262 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1270 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1271 if (uvlc_mode == 0xc0)
1275 uvlc_mode += (run == -1) ? 0x40 : 0;
1292 ui32 len = uvlc_entry & 0xF;
1293 ui32 tmp = vlc_val & ((1 << len) - 1);
1298 len = uvlc_entry & 0x7;
1300 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len)));
1302 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));
1308 for (ui32 y = 2; y < height; y += 2)
1311 ui16 *sp = scratch + (y >> 1) * sstr;
1313 for (ui32 x = 0; x < width; sp += 4)
1319 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1320 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1336 t0 = (run == -1) ? t0 : 0;
1351 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1353 c_q |= sp[0 - (si32)sstr] & 0x80;
1355 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1356 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1365 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1368 if (c_q == 0 && x < width)
1373 t1 = (run == -1) ? t1 : 0;
1378 t1 = x < width ? t1 : 0;
1388 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1390 c_q |= sp[2 - (si32)sstr] & 0x80;
1398 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1404 ui32 len = uvlc_entry & 0xF;
1405 ui32 tmp = vlc_val & ((1 << len) - 1);
1410 len = uvlc_entry & 0x7;
1412 ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1414 u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len));
1437 const int v_n_size = 512 + 16;
1438 ui32 v_n_scratch[2 * v_n_size] = {0};
1440 frwd_struct_avx2 magsgn;
1443 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
1447 ui32 *vp = v_n_scratch;
1448 ui32 *dp = decoded_data;
1451 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1453 __m128i vn = _mm_set1_epi32(2);
1455 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1456 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1458 __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
1459 __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
1460 if (!_mm256_testz_si256(w, w)) {
1464 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn);
1465 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1466 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1467 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1469 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1470 w0 = _mm_or_si128(w0, vn);
1471 _mm_storeu_si128((__m128i*)vp, w0);
1475 for (ui32 y = 2; y < height; y += 2)
1479 ui32 *vp = v_n_scratch;
1480 ui16* sp = scratch + (y >> 1) * sstr;
1482 const __m256i avx_31 = _mm256_set1_epi32(31);
1483 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1484 const __m256i avx_1 = _mm256_set1_epi32(1);
1485 const __m256i avx_0 = _mm256_setzero_si256();
1487 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
1488 __m256i v = _mm256_loadu_si256((__m256i*)vp);
1489 __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
1490 v = _mm256_or_si256(v, v_p1);
1491 v = avx2_lzcnt_epi32(v);
1492 v = _mm256_sub_epi32(avx_31, v);
1494 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1495 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1496 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1497 gamma = _mm256_and_si256(gamma, w0);
1498 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1500 v = _mm256_andnot_si256(gamma, v);
1501 v = _mm256_max_epi32(v, avx_1);
1503 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1504 v = _mm256_add_epi32(inf_u_q, v);
1506 w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
1507 if (!_mm256_testz_si256(w0, w0)) {
1511 _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
1515 ui32 *vp = v_n_scratch;
1516 ui16 *sp = scratch + (y >> 1) * sstr;
1517 ui32 *dp = decoded_data + y * stride;
1520 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
1522 __m128i vn = _mm_set1_epi32(2);
1524 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1525 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1527 __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
1528 U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1530 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn);
1531 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1532 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1533 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1535 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1536 w0 = _mm_or_si128(w0, vn);
1537 _mm_storeu_si128((__m128i*)vp, w0);
1552 const int v_n_size = 512 + 16;
1553 ui16 v_n_scratch[v_n_size] = {0};
1554 ui32 v_n_scratch_32[v_n_size] = {0};
1556 frwd_struct_avx2 magsgn;
1561 ui16 *vp = v_n_scratch;
1562 ui32 *dp = decoded_data;
1565 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
1567 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1568 __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
1569 __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1570 if (!_mm_testz_si128(w, w)) {
1574 __m128i vn = _mm_set1_epi16(2);
1575 __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn);
1577 w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
1578 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1580 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1581 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1583 _mm256_storeu_si256((__m256i*)dp, w0);
1584 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1588 for (ui32 y = 2; y < height; y += 2) {
1591 ui16 *vp = v_n_scratch;
1592 ui32 *vp_32 = v_n_scratch_32;
1594 ui16* sp = scratch + (y >> 1) * sstr;
1595 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
1596 const __m256i avx_31 = _mm256_set1_epi32(31);
1597 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1598 const __m256i avx_1 = _mm256_set1_epi32(1);
1599 const __m256i avx_0 = _mm256_setzero_si256();
1601 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
1602 __m128i v = _mm_loadu_si128((__m128i*)vp);
1603 __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
1604 v = _mm_or_si128(v, v_p1);
1606 __m256i v_avx = _mm256_cvtepu16_epi32(v);
1607 v_avx = avx2_lzcnt_epi32(v_avx);
1608 v_avx = _mm256_sub_epi32(avx_31, v_avx);
1610 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1611 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1612 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1613 gamma = _mm256_and_si256(gamma, w0);
1614 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1616 v_avx = _mm256_andnot_si256(gamma, v_avx);
1617 v_avx = _mm256_max_epi32(v_avx, avx_1);
1619 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1620 v_avx = _mm256_add_epi32(inf_u_q, v_avx);
1622 w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
1623 if (!_mm256_testz_si256(w0, w0)) {
1627 _mm256_storeu_si256((__m256i*)vp_32, v_avx);
1631 ui16 *vp = v_n_scratch;
1632 ui32* vp_32 = v_n_scratch_32;
1633 ui16 *sp = scratch + (y >> 1) * sstr;
1634 ui32 *dp = decoded_data + y * stride;
1637 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
1639 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1640 __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);
1642 __m128i vn = _mm_set1_epi16(2);
1643 __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn);
1645 __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
1646 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1648 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1649 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1651 _mm256_storeu_si256((__m256i*)dp, w0);
1652 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1666 ui16* const sigma = scratch;
1668 ui32 mstr = (width + 3u) >> 2;
1670 mstr = ((mstr + 2u) + 7u) & ~7u;
1678 const __m128i mask_3 = _mm_set1_epi32(0x30);
1679 const __m128i mask_C = _mm_set1_epi32(0xC0);
1680 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1681 for (y = 0; y < height; y += 4)
1683 ui16* sp = scratch + (y >> 1) * sstr;
1684 ui16* dp = sigma + (y >> 2) * mstr;
1685 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1687 __m128i s0, s1, u3, uC, t0, t1;
1689 s0 = _mm_loadu_si128((__m128i*)(sp));
1690 u3 = _mm_and_si128(s0, mask_3);
1691 u3 = _mm_srli_epi32(u3, 4);
1692 uC = _mm_and_si128(s0, mask_C);
1693 uC = _mm_srli_epi32(uC, 2);
1694 t0 = _mm_or_si128(u3, uC);
1696 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1697 u3 = _mm_and_si128(s1, mask_3);
1698 u3 = _mm_srli_epi32(u3, 2);
1699 uC = _mm_and_si128(s1, mask_C);
1700 t1 = _mm_or_si128(u3, uC);
1702 __m128i r = _mm_or_si128(t0, t1);
1703 r = _mm_shuffle_epi8(r, shuffle_mask);
1705 *(ui32*)dp = (ui32)_mm_extract_epi32(r, 0);
1711 ui16* dp = sigma + (y >> 2) * mstr;
1712 __m128i zero = _mm_setzero_si128();
1713 for (ui32 x = 0; x < width; x += 32, dp += 8)
1714 _mm_storeu_si128((__m128i*)dp, zero);
1730 ui16 prev_row_sig[256 + 8] = {0};
1732 frwd_struct_avx2 sigprop;
1733 frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1735 for (ui32 y = 0; y < height; y += 4)
1737 ui32 pattern = 0xFFFFu;
1738 if (height - y < 4) {
1740 if (height - y < 3) {
1750 ui16 *prev_sig = prev_row_sig;
1751 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1752 ui32 *dpp = decoded_data + y * stride;
1753 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1758 pattern = pattern >> (s * 4);
1773 ui32 ns = *(ui32*)(cur_sig + mstr);
1774 ui32 u = (ps & 0x88888888) >> 3;
1776 u |= (ns & 0x11111111) << 3;
1781 mbr |= (cs & 0x77777777) << 1;
1782 mbr |= (cs & 0xEEEEEEEE) >> 1;
1799 ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
1802 ui32 col_mask = 0xFu;
1803 ui32 inv_sig = ~cs & pattern;
1804 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1806 if ((col_mask & new_sig) == 0)
1810 ui32 sample_mask = 0x1111u & col_mask;
1811 if (new_sig & sample_mask)
1813 new_sig &= ~sample_mask;
1816 ui32 t = 0x33u << i;
1817 new_sig |= t & inv_sig;
1823 if (new_sig & sample_mask)
1825 new_sig &= ~sample_mask;
1828 ui32 t = 0x76u << i;
1829 new_sig |= t & inv_sig;
1835 if (new_sig & sample_mask)
1837 new_sig &= ~sample_mask;
1840 ui32 t = 0xECu << i;
1841 new_sig |= t & inv_sig;
1847 if (new_sig & sample_mask)
1849 new_sig &= ~sample_mask;
1852 ui32 t = 0xC8u << i;
1853 new_sig |= t & inv_sig;
1861 cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
1865 __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1866 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1867 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1868 new_sig_vec = _mm_and_si128(new_sig_vec,
1869 _mm_set1_epi64x((si64)0x8040201008040201));
1870 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1871 _mm_set1_epi64x((si64)0x8040201008040201));
1875 __m128i inc_sum = new_sig_vec;
1876 inc_sum = _mm_abs_epi8(inc_sum);
1877 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1878 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1879 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1880 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1881 cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1883 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1887 cwd_vec = _mm_set1_epi16((si16)cwd);
1888 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1889 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1890 cwd_vec = _mm_and_si128(cwd_vec,
1891 _mm_set1_epi64x((si64)0x8040201008040201));
1892 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1893 _mm_set1_epi64x((si64)0x8040201008040201));
1894 cwd_vec = _mm_abs_epi8(cwd_vec);
1898 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1902 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1903 __m128i val = _mm_set1_epi32(3 << (p - 2));
1905 for (int c = 0; c < 4; ++c) {
1906 __m128i s0, s0_ns, s0_val;
1908 s0 = _mm_load_si128((__m128i*)dp);
1912 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1913 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1916 s0_val = _mm_shuffle_epi8(v, m);
1917 s0_val = _mm_slli_epi32(s0_val, 31);
1918 s0_val = _mm_or_si128(s0_val, val);
1919 s0_val = _mm_and_si128(s0_val, s0_ns);
1922 s0 = _mm_or_si128(s0, s0_val);
1924 _mm_store_si128((__m128i*)dp, s0);
1927 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1934 *prev_sig = (ui16)(new_sig);
1938 new_sig |= (t & 0x7777) << 1;
1939 new_sig |= (t & 0xEEEE) >> 1;
1952 rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
1954 for (ui32 y = 0; y < height; y += 4)
1956 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1957 ui32 *dpp = decoded_data + y * stride;
1958 for (ui32 i = 0; i < width; i += 4, dpp += 4)
1963 ui16 sig = *cur_sig++;
1971 __m128i sig_vec = _mm_set1_epi16((si16)sig);
1972 sig_vec = _mm_shuffle_epi8(sig_vec,
1973 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1974 sig_vec = _mm_and_si128(sig_vec,
1975 _mm_set1_epi64x((si64)0x8040201008040201));
1976 sig_vec = _mm_cmpeq_epi8(sig_vec,
1977 _mm_set1_epi64x((si64)0x8040201008040201));
1978 sig_vec = _mm_abs_epi8(sig_vec);
1982 __m128i inc_sum = sig_vec;
1983 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1984 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1985 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1986 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1987 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
1988 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1995 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
1996 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1997 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1998 cwd_vec = _mm_and_si128(cwd_vec,
1999 _mm_set1_epi64x((si64)0x8040201008040201));
2000 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
2001 _mm_set1_epi64x((si64)0x8040201008040201));
2002 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
2003 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
2004 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
2008 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
2010 for (int c = 0; c < 4; ++c) {
2011 __m128i s0, s0_sig, s0_idx, s0_val;
2013 s0 = _mm_load_si128((__m128i*)dp);
2015 s0_sig = _mm_shuffle_epi8(sig_vec, m);
2016 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
2018 s0_idx = _mm_shuffle_epi8(ex_sum, m);
2019 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
2021 s0_val = _mm_andnot_si128(s0_sig, s0_val);
2023 s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
2024 s0 = _mm_xor_si128(s0, s0_val);
2026 _mm_store_si128((__m128i*)dp, s0);
2029 m = _mm_add_epi32(m, _mm_set1_epi32(1));
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for the initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial rows of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial rows of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for the initial row of quads
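A sketch of how these tables are consulted in the cleanup pass, pieced together from the decoding loop in the listing above; the exact uvlc_tbl0 indexing shown here is an assumption rather than a quote from the file:

ui32 t0 = vlc_tbl0[c_q + (vlc_val & 0x7F)];                 // VLC entry for the current quad
ui32 uvlc_mode  = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);  // as in the listing
ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];  // u-value prefix/suffix info (assumed indexing)
ui32 len = uvlc_entry & 0xF;                                // suffix length, as used above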
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes the rev_struct structure for the MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static void frwd_advance(frwd_struct32 *msp, ui32 num_bits)
Consumes num_bits bits from the bitstream of frwd_struct32.
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static void frwd_read(frwd_struct32 *msp)
Reads and unstuffs 32 bits from a forward-growing bitstream.
static void rev_read(rev_struct *vlcp)
Reads and unstuffs data from a backwardly-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored, the MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initiates the rev_struct structure and reads a few bytes to move the read address to a multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
bool ojph_decode_codeblock_avx2(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
static ui32 frwd_fetch(frwd_struct32 *msp)
Fetches 32 bits from the frwd_struct32 bitstream.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_init(frwd_struct32 *msp, const ui8 *data, int size)
Initializes the frwd_struct32 struct and reads some bytes.
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
static ui32 count_leading_zeros(ui32 val)
Counts the number of leading zero bits in val.
struct dec_mel_st
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
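Based on these fields and the mel_get_run fragment in the listing, the run hand-off can be sketched roughly as follows (paraphrased, not the verbatim source):

static int mel_get_run_sketch(dec_mel_st *melp) {
  if (melp->num_runs == 0)           // nothing buffered: decode more codewords
    mel_decode(melp);
  int t = (int)(melp->runs & 0x7F);  // each stored run occupies 7 bits
  melp->runs >>= 7;                  // drop it from the buffer
  melp->num_runs--;
  return t;
}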
struct rev_struct
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data
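Putting the fields above together, a single backward read step looks roughly like this (a sketch distilled from rev_read_mrp in the listing; the helper name is illustrative):

static void rev_read_one_byte(rev_struct *p) {
  ui32 d = (p->size-- > 0) ? *p->data-- : 0;  // feed zeros once the segment is exhausted
  // one stuffed bit is dropped when the previous byte was > 0x8F and the
  // current byte's low 7 bits are all ones
  ui32 d_bits = 8 - ((p->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
  p->tmp  |= (ui64)d << p->bits;              // append above the bits already held
  p->bits += d_bits;
  p->unstuff = d > 0x8F;                      // remember for the next byte
}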