OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_decoder_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8// Copyright (c) 2024, Intel Corporation
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//***************************************************************************/
33// This file is part of the OpenJPH software implementation.
34// File: ojph_block_decoder_avx2.cpp
35//***************************************************************************/
36
37//***************************************************************************/
41
42#include "ojph_arch.h"
43#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
44
45#include <string>
46#include <iostream>
47
48#include <cassert>
49#include <cstring>
50#include "ojph_block_common.h"
51#include "ojph_block_decoder.h"
52#include "ojph_message.h"
53
54#include <immintrin.h>
55
56namespace ojph {
57 namespace local {
58
59 //************************************************************************/
66 struct dec_mel_st {
67 dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
68 k(0), num_runs(0), runs(0)
69 {}
70 // data decoding machinery
71 ui8* data;
72 ui64 tmp;
73 int bits;
74 int size;
75 bool unstuff;
76 int k;
77
78 // queue of decoded runs
79 int num_runs;
80 ui64 runs;
81 };
82
83 //************************************************************************/
95 static inline
96 void mel_read(dec_mel_st *melp)
97 {
98 if (melp->bits > 32) //there are enough bits in the tmp variable
99 return; // return without reading new data
100
101 ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted
102 if (melp->size > 4) { // if there is data in the MEL segment
103 val = *(ui32*)melp->data; // read 32 bits from MEL data
104 melp->data += 4; // advance pointer
105 melp->size -= 4; // reduce counter
106 }
107 else if (melp->size > 0)
108 { // 4 or less
109 int i = 0;
110 while (melp->size > 1) {
111 ui32 v = *melp->data++; // read one byte at a time
112 ui32 m = ~(0xFFu << i); // mask of location
113 val = (val & m) | (v << i);// put one byte in its correct location
114 --melp->size;
115 i += 8;
116 }
117 // size equal to 1
118 ui32 v = *melp->data++; // the one before the last is different
119 v |= 0xF; // MEL and VLC segments can overlap
120 ui32 m = ~(0xFFu << i);
121 val = (val & m) | (v << i);
122 --melp->size;
123 }
124
125 // next we unstuff them before adding them to the buffer
126 int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
127 // the previously read byte requires
128 // unstuffing
129
130 // data is unstuffed and accumulated in t
131 // bits has the number of bits in t
132 ui32 t = val & 0xFF;
133 bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
134 bits -= unstuff; // there is one less bit in t if unstuffing is needed
135 t = t << (8 - unstuff); // move up to make room for the next byte
136
137 //this is a repeat of the above
138 t |= (val>>8) & 0xFF;
139 unstuff = (((val >> 8) & 0xFF) == 0xFF);
140 bits -= unstuff;
141 t = t << (8 - unstuff);
142
143 t |= (val>>16) & 0xFF;
144 unstuff = (((val >> 16) & 0xFF) == 0xFF);
145 bits -= unstuff;
146 t = t << (8 - unstuff);
147
148 t |= (val>>24) & 0xFF;
149 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
150
151 // move t to tmp, and push the result all the way up, so we read from
152 // the MSB
153 melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
154 melp->bits += bits; //increment the number of bits in tmp
155 }
156
157 //************************************************************************/
172 static inline
173 void mel_decode(dec_mel_st *melp)
174 {
175 static const int mel_exp[13] = { //MEL exponents
176 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
177 };
178
179 if (melp->bits < 6) // if there are less than 6 bits in tmp
180 mel_read(melp); // then read from the MEL bitstream
181 // 6 bits is the largest decodable MEL cwd
182
183 //repeat so long that there is enough decodable bits in tmp,
184 // and the runs store is not full (num_runs < 8)
185 while (melp->bits >= 6 && melp->num_runs < 8)
186 {
187 int eval = mel_exp[melp->k]; // number of bits associated with state
188 int run = 0;
189 if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
190 { //one is found
191 run = 1 << eval;
192 run--; // consecutive runs of 0 events - 1
193 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
194 melp->tmp <<= 1; // consume one bit from tmp
195 melp->bits -= 1;
196 run = run << 1; // a stretch of zeros not terminating in one
197 }
198 else
199 { //0 is found
200 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
201 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
202 melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
203 melp->bits -= eval + 1;
204 run = (run << 1) + 1; // a stretch of zeros terminating with one
205 }
206 eval = melp->num_runs * 7; // 7 bits per run
207 melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
208 melp->runs |= ((ui64)run) << eval; // store the value in runs
209 melp->num_runs++; // increment count
210 }
211 }
212
213 //************************************************************************/
223 static inline
224 void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
225 {
226 melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
227 melp->bits = 0; // 0 bits in tmp
228 melp->tmp = 0; //
229 melp->unstuff = false; // no unstuffing
230 melp->size = scup - 1; // size is the length of MEL+VLC-1
231 melp->k = 0; // 0 for state
232 melp->num_runs = 0; // num_runs is 0
233 melp->runs = 0; //
234
235 //This code is borrowed; original is for a different architecture
236 //These few lines take care of the case where data is not at a multiple
237 // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment
238 int num = 4 - (int)(intptr_t(melp->data) & 0x3);
239 for (int i = 0; i < num; ++i) { // this code is similar to mel_read
240 assert(melp->unstuff == false || melp->data[0] <= 0x8F);
241 ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
242 //set data to 0xFF
243 if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
244 // see the standard
245 melp->data += melp->size-- > 0; //increment if the end is not reached
246 int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
247 melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
248 melp->bits += d_bits; //increment tmp by number of bits
249 melp->unstuff = ((d & 0xFF) == 0xFF); //true of next byte needs
250 //unstuffing
251 }
252 melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
253 // is the MSB
254 }
255
256 //************************************************************************/
262 static inline
263 int mel_get_run(dec_mel_st *melp)
264 {
265 if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
266 mel_decode(melp);
267
268 int t = melp->runs & 0x7F; //retrieve one run
269 melp->runs >>= 7; // remove the retrieved run
270 melp->num_runs--;
271 return t; // return run
272 }
273
274 //************************************************************************/
278 struct rev_struct {
279 rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
280 {}
281 //storage
282 ui8* data;
283 ui64 tmp;
284 ui32 bits;
285 int size;
286 bool unstuff;
288 };
289
290 //************************************************************************/
310 static inline
311 void rev_read(rev_struct *vlcp)
312 {
313 //process 4 bytes at a time
314 if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
315 return; // reading 32 bits can overflow vlcp->tmp
316 ui32 val = 0;
317 //the next line (the if statement) needs to be tested first
318 if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
319 {
320 // (vlcp->data - 3) move pointer back to read 32 bits at once
321 val = *(ui32*)(vlcp->data - 3); // then read 32 bits
322 vlcp->data -= 4; // move data pointer back by 4
323 vlcp->size -= 4; // reduce available byte by 4
324 }
325 else if (vlcp->size > 0)
326 { // 4 or less
327 int i = 24;
328 while (vlcp->size > 0) {
329 ui32 v = *vlcp->data--; // read one byte at a time
330 val |= (v << i); // put byte in its correct location
331 --vlcp->size;
332 i -= 8;
333 }
334 }
335
336 __m128i tmp_vec = _mm_set1_epi32((int32_t)val);
337 tmp_vec = _mm_srlv_epi32(tmp_vec, _mm_setr_epi32(24, 16, 8, 0));
338 tmp_vec = _mm_and_si128(tmp_vec, _mm_set1_epi32(0xff));
339
340 __m128i unstuff_vec = _mm_cmpgt_epi32(tmp_vec, _mm_set1_epi32(0x8F));
341 bool unstuff_next = _mm_extract_epi32(unstuff_vec, 3);
342 unstuff_vec = _mm_slli_si128(unstuff_vec, 4);
343 unstuff_vec = _mm_insert_epi32(unstuff_vec, vlcp->unstuff * 0xffffffff, 0);
344
345 __m128i val_7f = _mm_set1_epi32(0x7F);
346 __m128i this_byte_7f = _mm_cmpeq_epi32(_mm_and_si128(tmp_vec, val_7f), val_7f);
347 unstuff_vec = _mm_and_si128(unstuff_vec, this_byte_7f);
348 unstuff_vec = _mm_srli_epi32(unstuff_vec, 31);
349
350 __m128i inc_sum = _mm_sub_epi32(_mm_set1_epi32(8), unstuff_vec);
351 inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
352 inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
353 ui32 total_bits = (ui32)_mm_extract_epi32(inc_sum, 3);
354
355 __m128i final_shift = _mm_slli_si128(inc_sum, 4);
356 tmp_vec = _mm_sllv_epi32(tmp_vec, final_shift);
357 tmp_vec = _mm_or_si128(tmp_vec, _mm_bsrli_si128(tmp_vec, 8));
358
359 ui64 tmp = (ui32)_mm_cvtsi128_si32(tmp_vec) | (ui32)_mm_extract_epi32(tmp_vec, 1);
360
361 vlcp->unstuff = unstuff_next;
362 vlcp->tmp |= tmp << vlcp->bits;
363 vlcp->bits += total_bits;
364 }
365
366 //************************************************************************/
380 static inline
381 void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
382 {
383 //first byte has only the upper 4 bits
384 vlcp->data = data + lcup - 2;
385
386 //size can not be larger than this, in fact it should be smaller
387 vlcp->size = scup - 2;
388
389 ui32 d = *vlcp->data--; // read one byte (this is a half byte)
390 vlcp->tmp = d >> 4; // both initialize and set
391 vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
392 vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte
393
394 //This code is designed for an architecture that read address should
395 // align to the read size (address multiple of 4 if read size is 4)
396 //These few lines take care of the case where data is not at a multiple
397 // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
398 // To read 32 bits, read from (vlcp->data - 3)
399 int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
400 int tnum = num < vlcp->size ? num : vlcp->size;
401 for (int i = 0; i < tnum; ++i) {
402 ui64 d;
403 d = *vlcp->data--; // read one byte and move read pointer
404 //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
405 ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
406 vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
407 vlcp->bits += d_bits;
408 vlcp->unstuff = d > 0x8F; // for next byte
409 }
410 vlcp->size -= tnum;
411 rev_read(vlcp); // read another 32 buts
412 }
413
414 //************************************************************************/
421 static inline
423 {
424 if (vlcp->bits < 32) // if there are less then 32 bits, read more
425 {
426 rev_read(vlcp); // read 32 bits, but unstuffing might reduce this
427 if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
428 rev_read(vlcp); // read another 32
429 }
430 return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
431 }
432
433 //************************************************************************/
439 static inline
440 ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
441 {
442 assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
443 vlcp->tmp >>= num_bits; // remove bits
444 vlcp->bits -= num_bits; // decrement the number of bits
445 return (ui32)vlcp->tmp;
446 }
447
448 //************************************************************************/
459 static inline
460 void rev_read_mrp(rev_struct *mrp)
461 {
462 //process 4 bytes at a time
463 if (mrp->bits > 32)
464 return;
465 ui32 val = 0;
466 if (mrp->size > 3) // If there are 3 byte or more
467 { // (mrp->data - 3) move pointer back to read 32 bits at once
468 val = *(ui32*)(mrp->data - 3); // read 32 bits
469 mrp->data -= 4; // move back pointer
470 mrp->size -= 4; // reduce count
471 }
472 else if (mrp->size > 0)
473 {
474 int i = 24;
475 while (mrp->size > 0) {
476 ui32 v = *mrp->data--; // read one byte at a time
477 val |= (v << i); // put byte in its correct location
478 --mrp->size;
479 i -= 8;
480 }
481 }
482
483 //accumulate in tmp, and keep count in bits
484 ui32 bits, tmp = val >> 24;
485
486 //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
487 bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
488 bool unstuff = (val >> 24) > 0x8F;
489
490 //process the next byte
491 tmp |= ((val >> 16) & 0xFF) << bits;
492 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
493 unstuff = ((val >> 16) & 0xFF) > 0x8F;
494
495 tmp |= ((val >> 8) & 0xFF) << bits;
496 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
497 unstuff = ((val >> 8) & 0xFF) > 0x8F;
498
499 tmp |= (val & 0xFF) << bits;
500 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
501 unstuff = (val & 0xFF) > 0x8F;
502
503 mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
504 mrp->bits += bits;
505 mrp->unstuff = unstuff; // next byte
506 }
507
508 //************************************************************************/
523 static inline
524 void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
525 {
526 mrp->data = data + lcup + len2 - 1;
527 mrp->size = len2;
528 mrp->unstuff = true;
529 mrp->bits = 0;
530 mrp->tmp = 0;
531
532 //This code is designed for an architecture that read address should
533 // align to the read size (address multiple of 4 if read size is 4)
534 //These few lines take care of the case where data is not at a multiple
535 // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream
536 int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
537 for (int i = 0; i < num; ++i) {
538 ui64 d;
539 //read a byte, 0 if no more data
540 d = (mrp->size-- > 0) ? *mrp->data-- : 0;
541 //check if unstuffing is needed
542 ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
543 mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp
544 mrp->bits += d_bits;
545 mrp->unstuff = d > 0x8F; // for next byte
546 }
547 rev_read_mrp(mrp);
548 }
549
550 //************************************************************************/
557 static inline
559 {
560 if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
561 {
562 rev_read_mrp(mrp); // read 30-32 bits from mrp
563 if (mrp->bits < 32) // if there is a space of 32 bits
564 rev_read_mrp(mrp); // read more
565 }
566 return (ui32)mrp->tmp; // return the head of mrp->tmp
567 }
568
569 //************************************************************************/
575 inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
576 {
577 assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
578 mrp->tmp >>= num_bits; // discard the lowest num_bits bits
579 mrp->bits -= num_bits;
580 return (ui32)mrp->tmp; // return data after consumption
581 }
582
583 //************************************************************************/
587 struct frwd_struct_avx2 {
588 const ui8* data;
589 ui8 tmp[48];
590 ui32 bits;
591 ui32 unstuff;
592 int size;
593 };
594
595 //************************************************************************/
613 template<int X>
614 static inline
615 void frwd_read(frwd_struct_avx2 *msp)
616 {
617 assert(msp->bits <= 128);
618
619 __m128i offset, val, validity, all_xff;
620 val = _mm_loadu_si128((__m128i*)msp->data);
621 int bytes = msp->size >= 16 ? 16 : msp->size;
622 validity = _mm_set1_epi8((char)bytes);
623 msp->data += bytes;
624 msp->size -= bytes;
625 int bits = 128;
626 offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100);
627 validity = _mm_cmpgt_epi8(validity, offset);
628 all_xff = _mm_set1_epi8(-1);
629 if (X == 0xFF) // the compiler should remove this if statement
630 {
631 __m128i t = _mm_xor_si128(validity, all_xff); // complement
632 val = _mm_or_si128(t, val); // fill with 0xFF
633 }
634 else if (X == 0)
635 val = _mm_and_si128(validity, val); // fill with zeros
636 else
637 assert(0);
638
639 __m128i ff_bytes;
640 ff_bytes = _mm_cmpeq_epi8(val, all_xff);
641 ff_bytes = _mm_and_si128(ff_bytes, validity);
642 ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes);
643 flags <<= 1; // unstuff following byte
644 ui32 next_unstuff = flags >> 16;
645 flags |= msp->unstuff;
646 flags &= 0xFFFF;
647 while (flags)
648 { // bit unstuffing occurs on average once every 256 bytes
649 // therefore it is not an issue if it is a bit slow
650 // here we process 16 bytes
651 --bits; // consuming one stuffing bit
652
653 ui32 loc = 31 - count_leading_zeros(flags);
654 flags ^= 1 << loc;
655
656 __m128i m, t, c;
657 t = _mm_set1_epi8((char)loc);
658 m = _mm_cmpgt_epi8(offset, t);
659
660 t = _mm_and_si128(m, val); // keep bits at locations larger than loc
661 c = _mm_srli_epi64(t, 1); // 1 bits left
662 t = _mm_srli_si128(t, 8); // 8 bytes left
663 t = _mm_slli_epi64(t, 63); // keep the MSB only
664 t = _mm_or_si128(t, c); // combine the above 3 steps
665
666 val = _mm_or_si128(t, _mm_andnot_si128(m, val));
667 }
668
669 // combine with earlier data
670 assert(msp->bits >= 0 && msp->bits <= 128);
671 int cur_bytes = msp->bits >> 3;
672 int cur_bits = msp->bits & 7;
673 __m128i b1, b2;
674 b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits));
675 b2 = _mm_slli_si128(val, 8); // 8 bytes right
676 b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits));
677 b1 = _mm_or_si128(b1, b2);
678 b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes));
679 b2 = _mm_or_si128(b1, b2);
680 _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2);
681
682 int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits;
683 cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up
684 int upper = _mm_extract_epi16(val, 7);
685 upper >>= consumed_bits - 128 + 16;
686 msp->tmp[cur_bytes] = (ui8)upper; // copy byte
687
688 msp->bits += (ui32)bits;
689 msp->unstuff = next_unstuff; // next unstuff
690 assert(msp->unstuff == 0 || msp->unstuff == 1);
691 }
692
693 //************************************************************************/
702 template<int X>
703 static inline
704 void frwd_init(frwd_struct_avx2 *msp, const ui8* data, int size)
705 {
706 msp->data = data;
707 _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
708 _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
709 _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());
710
711 msp->bits = 0;
712 msp->unstuff = 0;
713 msp->size = size;
714
715 frwd_read<X>(msp); // read 128 bits more
716 }
717
718 //************************************************************************/
724 static inline
725 void frwd_advance(frwd_struct_avx2 *msp, ui32 num_bits)
726 {
727 assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
728 msp->bits -= num_bits;
729
730 __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
731 num_bits &= 63;
732
733 __m128i v0, v1, c0, c1, t;
734 v0 = _mm_loadu_si128(p);
735 v1 = _mm_loadu_si128(p + 1);
736
737 // shift right by num_bits
738 c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
739 t = _mm_srli_si128(v0, 8);
740 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
741 c0 = _mm_or_si128(c0, t);
742 t = _mm_slli_si128(v1, 8);
743 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
744 c0 = _mm_or_si128(c0, t);
745
746 _mm_storeu_si128((__m128i*)msp->tmp, c0);
747
748 c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
749 t = _mm_srli_si128(v1, 8);
750 t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
751 c1 = _mm_or_si128(c1, t);
752
753 _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
754 }
755
756 //************************************************************************/
763 template<int X>
764 static inline
765 __m128i frwd_fetch(frwd_struct_avx2 *msp)
766 {
767 if (msp->bits <= 128)
768 {
769 frwd_read<X>(msp);
770 if (msp->bits <= 128) //need to test
771 frwd_read<X>(msp);
772 }
773 __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
774 return t;
775 }
776
777 //************************************************************************/
787 static inline __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q, frwd_struct_avx2* magsgn, ui32 p, __m128i& vn) {
788 __m256i row = _mm256_setzero_si256();
789
790 // we keeps e_k, e_1, and rho in w2
791 __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110));
792 __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
793
794 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
795 {
796 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));
797
798 // U_q holds U_q for this quad
799 // flags has e_k, e_1, and rho such that e_k is sitting in the
800 // 0x8000, e_1 in 0x800, and rho in 0x80
801
802 // next e_k and m_n
803 __m256i m_n;
804 __m256i w0 = _mm256_srli_epi32(flags, 15); // e_k
805 m_n = _mm256_sub_epi32(U_q, w0);
806 m_n = _mm256_andnot_si256(insig, m_n);
807
808 // find cumulative sums
809 // to find at which bit in ms_vec the sample starts
810 __m256i inc_sum = m_n; // inclusive scan
811 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
812 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
813 int total_mn1 = _mm256_extract_epi16(inc_sum, 6);
814 int total_mn2 = _mm256_extract_epi16(inc_sum, 14);
815
816 __m128i ms_vec0 = _mm_setzero_si128();
817 __m128i ms_vec1 = _mm_setzero_si128();
818 if (total_mn1) {
819 ms_vec0 = frwd_fetch<0xFF>(magsgn);
820 frwd_advance(magsgn, (ui32)total_mn1);
821 }
822 if (total_mn2) {
823 ms_vec1 = frwd_fetch<0xFF>(magsgn);
824 frwd_advance(magsgn, (ui32)total_mn2);
825 }
826
827 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
828
829 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4); // exclusive scan
830
831 // find the starting byte and starting bit
832 __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
833 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
834 byte_idx = _mm256_shuffle_epi8(byte_idx,
835 _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
836 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
837 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
838 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
839 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
840
841 // shift samples values to correct location
842 bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));
843
844 __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1);
845 __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);
846
847 __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
848 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
849 d0 = _mm256_mullo_epi16(d0, bit_shift);
850 d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
851 d1 = _mm256_mullo_epi16(d1, bit_shift);
852 d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00)); // 8 in MSB
853 d0 = _mm256_or_si256(d0, d1);
854
855 // find location of e_k and mask
856 __m256i shift;
857 __m256i ones = _mm256_set1_epi32(1);
858 __m256i twos = _mm256_set1_epi32(2);
859 __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
860 U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
861 U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0);
862 w0 = _mm256_sub_epi32(twos, w0);
863 shift = _mm256_sllv_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31
864 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));
865
866 // next e_1
867 w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
868 w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
869 w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position
870 ms_vec = _mm256_or_si256(ms_vec, w0); // e_1
871 w0 = _mm256_slli_epi32(ms_vec, 31); // sign
872 ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
873 __m256i tvn = ms_vec;
874 ms_vec = _mm256_add_epi32(ms_vec, twos);// + 2
875 ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1);
876 ms_vec = _mm256_or_si256(ms_vec, w0); // sign
877 row = _mm256_andnot_si256(insig, ms_vec); // significant only
878
879 ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
880
881 tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504));
882
883 vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
884 vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
885 }
886 return row;
887 }
888
889
890 //************************************************************************/
900
901 static inline __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct_avx2* magsgn, ui32 p, __m128i& vn) {
902
903 __m256i w0; // workers
904 __m256i insig; // lanes hold FF's if samples are insignificant
905 __m256i flags; // lanes hold e_k, e_1, and rho
906
907 __m256i row = _mm256_setzero_si256();
908 __m128i ddd = _mm_shuffle_epi8(inf_u_q,
909 _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x908, 0x0504, 0x0504, 0x0100, 0x0100));
910 w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
911 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
912 // we keeps e_k, e_1, and rho in w2
913 flags = _mm256_and_si256(w0,
914 _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
915 (si16)0x8880, 0x4440, 0x2220, 0x1110,
916 (si16)0x8880, 0x4440, 0x2220, 0x1110,
917 (si16)0x8880, 0x4440, 0x2220, 0x1110));
918 insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
919 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
920 {
921 ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
922 __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
923 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
924 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
925
926 // U_q holds U_q for this quad
927 // flags has e_k, e_1, and rho such that e_k is sitting in the
928 // 0x8000, e_1 in 0x800, and rho in 0x80
929
930 // next e_k and m_n
931 __m256i m_n;
932 w0 = _mm256_srli_epi16(flags, 15); // e_k
933 m_n = _mm256_sub_epi16(U_q_avx, w0);
934 m_n = _mm256_andnot_si256(insig, m_n);
935
936 // find cumulative sums
937 // to find at which bit in ms_vec the sample starts
938 __m256i inc_sum = m_n; // inclusive scan
939 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
940 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
941 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
942 int total_mn1 = _mm256_extract_epi16(inc_sum, 7);
943 int total_mn2 = _mm256_extract_epi16(inc_sum, 15);
944 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2); // exclusive scan
945
946 __m128i ms_vec0 = _mm_setzero_si128();
947 __m128i ms_vec1 = _mm_setzero_si128();
948 if (total_mn1) {
949 ms_vec0 = frwd_fetch<0xFF>(magsgn);
950 frwd_advance(magsgn, (ui32)total_mn1);
951 }
952 if (total_mn2) {
953 ms_vec1 = frwd_fetch<0xFF>(magsgn);
954 frwd_advance(magsgn, (ui32)total_mn2);
955 }
956
957 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
958
959 // find the starting byte and starting bit
960 __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
961 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
962 byte_idx = _mm256_shuffle_epi8(byte_idx,
963 _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
964 0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
965 0x0606, 0x0404, 0x0202, 0x0000));
966 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
967 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
968 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
969 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
970
971 // shift samples values to correct location
972 __m256i bit_shift = _mm256_shuffle_epi8(
973 _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
974 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1,
975 1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
976 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
977 d0 = _mm256_mullo_epi16(d0, bit_shift);
978 d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
979 d1 = _mm256_mullo_epi16(d1, bit_shift);
980 d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00)); // 8 in MSB
981 d0 = _mm256_or_si256(d0, d1);
982
983 // find location of e_k and mask
984 __m256i shift, t0, t1, Uq0, Uq1;
985 __m256i ones = _mm256_set1_epi16(1);
986 __m256i twos = _mm256_set1_epi16(2);
987 __m256i U_q_m1 = _mm256_sub_epi32(U_q_avx, ones);
988 Uq0 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
989 Uq1 = _mm256_bsrli_epi128(U_q_m1, 14);
990 w0 = _mm256_sub_epi16(twos, w0);
991 t0 = _mm256_and_si256(w0, _mm256_set_epi64x(0, -1, 0, -1));
992 t1 = _mm256_and_si256(w0, _mm256_set_epi64x(-1, 0, -1, 0));
993 {//no _mm256_sllv_epi16 in avx2
994 __m128i t_0_sse = _mm256_castsi256_si128(t0);
995 t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq0));
996 __m128i t_1_sse = _mm256_extracti128_si256(t0 , 0x1);
997 t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq0, 0x1));
998 t0 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1);
999
1000 t_0_sse = _mm256_castsi256_si128(t1);
1001 t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq1));
1002 t_1_sse = _mm256_extracti128_si256(t1, 0x1);
1003 t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq1, 0x1));
1004 t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1);
1005 }
1006 shift = _mm256_or_si256(t0, t1);
1007 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));
1008
1009 // next e_1
1010 w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
1011 w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
1012 w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position
1013 ms_vec = _mm256_or_si256(ms_vec, w0); // e_1
1014 w0 = _mm256_slli_epi16(ms_vec, 15); // sign
1015 ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
1016 __m256i tvn = ms_vec;
1017 ms_vec = _mm256_add_epi16(ms_vec, twos);// + 2
1018 ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1);
1019 ms_vec = _mm256_or_si256(ms_vec, w0); // sign
1020 row = _mm256_andnot_si256(insig, ms_vec); // significant only
1021
1022 ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
1023
1024 __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
1025 _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
1026 -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
1027 __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
1028 _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
1029 -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
1030 ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);
1031
1032 vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
1033 vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
1034 }
1035 return row;
1036 }
1037
// Per-lane leading-zero count for eight 32-bit integers, using only AVX2
// (no AVX-512 vplzcntd). Technique: https://stackoverflow.com/a/58827596
// Each lane is converted to float; the float exponent field then encodes
// floor(log2(x)), from which the leading-zero count is recovered.
inline __m256i avx2_lzcnt_epi32(__m256i x) {
  // Clear every bit that has a set bit 8 positions above it, keeping at
  // most the 8 most significant set bits of each lane. This stops the
  // int->float conversion below from rounding the value up to the next
  // power of two, which would change the exponent.
  x = _mm256_andnot_si256(_mm256_srli_epi32(x, 8), x);

  x = _mm256_castps_si256(_mm256_cvtepi32_ps(x)); // int -> float, keep bits
  x = _mm256_srli_epi32(x, 23);                   // isolate the exponent
  const __m256i bias = _mm256_set1_epi32(158);    // 127 (bias) + 31
  x = _mm256_subs_epu16(bias, x);                 // lzcnt = 158 - exponent
  x = _mm256_min_epi16(x, _mm256_set1_epi32(32)); // lane value 0 -> 32

  return x;
}
1050
1051 //************************************************************************/
1068 bool ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data,
1069 ui32 missing_msbs, ui32 num_passes,
1070 ui32 lengths1, ui32 lengths2,
1071 ui32 width, ui32 height, ui32 stride,
1072 bool stripe_causal)
1073 {
1074 static bool insufficient_precision = false;
1075 static bool modify_code = false;
1076 static bool truncate_spp_mrp = false;
1077
1078 if (num_passes > 1 && lengths2 == 0)
1079 {
1080 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1081 "one coding pass, but zero length for "
1082 "2nd and potential 3rd pass.");
1083 num_passes = 1;
1084 }
1085
1086 if (num_passes > 3)
1087 {
1088 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1089 "This codeblocks has %d passes.",
1090 num_passes);
1091 return false;
1092 }
1093
1094 if (missing_msbs > 30) // p < 0
1095 {
1096 if (insufficient_precision == false)
1097 {
1098 insufficient_precision = true;
1099 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1100 "codeblock. This message will not be "
1101 "displayed again.");
1102 }
1103 return false;
1104 }
1105 else if (missing_msbs == 30) // p == 0
1106 { // not enough precision to decode and set the bin center to 1
1107 if (modify_code == false) {
1108 modify_code = true;
1109 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1110 "pass. The code can be modified to support "
1111 "this case. This message will not be "
1112 "displayed again.");
1113 }
1114 return false; // 32 bits are not enough to decode this
1115 }
1116 else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1117 {
1118 if (num_passes > 1) {
1119 num_passes = 1;
1120 if (truncate_spp_mrp == false) {
1121 truncate_spp_mrp = true;
1122 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1123 "nor MagRef passes; both will be skipped. "
1124 "This message will not be displayed "
1125 "again.");
1126 }
1127 }
1128 }
1129 ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1130 // There is a way to handle the case of p == 0, but a different path
1131 // is required
1132
1133 if (lengths1 < 2)
1134 {
1135 OJPH_WARN(0x00010006, "Wrong codeblock length.");
1136 return false;
1137 }
1138
1139 // read scup and fix the bytes there
1140 int lcup, scup;
1141 lcup = (int)lengths1; // length of CUP
1142 //scup is the length of MEL + VLC
1143 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1144 if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1145 return false;
1146
1147 // The temporary storage scratch holds two types of data in an
1148 // interleaved fashion. The interleaving allows us to use one
1149 // memory pointer.
1150 // We have one entry for a decoded VLC code, and one entry for UVLC.
1151 // Entries are 16 bits each, corresponding to one quad,
1152 // but since we want to use XMM registers of the SSE family
1153 // of SIMD; we allocated 16 bytes or more per quad row; that is,
1154 // the width is no smaller than 16 bytes (or 8 entries), and the
1155 // height is 512 quads
1156 // Each VLC entry contains, in the following order, starting
1157 // from MSB
1158 // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1159 // Each entry in UVLC contains u_q
1160 // One extra row to handle the case of SPP propagating downwards
1161 // when codeblock width is 4
1162 ui16 scratch[8 * 513] = {0}; // 8+ kB
1163
1164 // We need an extra two entries (one inf and one u_q) beyond
1165 // the last column.
1166 // If the block width is 4 (2 quads), then we use sstr of 8
1167 // (enough for 4 quads). If width is 8 (4 quads) we use
1168 // sstr is 16 (enough for 8 quads). For a width of 16 (8
1169 // quads), we use 24 (enough for 12 quads).
1170 ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1171
1172 assert((stride & 0x3) == 0);
1173
1174 ui32 mmsbp2 = missing_msbs + 2;
1175
1176 // The cleanup pass is decoded in two steps; in step one,
1177 // the VLC and MEL segments are decoded, generating a record that
1178 // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1179 // This information should be sufficient for the next step.
1180 // In step 2, we decode the MagSgn segment.
1181
1182 // step 1 decoding VLC and MEL segments
1183 {
1184 // init structures
1185 dec_mel_st mel;
1186 mel_init(&mel, coded_data, lcup, scup);
1187 rev_struct vlc;
1188 rev_init(&vlc, coded_data, lcup, scup);
1189
1190 int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1191 // data represented as runs of 0 events
1192 // See mel_decode description
1193
1194 ui32 vlc_val;
1195 ui32 c_q = 0;
1196 ui16 *sp = scratch;
1197 //initial quad row
1198 for (ui32 x = 0; x < width; sp += 4)
1199 {
1200 // decode VLC
1202
1203 // first quad
1204 vlc_val = rev_fetch(&vlc);
1205
1206 //decode VLC using the context c_q and the head of VLC bitstream
1207 ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1208
1209 // if context is zero, use one MEL event
1210 if (c_q == 0) //zero context
1211 {
1212 run -= 2; //subtract 2, since events number if multiplied by 2
1213
1214 // Is the run terminated in 1? if so, use decoded VLC code,
1215 // otherwise, discard decoded data, since we will decoded again
1216 // using a different context
1217 t0 = (run == -1) ? t0 : 0;
1218
1219 // is run -1 or -2? this means a run has been consumed
1220 if (run < 0)
1221 run = mel_get_run(&mel); // get another run
1222 }
1223 //run -= (c_q == 0) ? 2 : 0;
1224 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1225 //if (run < 0)
1226 // run = mel_get_run(&mel); // get another run
1227 sp[0] = t0;
1228 x += 2;
1229
1230 // prepare context for the next quad; eqn. 1 in ITU T.814
1231 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1232
1233 //remove data from vlc stream (0 bits are removed if vlc is not used)
1234 vlc_val = rev_advance(&vlc, t0 & 0x7);
1235
1236 //second quad
1237 ui16 t1 = 0;
1238
1239 //decode VLC using the context c_q and the head of VLC bitstream
1240 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1241
1242 // if context is zero, use one MEL event
1243 if (c_q == 0 && x < width) //zero context
1244 {
1245 run -= 2; //subtract 2, since events number if multiplied by 2
1246
1247 // if event is 0, discard decoded t1
1248 t1 = (run == -1) ? t1 : 0;
1249
1250 if (run < 0) // have we consumed all events in a run
1251 run = mel_get_run(&mel); // if yes, then get another run
1252 }
1253 t1 = x < width ? t1 : 0;
1254 //run -= (c_q == 0 && x < width) ? 2 : 0;
1255 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1256 //if (run < 0)
1257 // run = mel_get_run(&mel); // get another run
1258 sp[2] = t1;
1259 x += 2;
1260
1261 //prepare context for the next quad, eqn. 1 in ITU T.814
1262 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1263
1264 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1265 vlc_val = rev_advance(&vlc, t1 & 0x7);
1266
1267 // decode u
1269 // uvlc_mode is made up of u_offset bits from the quad pair
1270 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1271 if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1272 { // the MEL run of events
1273 run -= 2; //subtract 2, since events number if multiplied by 2
1274
1275 uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
1276 // is 0x40
1277
1278 if (run < 0)//if run is consumed (run is -1 or -2), get another run
1279 run = mel_get_run(&mel);
1280 }
1281 //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1282 //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1283 //if (run < 0)
1284 // run = mel_get_run(&mel); // get another run
1285
1286 //decode uvlc_mode to get u for both quads
1287 ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1288 //remove total prefix length
1289 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1290 uvlc_entry >>= 3;
1291 //extract suffixes for quad 0 and 1
1292 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1293 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1294 vlc_val = rev_advance(&vlc, len);
1295 ojph_unused(vlc_val); //static code analysis: unused value
1296 uvlc_entry >>= 4;
1297 // quad 0 length
1298 len = uvlc_entry & 0x7; // quad 0 suffix length
1299 uvlc_entry >>= 3;
1300 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1301 sp[1] = u_q;
1302 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1303 sp[3] = u_q;
1304 }
1305 sp[0] = sp[1] = 0;
1306
1307 //non initial quad rows
1308 for (ui32 y = 2; y < height; y += 2)
1309 {
1310 c_q = 0; // context
1311 ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1312
1313 for (ui32 x = 0; x < width; sp += 4)
1314 {
1315 // decode VLC
1317
1318 // sigma_q (n, ne, nf)
1319 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1320 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1321
1322 // first quad
1323 vlc_val = rev_fetch(&vlc);
1324
1325 //decode VLC using the context c_q and the head of VLC bitstream
1326 ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1327
1328 // if context is zero, use one MEL event
1329 if (c_q == 0) //zero context
1330 {
1331 run -= 2; //subtract 2, since events number is multiplied by 2
1332
1333 // Is the run terminated in 1? if so, use decoded VLC code,
1334 // otherwise, discard decoded data, since we will decoded again
1335 // using a different context
1336 t0 = (run == -1) ? t0 : 0;
1337
1338 // is run -1 or -2? this means a run has been consumed
1339 if (run < 0)
1340 run = mel_get_run(&mel); // get another run
1341 }
1342 //run -= (c_q == 0) ? 2 : 0;
1343 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1344 //if (run < 0)
1345 // run = mel_get_run(&mel); // get another run
1346 sp[0] = t0;
1347 x += 2;
1348
1349 // prepare context for the next quad; eqn. 2 in ITU T.814
1350 // sigma_q (w, sw)
1351 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1352 // sigma_q (nw)
1353 c_q |= sp[0 - (si32)sstr] & 0x80;
1354 // sigma_q (n, ne, nf)
1355 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1356 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1357
1358 //remove data from vlc stream (0 bits are removed if vlc is unused)
1359 vlc_val = rev_advance(&vlc, t0 & 0x7);
1360
1361 //second quad
1362 ui16 t1 = 0;
1363
1364 //decode VLC using the context c_q and the head of VLC bitstream
1365 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1366
1367 // if context is zero, use one MEL event
1368 if (c_q == 0 && x < width) //zero context
1369 {
1370 run -= 2; //subtract 2, since events number if multiplied by 2
1371
1372 // if event is 0, discard decoded t1
1373 t1 = (run == -1) ? t1 : 0;
1374
1375 if (run < 0) // have we consumed all events in a run
1376 run = mel_get_run(&mel); // if yes, then get another run
1377 }
1378 t1 = x < width ? t1 : 0;
1379 //run -= (c_q == 0 && x < width) ? 2 : 0;
1380 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1381 //if (run < 0)
1382 // run = mel_get_run(&mel); // get another run
1383 sp[2] = t1;
1384 x += 2;
1385
1386 // partial c_q, will be completed when we process the next quad
1387 // sigma_q (w, sw)
1388 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1389 // sigma_q (nw)
1390 c_q |= sp[2 - (si32)sstr] & 0x80;
1391
1392 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1393 vlc_val = rev_advance(&vlc, t1 & 0x7);
1394
1395 // decode u
1397 // uvlc_mode is made up of u_offset bits from the quad pair
1398 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1399 ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1400 //remove total prefix length
1401 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1402 uvlc_entry >>= 3;
1403 //extract suffixes for quad 0 and 1
1404 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1405 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1406 vlc_val = rev_advance(&vlc, len);
1407 ojph_unused(vlc_val); //static code analysis: unused value
1408 uvlc_entry >>= 4;
1409 // quad 0 length
1410 len = uvlc_entry & 0x7; // quad 0 suffix length
1411 uvlc_entry >>= 3;
1412 ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1413 sp[1] = u_q;
1414 u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
1415 sp[3] = u_q;
1416 }
1417 sp[0] = sp[1] = 0;
1418 }
1419 }
1420
1421 // step2 we decode magsgn
1422 // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1423 // The 32 bit path decode 16 bits data, for which one would think
1424 // 16 bits are enough, because we want to put in the center of the
1425 // bin.
1426 // If you have mmsbp2 equals 16 bit, and reversible coding, and
1427 // no bitplanes are missing, then we can decoding using the 16 bit
1428 // path, but we are not doing this here.
1429 if (mmsbp2 >= 16)
1430 {
1431 // We allocate a scratch row for storing v_n values.
1432 // We have 512 quads horizontally.
1433 // We may go beyond the last entry by up to 4 entries.
1434 // Here we allocate additional 8 entries.
1435 // There are two rows in this structure, the bottom
1436 // row is used to store processed entries.
1437 const int v_n_size = 512 + 16;
1438 ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
1439
1440 frwd_struct_avx2 magsgn;
1441 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1442
1443 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
1444
1445 {
1446 ui16 *sp = scratch;
1447 ui32 *vp = v_n_scratch;
1448 ui32 *dp = decoded_data;
1449 vp[0] = 2; // for easy calculation of emax
1450
1451 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1452 {
1453 __m128i vn = _mm_set1_epi32(2);
1454
1455 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1456 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1457
1458 __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
1459 __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
1460 if (!_mm256_testz_si256(w, w)) {
1461 return false;
1462 }
1463
1464 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn);
1465 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1466 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1467 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1468
1469 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1470 w0 = _mm_or_si128(w0, vn);
1471 _mm_storeu_si128((__m128i*)vp, w0);
1472 }
1473 }
1474
1475 for (ui32 y = 2; y < height; y += 2)
1476 {
1477 {
1478 // perform 31 - count_leading_zeros(*vp) here
1479 ui32 *vp = v_n_scratch;
1480 ui16* sp = scratch + (y >> 1) * sstr;
1481
1482 const __m256i avx_31 = _mm256_set1_epi32(31);
1483 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1484 const __m256i avx_1 = _mm256_set1_epi32(1);
1485 const __m256i avx_0 = _mm256_setzero_si256();
1486
1487 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
1488 __m256i v = _mm256_loadu_si256((__m256i*)vp);
1489 __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
1490 v = _mm256_or_si256(v, v_p1);
1491 v = avx2_lzcnt_epi32(v);
1492 v = _mm256_sub_epi32(avx_31, v);
1493
1494 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1495 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1496 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1497 gamma = _mm256_and_si256(gamma, w0);
1498 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1499
1500 v = _mm256_andnot_si256(gamma, v);
1501 v = _mm256_max_epi32(v, avx_1);
1502
1503 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1504 v = _mm256_add_epi32(inf_u_q, v);
1505
1506 w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
1507 if (!_mm256_testz_si256(w0, w0)) {
1508 return false;
1509 }
1510
1511 _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
1512 }
1513 }
1514
1515 ui32 *vp = v_n_scratch;
1516 ui16 *sp = scratch + (y >> 1) * sstr;
1517 ui32 *dp = decoded_data + y * stride;
1518 vp[0] = 2; // for easy calculation of emax
1519
1520 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
1521 //process two quads
1522 __m128i vn = _mm_set1_epi32(2);
1523
1524 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1525 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1526
1527 __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
1528 U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1529
1530 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn);
1531 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1532 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1533 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1534
1535 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1536 w0 = _mm_or_si128(w0, vn);
1537 _mm_storeu_si128((__m128i*)vp, w0);
1538 }
1539 }
1540 }
1541 else {
1542
1543 // reduce bitplane by 16 because we now have 16 bits instead of 32
1544 p -= 16;
1545
1546 // We allocate a scratch row for storing v_n values.
1547 // We have 512 quads horizontally.
1548 // We may go beyond the last entry by up to 8 entries.
1549 // Therefore we allocate additional 8 entries.
1550 // There are two rows in this structure, the bottom
1551 // row is used to store processed entries.
1552 const int v_n_size = 512 + 16;
1553 ui16 v_n_scratch[v_n_size] = {0}; // 1+ kB
1554 ui32 v_n_scratch_32[v_n_size] = {0}; // 2+ kB
1555
1556 frwd_struct_avx2 magsgn;
1557 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1558
1559 {
1560 ui16 *sp = scratch;
1561 ui16 *vp = v_n_scratch;
1562 ui32 *dp = decoded_data;
1563 vp[0] = 2; // for easy calculation of emax
1564
1565 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
1567 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1568 __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
1569 __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1570 if (!_mm_testz_si128(w, w)) {
1571 return false;
1572 }
1573
1574 __m128i vn = _mm_set1_epi16(2);
1575 __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn);
1576
1577 w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
1578 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1579
1580 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1581 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1582
1583 _mm256_storeu_si256((__m256i*)dp, w0);
1584 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1585 }
1586 }
1587
1588 for (ui32 y = 2; y < height; y += 2) {
1589 {
1590 // perform 15 - count_leading_zeros(*vp) here
1591 ui16 *vp = v_n_scratch;
1592 ui32 *vp_32 = v_n_scratch_32;
1593
1594 ui16* sp = scratch + (y >> 1) * sstr;
1595 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
1596 const __m256i avx_31 = _mm256_set1_epi32(31);
1597 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1598 const __m256i avx_1 = _mm256_set1_epi32(1);
1599 const __m256i avx_0 = _mm256_setzero_si256();
1600
1601 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
1602 __m128i v = _mm_loadu_si128((__m128i*)vp);
1603 __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
1604 v = _mm_or_si128(v, v_p1);
1605
1606 __m256i v_avx = _mm256_cvtepu16_epi32(v);
1607 v_avx = avx2_lzcnt_epi32(v_avx);
1608 v_avx = _mm256_sub_epi32(avx_31, v_avx);
1609
1610 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1611 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1612 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1613 gamma = _mm256_and_si256(gamma, w0);
1614 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1615
1616 v_avx = _mm256_andnot_si256(gamma, v_avx);
1617 v_avx = _mm256_max_epi32(v_avx, avx_1);
1618
1619 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1620 v_avx = _mm256_add_epi32(inf_u_q, v_avx);
1621
1622 w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
1623 if (!_mm256_testz_si256(w0, w0)) {
1624 return false;
1625 }
1626
1627 _mm256_storeu_si256((__m256i*)vp_32, v_avx);
1628 }
1629 }
1630
1631 ui16 *vp = v_n_scratch;
1632 ui32* vp_32 = v_n_scratch_32;
1633 ui16 *sp = scratch + (y >> 1) * sstr;
1634 ui32 *dp = decoded_data + y * stride;
1635 vp[0] = 2; // for easy calculation of emax
1636
1637 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
1639 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1640 __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);
1641
1642 __m128i vn = _mm_set1_epi16(2);
1643 __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn);
1644
1645 __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
1646 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1647
1648 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1649 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1650
1651 _mm256_storeu_si256((__m256i*)dp, w0);
1652 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1653 }
1654 }
1655
1656 // increase bitplane back by 16 because we need to process 32 bits
1657 p += 16;
1658 }
1659
1660 if (num_passes > 1)
1661 {
1662 // We use scratch again, we can divide it into multiple regions
1663 // sigma holds all the significant samples, and it cannot
1664 // be modified after it is set. it will be used during the
1665 // Magnitude Refinement Pass
1666 ui16* const sigma = scratch;
1667
1668 ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1669 // ui16 contains 4 columns
1670 mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1671
1672 // We re-arrange quad significance, where each 4 consecutive
1673 // bits represent one quad, into column significance, where,
1674 // each 4 consequtive bits represent one column of 4 rows
1675 {
1676 ui32 y;
1677
1678 const __m128i mask_3 = _mm_set1_epi32(0x30);
1679 const __m128i mask_C = _mm_set1_epi32(0xC0);
1680 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1681 for (y = 0; y < height; y += 4)
1682 {
1683 ui16* sp = scratch + (y >> 1) * sstr;
1684 ui16* dp = sigma + (y >> 2) * mstr;
1685 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1686 {
1687 __m128i s0, s1, u3, uC, t0, t1;
1688
1689 s0 = _mm_loadu_si128((__m128i*)(sp));
1690 u3 = _mm_and_si128(s0, mask_3);
1691 u3 = _mm_srli_epi32(u3, 4);
1692 uC = _mm_and_si128(s0, mask_C);
1693 uC = _mm_srli_epi32(uC, 2);
1694 t0 = _mm_or_si128(u3, uC);
1695
1696 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1697 u3 = _mm_and_si128(s1, mask_3);
1698 u3 = _mm_srli_epi32(u3, 2);
1699 uC = _mm_and_si128(s1, mask_C);
1700 t1 = _mm_or_si128(u3, uC);
1701
1702 __m128i r = _mm_or_si128(t0, t1);
1703 r = _mm_shuffle_epi8(r, shuffle_mask);
1704
1705 *(ui32*)dp = (ui32)_mm_extract_epi32(r, 0);
1706 }
1707 dp[0] = 0; // set an extra entry on the right with 0
1708 }
1709 {
1710 // reset one row after the codeblock
1711 ui16* dp = sigma + (y >> 2) * mstr;
1712 __m128i zero = _mm_setzero_si128();
1713 for (ui32 x = 0; x < width; x += 32, dp += 8)
1714 _mm_storeu_si128((__m128i*)dp, zero);
1715 dp[0] = 0; // set an extra entry on the right with 0
1716 }
1717 }
1718
1719 // We perform Significance Propagation Pass here
1720 {
1721 // This stores significance information of the previous
1722 // 4 rows. Significance information in this array includes
1723 // all signicant samples in bitplane p - 1; that is,
1724 // significant samples for bitplane p (discovered during the
1725 // cleanup pass and stored in sigma) and samples that have recently
1726 // became significant (during the SPP) in bitplane p-1.
1727 // We store enough for the widest row, containing 1024 columns,
1728 // which is equivalent to 256 of ui16, since each stores 4 columns.
1729 // We add an extra 8 entries, just in case we need more
1730 ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1731
1732 frwd_struct_avx2 sigprop;
1733 frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1734
1735 for (ui32 y = 0; y < height; y += 4)
1736 {
1737 ui32 pattern = 0xFFFFu; // a pattern needed samples
1738 if (height - y < 4) {
1739 pattern = 0x7777u;
1740 if (height - y < 3) {
1741 pattern = 0x3333u;
1742 if (height - y < 2)
1743 pattern = 0x1111u;
1744 }
1745 }
1746
1747 // prev holds sign. info. for the previous quad, together
1748 // with the rows on top of it and below it.
1749 ui32 prev = 0;
1750 ui16 *prev_sig = prev_row_sig;
1751 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1752 ui32 *dpp = decoded_data + y * stride;
1753 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1754 {
1755 // only rows and columns inside the stripe are included
1756 si32 s = (si32)x + 4 - (si32)width;
1757 s = ojph_max(s, 0);
1758 pattern = pattern >> (s * 4);
1759
1760 // We first find locations that need to be tested (potential
1761 // SPP members); these location will end up in mbr
1762 // In each iteration, we produce 16 bits because cwd can have
1763 // up to 16 bits of significance information, followed by the
1764 // corresponding 16 bits of sign information; therefore, it is
1765 // sufficient to fetch 32 bit data per loop.
1766
1767 // Althougth we are interested in 16 bits only, we load 32 bits.
1768 // For the 16 bits we are producing, we need the next 4 bits --
1769 // We need data for at least 5 columns out of 8.
1770 // Therefore loading 32 bits is easier than loading 16 bits
1771 // twice.
1772 ui32 ps = *(ui32*)prev_sig;
1773 ui32 ns = *(ui32*)(cur_sig + mstr);
1774 ui32 u = (ps & 0x88888888) >> 3; // the row on top
1775 if (!stripe_causal)
1776 u |= (ns & 0x11111111) << 3; // the row below
1777
1778 ui32 cs = *(ui32*)cur_sig;
1779 // vertical integration
1780 ui32 mbr = cs; // this sig. info.
1781 mbr |= (cs & 0x77777777) << 1; //above neighbors
1782 mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1783 mbr |= u;
1784 // horizontal integration
1785 ui32 t = mbr;
1786 mbr |= t << 4; // neighbors on the left
1787 mbr |= t >> 4; // neighbors on the right
1788 mbr |= prev >> 12; // significance of previous group
1789
1790 // remove outside samples, and already significant samples
1791 mbr &= pattern;
1792 mbr &= ~cs;
1793
1794 // find samples that become significant during the SPP
1795 ui32 new_sig = mbr;
1796 if (new_sig)
1797 {
1798 __m128i cwd_vec = frwd_fetch<0>(&sigprop);
1799 ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
1800
1801 ui32 cnt = 0;
1802 ui32 col_mask = 0xFu;
1803 ui32 inv_sig = ~cs & pattern;
1804 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1805 {
1806 if ((col_mask & new_sig) == 0)
1807 continue;
1808
1809 //scan one column
1810 ui32 sample_mask = 0x1111u & col_mask;
1811 if (new_sig & sample_mask)
1812 {
1813 new_sig &= ~sample_mask;
1814 if (cwd & 1)
1815 {
1816 ui32 t = 0x33u << i;
1817 new_sig |= t & inv_sig;
1818 }
1819 cwd >>= 1; ++cnt;
1820 }
1821
1822 sample_mask <<= 1;
1823 if (new_sig & sample_mask)
1824 {
1825 new_sig &= ~sample_mask;
1826 if (cwd & 1)
1827 {
1828 ui32 t = 0x76u << i;
1829 new_sig |= t & inv_sig;
1830 }
1831 cwd >>= 1; ++cnt;
1832 }
1833
1834 sample_mask <<= 1;
1835 if (new_sig & sample_mask)
1836 {
1837 new_sig &= ~sample_mask;
1838 if (cwd & 1)
1839 {
1840 ui32 t = 0xECu << i;
1841 new_sig |= t & inv_sig;
1842 }
1843 cwd >>= 1; ++cnt;
1844 }
1845
1846 sample_mask <<= 1;
1847 if (new_sig & sample_mask)
1848 {
1849 new_sig &= ~sample_mask;
1850 if (cwd & 1)
1851 {
1852 ui32 t = 0xC8u << i;
1853 new_sig |= t & inv_sig;
1854 }
1855 cwd >>= 1; ++cnt;
1856 }
1857 }
1858
1859 if (new_sig)
1860 {
1861 cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
1862
1863 // Spread new_sig, such that each bit is in one byte with a
1864 // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1865 __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1866 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1867 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1868 new_sig_vec = _mm_and_si128(new_sig_vec,
1869 _mm_set1_epi64x((si64)0x8040201008040201));
1870 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1871 _mm_set1_epi64x((si64)0x8040201008040201));
1872
1873 // find cumulative sums
1874 // to find which bit in cwd we should extract
1875 __m128i inc_sum = new_sig_vec; // inclusive scan
1876 inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1
1877 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1878 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1879 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1880 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1881 cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1882 // exclusive scan
1883 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1884
1885 // Spread cwd, such that each bit is in one byte
1886 // with a value of 0 or 1.
1887 cwd_vec = _mm_set1_epi16((si16)cwd);
1888 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1889 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1890 cwd_vec = _mm_and_si128(cwd_vec,
1891 _mm_set1_epi64x((si64)0x8040201008040201));
1892 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1893 _mm_set1_epi64x((si64)0x8040201008040201));
1894 cwd_vec = _mm_abs_epi8(cwd_vec);
1895
1896 // Obtain bit from cwd_vec correspondig to ex_sum
1897 // Basically, collect needed bits from cwd_vec
1898 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1899
1900 // load data and set spp coefficients
1901 __m128i m =
1902 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1903 __m128i val = _mm_set1_epi32(3 << (p - 2));
1904 ui32 *dp = dpp;
1905 for (int c = 0; c < 4; ++ c) {
1906 __m128i s0, s0_ns, s0_val;
1907 // load coefficients
1908 s0 = _mm_load_si128((__m128i*)dp);
1909
1910 // epi32 is -1 only for coefficient that
1911 // are changed during the SPP
1912 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1913 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1914
1915 // obtain sign for coefficients in SPP
1916 s0_val = _mm_shuffle_epi8(v, m);
1917 s0_val = _mm_slli_epi32(s0_val, 31);
1918 s0_val = _mm_or_si128(s0_val, val);
1919 s0_val = _mm_and_si128(s0_val, s0_ns);
1920
1921 // update vector
1922 s0 = _mm_or_si128(s0, s0_val);
1923 // store coefficients
1924 _mm_store_si128((__m128i*)dp, s0);
1925 // prepare for next row
1926 dp += stride;
1927 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1928 }
1929 }
1930 frwd_advance(&sigprop, cnt);
1931 }
1932
1933 new_sig |= cs;
1934 *prev_sig = (ui16)(new_sig);
1935
1936 // vertical integration for the new sig. info.
1937 t = new_sig;
1938 new_sig |= (t & 0x7777) << 1; //above neighbors
1939 new_sig |= (t & 0xEEEE) >> 1; //below neighbors
1940 // add sig. info. from the row on top and below
1941 prev = new_sig | u;
1942 // we need only the bits in 0xF000
1943 prev &= 0xF000;
1944 }
1945 }
1946 }
1947
1948 // We perform Magnitude Refinement Pass here
1949 if (num_passes > 2)
1950 {
1951 rev_struct magref;
1952 rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
1953
1954 for (ui32 y = 0; y < height; y += 4)
1955 {
1956 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1957 ui32 *dpp = decoded_data + y * stride;
1958 for (ui32 i = 0; i < width; i += 4, dpp += 4)
1959 {
1960 //Process one entry from sigma array at a time
1961 // Each nibble (4 bits) in the sigma array represents 4 rows,
1962 ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
1963 ui16 sig = *cur_sig++; // 16 bit that will be processed now
1964 int total_bits = 0;
1965 if (sig) // if any of the 32 bits are set
1966 {
1967 // We work on 4 rows, with 4 samples each, since
1968 // data is 32 bit (4 bytes)
1969
1970 // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
1971 __m128i sig_vec = _mm_set1_epi16((si16)sig);
1972 sig_vec = _mm_shuffle_epi8(sig_vec,
1973 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1974 sig_vec = _mm_and_si128(sig_vec,
1975 _mm_set1_epi64x((si64)0x8040201008040201));
1976 sig_vec = _mm_cmpeq_epi8(sig_vec,
1977 _mm_set1_epi64x((si64)0x8040201008040201));
1978 sig_vec = _mm_abs_epi8(sig_vec);
1979
1980 // find cumulative sums
1981 // to find which bit in cwd we should extract
1982 __m128i inc_sum = sig_vec; // inclusive scan
1983 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1984 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1985 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1986 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1987 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
1988 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan
1989
1990 // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
1991 // cwd_vec. Then, convert these to a form suitable
1992 // for coefficient modifications; in particular, a value
1993 // of 0 is represented as binary 11, and a value of 1 is
1994 // represented as binary 01
1995 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
1996 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1997 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1998 cwd_vec = _mm_and_si128(cwd_vec,
1999 _mm_set1_epi64x((si64)0x8040201008040201));
2000 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
2001 _mm_set1_epi64x((si64)0x8040201008040201));
2002 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
2003 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
2004 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
2005
2006 // load data and insert the mrp bit
2007 __m128i m =
2008 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
2009 ui32 *dp = dpp;
2010 for (int c = 0; c < 4; ++c) {
2011 __m128i s0, s0_sig, s0_idx, s0_val;
2012 // load coefficients
2013 s0 = _mm_load_si128((__m128i*)dp);
2014 // find significant samples in this row
2015 s0_sig = _mm_shuffle_epi8(sig_vec, m);
2016 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
2017 // get MRP bit index, and MRP pattern
2018 s0_idx = _mm_shuffle_epi8(ex_sum, m);
2019 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
2020 // keep data from significant samples only
2021 s0_val = _mm_andnot_si128(s0_sig, s0_val);
2022 // move mrp bits to correct position, and employ
2023 s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
2024 s0 = _mm_xor_si128(s0, s0_val);
2025 // store coefficients
2026 _mm_store_si128((__m128i*)dp, s0);
2027 // prepare for next row
2028 dp += stride;
2029 m = _mm_add_epi32(m, _mm_set1_epi32(1));
2030 }
2031 }
2032 // consume data according to the number of bits set
2033 rev_advance_mrp(&magref, (ui32)total_bits);
2034 }
2035 }
2036 }
2037 }
2038
2039 return true;
2040 }
2041 }
2042}
2043
2044#endif
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes the rev_struct structure for the MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static void frwd_advance(frwd_struct32 *msp, ui32 num_bits)
Consume num_bits bits from the bitstream of frwd_struct32.
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static void frwd_read(frwd_struct32 *msp)
Reads and unstuffs 32 bits from a forward-growing bitstream.
static void rev_read(rev_struct *vlcp)
Read and unstuff data from a backwardly-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored, the MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initializes the rev_struct structure and reads a few bytes to move the read address to a multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initializes a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
bool ojph_decode_codeblock_avx2(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
static ui32 frwd_fetch(frwd_struct32 *msp)
Fetches 32 bits from the frwd_struct32 bitstream.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_init(frwd_struct32 *msp, const ui8 *data, int size)
Initialize frwd_struct32 struct and reads some bytes.
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
static ui32 count_leading_zeros(ui32 val)
Definition ojph_arch.h:173
int32_t si32
Definition ojph_defs.h:55
int16_t si16
Definition ojph_defs.h:53
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define ojph_unused(x)
Definition ojph_defs.h:78
#define OJPH_WARN(t,...)
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data