OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_decoder_ssse3.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_block_decoder_ssse3.cpp
34// Author: Aous Naman
35// Date: 13 May 2022
36//***************************************************************************/
37
38//***************************************************************************/
42
43#include "ojph_arch.h"
44#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
45
46#include <string>
47#include <iostream>
48
49#include <cassert>
50#include <cstring>
51#include "ojph_block_common.h"
52#include "ojph_block_decoder.h"
53#include "ojph_message.h"
54
55#include <immintrin.h>
56
57namespace ojph {
58 namespace local {
59
60 //************************************************************************/
    /** @brief MEL state structure for reading and decoding the MEL
     *         bitstream of an HT codeblock.
     *
     *  Holds the raw-data reading machinery (pointer, bit buffer, stuffing
     *  state) and a small queue of already-decoded runs.
     */
    struct dec_mel_st {
      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
                     k(0), num_runs(0), runs(0)
      {}
      // data decoding machinery
      ui8* data;    // pointer to the next byte of MEL data to read
      ui64 tmp;     // bit buffer; decoding consumes from the MSB side
      int bits;     // number of valid bits currently in tmp
      int size;     // number of bytes remaining in the MEL segment
      bool unstuff; // true if the next read byte needs bit unstuffing
      int k;        // MEL decoder state (index into the exponent table)

      // queue of decoded runs
      int num_runs; // number of decoded runs stored in `runs` (max 8)
      ui64 runs;    // queue of decoded runs; 7 bits per run, LSB first
    };
83
84 //************************************************************************/
    /** @brief Reads and unstuffs up to 32 bits from the MEL bitstream
     *         into melp->tmp.
     *
     *  Bytes past the end of the segment are fed in as 0xFF, and the byte
     *  at size==1 has its low nibble forced to 0xF because the MEL and VLC
     *  segments may overlap there (see the standard).  A bit is removed
     *  (unstuffed) after every 0xFF byte.
     *
     *  @param [in] melp is a pointer to the MEL state structure
     */
    static inline
    void mel_read(dec_mel_st *melp)
    {
      if (melp->bits > 32)  //there are enough bits in the tmp variable
        return;             // return without reading new data

      ui32 val = 0xFFFFFFFF;      // feed in 0xFF if buffer is exhausted
      if (melp->size > 4) {       // if there is data in the MEL segment
        val = *(ui32*)melp->data; // read 32 bits from MEL data
        melp->data += 4;          // advance pointer
        melp->size -= 4;          // reduce counter
      }
      else if (melp->size > 0)
      { // 4 or less bytes remain; assemble them one byte at a time
        int i = 0;
        while (melp->size > 1) {
          ui32 v = *melp->data++;    // read one byte at a time
          ui32 m = ~(0xFFu << i);    // mask of location
          val = (val & m) | (v << i);// put one byte in its correct location
          --melp->size;
          i += 8;
        }
        // size equal to 1
        ui32 v = *melp->data++;  // the one before the last is different
        v |= 0xF;                // MEL and VLC segments can overlap
        ui32 m = ~(0xFFu << i);
        val = (val & m) | (v << i);
        --melp->size;
      }

      // next we unstuff them before adding them to the buffer
      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
                                     // the previously read byte requires
                                     // unstuffing

      // data is unstuffed and accumulated in t
      // bits has the number of bits in t
      ui32 t = val & 0xFF;
      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
      bits -= unstuff; // there is one less bit in t if unstuffing is needed
      t = t << (8 - unstuff); // move up to make room for the next byte

      //this is a repeat of the above for the second byte
      t |= (val>>8) & 0xFF;
      unstuff = (((val >> 8) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      //third byte
      t |= (val>>16) & 0xFF;
      unstuff = (((val >> 16) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      //fourth byte; its unstuffing decision affects the NEXT read
      t |= (val>>24) & 0xFF;
      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);

      // move t to tmp, and push the result all the way up, so we read from
      // the MSB
      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
      melp->bits += bits; //increment the number of bits in tmp
    }
157
158 //************************************************************************/
    /** @brief Decodes MEL codewords from melp->tmp, producing run lengths
     *         that are pushed into the melp->runs queue (7 bits per run).
     *
     *  Decoding continues while at least 6 bits are available (the longest
     *  decodable MEL codeword) and the queue holds fewer than 8 runs.
     *  The LSB of each stored run flags whether the stretch of zeros is
     *  terminated by a one (1) or not (0).
     *
     *  @param [in] melp is a pointer to the MEL state structure
     */
    static inline
    void mel_decode(dec_mel_st *melp)
    {
      static const int mel_exp[13] = { //MEL exponents
        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
      };

      if (melp->bits < 6) // if there are less than 6 bits in tmp
        mel_read(melp);   // then read from the MEL bitstream
                          // 6 bits is the largest decodable MEL cwd

      //repeat so long that there is enough decodable bits in tmp,
      // and the runs store is not full (num_runs < 8)
      while (melp->bits >= 6 && melp->num_runs < 8)
      {
        int eval = mel_exp[melp->k]; // number of bits associated with state
        int run = 0;
        if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
        { //one is found
          run = 1 << eval;
          run--; // consecutive runs of 0 events - 1
          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
          melp->tmp <<= 1; // consume one bit from tmp
          melp->bits -= 1;
          run = run << 1; // a stretch of zeros not terminating in one
        }
        else
        { //0 is found
          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
          melp->bits -= eval + 1;
          run = (run << 1) + 1; // a stretch of zeros terminating with one
        }
        eval = melp->num_runs * 7;           // 7 bits per run
        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
        melp->runs |= ((ui64)run) << eval;   // store the value in runs
        melp->num_runs++;                    // increment count
      }
    }
213
214 //************************************************************************/
    /** @brief Initializes the MEL decoder state and primes the bit buffer.
     *
     *  @param [in] melp is a pointer to the MEL state structure
     *  @param [in] bbuf is a pointer to the start of the codeblock data
     *  @param [in] lcup is the length of the cleanup pass segment
     *  @param [in] scup is the length of the MEL + VLC segments
     */
    static inline
    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
    {
      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
      melp->bits = 0;                  // 0 bits in tmp
      melp->tmp = 0;                   //
      melp->unstuff = false;           // no unstuffing
      melp->size = scup - 1;           // size is the length of MEL+VLC-1
      melp->k = 0;                     // 0 for state
      melp->num_runs = 0;              // num_runs is 0
      melp->runs = 0;                  //

      //This code is borrowed; original is for a different architecture
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment
      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
        ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
                                     //set data to 0xFF
        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
                                     // see the standard
        melp->data += melp->size-- > 0; //increment if the end is not reached
        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
        melp->bits += d_bits;        //increment tmp by number of bits
        melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs
                                     //unstuffing
      }
      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
                                       // is the MSB
    }
256
257 //************************************************************************/
263 static inline
264 int mel_get_run(dec_mel_st *melp)
265 {
266 if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
267 mel_decode(melp);
268
269 int t = melp->runs & 0x7F; //retrieve one run
270 melp->runs >>= 7; // remove the retrieved run
271 melp->num_runs--;
272 return t; // return run
273 }
274
275 //************************************************************************/
    /** @brief State structure for reading a bitstream backwards
     *         (used for both the VLC and MagRef segments).
     */
    struct rev_struct {
      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
      {}
      //storage
      ui8* data;    // pointer to the next byte to read (moves backwards)
      ui64 tmp;     // bit buffer; bits are consumed from the LSB side
      ui32 bits;    // number of valid bits currently in tmp
      int size;     // number of bytes remaining in the segment
      bool unstuff; // true if the next byte may need bit unstuffing
    };
290
291 //************************************************************************/
    /** @brief Reads and unstuffs up to 32 bits, moving backwards through
     *         the VLC segment, and appends them to vlcp->tmp.
     *
     *  A bit is unstuffed when the previously read byte is > 0x8F and the
     *  current byte's low 7 bits equal 0x7F.
     *
     *  @param [in] vlcp is a pointer to the rev_struct structure
     */
    static inline
    void rev_read(rev_struct *vlcp)
    {
      //process 4 bytes at a time
      if (vlcp->bits > 32)  // if there are more than 32 bits in tmp, then
        return;             // reading 32 bits can overflow vlcp->tmp
      ui32 val = 0;
      //the next line (the if statement) needs to be tested first
      if (vlcp->size > 3)   // if there are more than 3 bytes left in VLC
      {
        // (vlcp->data - 3) move pointer back to read 32 bits at once
        val = *(ui32*)(vlcp->data - 3); // then read 32 bits
        vlcp->data -= 4;                // move data pointer back by 4
        vlcp->size -= 4;                // reduce available byte by 4
      }
      else if (vlcp->size > 0)
      { // 4 or less bytes remain; assemble them one byte at a time
        int i = 24;
        while (vlcp->size > 0) {
          ui32 v = *vlcp->data--; // read one byte at a time
          val |= (v << i);        // put byte in its correct location
          --vlcp->size;
          i -= 8;
        }
      }

      //accumulate in tmp, number of bits in tmp are stored in bits
      ui32 tmp = val >> 24; //start with the MSB byte
      ui32 bits;

      // test unstuff (previous byte is >0x8F), and this byte is 0x7F
      bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
      bool unstuff = (val >> 24) > 0x8F; //this is for the next byte

      tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 16) & 0xFF) > 0x8F;

      tmp |= ((val >> 8) & 0xFF) << bits;
      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 8) & 0xFF) > 0x8F;

      tmp |= (val & 0xFF) << bits;
      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = (val & 0xFF) > 0x8F;

      // now move the read and unstuffed bits into vlcp->tmp
      vlcp->tmp |= (ui64)tmp << vlcp->bits;
      vlcp->bits += bits;
      vlcp->unstuff = unstuff; // this for the next read
    }
362
363 //************************************************************************/
    /** @brief Initializes the backward (VLC) reader and primes the bit
     *         buffer.
     *
     *  The VLC segment is read backwards starting from its last byte, of
     *  which only the upper nibble belongs to the VLC stream.
     *
     *  @param [in] vlcp is a pointer to the rev_struct structure
     *  @param [in] data is a pointer to the start of the codeblock data
     *  @param [in] lcup is the length of the cleanup pass segment
     *  @param [in] scup is the length of the MEL + VLC segments
     */
    static inline
    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
    {
      //first byte has only the upper 4 bits
      vlcp->data = data + lcup - 2;

      //size can not be larger than this, in fact it should be smaller
      vlcp->size = scup - 2;

      ui32 d = *vlcp->data--;   // read one byte (this is a half byte)
      vlcp->tmp = d >> 4;       // both initialize and set
      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
      vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte

      //This code is designed for an architecture that read address should
      // align to the read size (address multiple of 4 if read size is 4)
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
      // To read 32 bits, read from (vlcp->data - 3)
      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
      int tnum = num < vlcp->size ? num : vlcp->size;
      for (int i = 0; i < tnum; ++i) {
        ui64 d;
        d = *vlcp->data--;  // read one byte and move read pointer
        //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
        vlcp->bits += d_bits;
        vlcp->unstuff = d > 0x8F;     // for next byte
      }
      vlcp->size -= tnum;
      rev_read(vlcp);  // read another 32 bits
    }
410
411 //************************************************************************/
418 static inline
420 {
421 if (vlcp->bits < 32) // if there are less then 32 bits, read more
422 {
423 rev_read(vlcp); // read 32 bits, but unstuffing might reduce this
424 if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
425 rev_read(vlcp); // read another 32
426 }
427 return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
428 }
429
430 //************************************************************************/
436 static inline
437 ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
438 {
439 assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
440 vlcp->tmp >>= num_bits; // remove bits
441 vlcp->bits -= num_bits; // decrement the number of bits
442 return (ui32)vlcp->tmp;
443 }
444
445 //************************************************************************/
    /** @brief Reads and unstuffs up to 32 bits, moving backwards through
     *         the MagRef segment, and appends them to mrp->tmp.
     *
     *  Same unstuffing rule as rev_read: a bit is removed when the previous
     *  byte is > 0x8F and this byte's low 7 bits equal 0x7F.
     *
     *  @param [in] mrp is a pointer to the rev_struct structure
     */
    static inline
    void rev_read_mrp(rev_struct *mrp)
    {
      //process 4 bytes at a time
      if (mrp->bits > 32)  // reading 32 more bits could overflow mrp->tmp
        return;
      ui32 val = 0;
      if (mrp->size > 3) // If there are 3 byte or more
      { // (mrp->data - 3) move pointer back to read 32 bits at once
        val = *(ui32*)(mrp->data - 3); // read 32 bits
        mrp->data -= 4;                // move back pointer
        mrp->size -= 4;                // reduce count
      }
      else if (mrp->size > 0)
      { // 4 or less bytes remain; assemble them one byte at a time
        int i = 24;
        while (mrp->size > 0) {
          ui32 v = *mrp->data--; // read one byte at a time
          val |= (v << i);       // put byte in its correct location
          --mrp->size;
          i -= 8;
        }
      }

      //accumulate in tmp, and keep count in bits
      ui32 bits, tmp = val >> 24;

      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
      bool unstuff = (val >> 24) > 0x8F;

      //process the next byte
      tmp |= ((val >> 16) & 0xFF) << bits;
      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 16) & 0xFF) > 0x8F;

      tmp |= ((val >> 8) & 0xFF) << bits;
      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 8) & 0xFF) > 0x8F;

      tmp |= (val & 0xFF) << bits;
      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = (val & 0xFF) > 0x8F;

      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
      mrp->bits += bits;
      mrp->unstuff = unstuff;             // next byte
    }
504
505 //************************************************************************/
    /** @brief Initializes the backward MagRef reader and primes the bit
     *         buffer.
     *
     *  @param [in] mrp is a pointer to the rev_struct structure
     *  @param [in] data is a pointer to the start of the codeblock data
     *  @param [in] lcup is the length of the cleanup pass segment
     *  @param [in] len2 is the length of the MagRef (and SPP) segment
     */
    static inline
    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
    {
      mrp->data = data + lcup + len2 - 1; // last byte of the MagRef segment
      mrp->size = len2;
      mrp->unstuff = true; // the standard requires assuming unstuffing here
      mrp->bits = 0;
      mrp->tmp = 0;

      //This code is designed for an architecture that read address should
      // align to the read size (address multiple of 4 if read size is 4)
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream
      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
      for (int i = 0; i < num; ++i) {
        ui64 d;
        //read a byte, 0 if no more data
        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
        //check if unstuffing is needed
        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
        mrp->bits += d_bits;
        mrp->unstuff = d > 0x8F;    // for next byte
      }
      rev_read_mrp(mrp); // prime the buffer with another read
    }
546
547 //************************************************************************/
554 static inline
556 {
557 if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
558 {
559 rev_read_mrp(mrp); // read 30-32 bits from mrp
560 if (mrp->bits < 32) // if there is a space of 32 bits
561 rev_read_mrp(mrp); // read more
562 }
563 return (ui32)mrp->tmp; // return the head of mrp->tmp
564 }
565
566 //************************************************************************/
572 inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
573 {
574 assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
575 mrp->tmp >>= num_bits; // discard the lowest num_bits bits
576 mrp->bits -= num_bits;
577 return (ui32)mrp->tmp; // return data after consumption
578 }
579
580 //************************************************************************/
    /** @brief State structure for reading a bitstream forwards with SSSE3
     *         (used for the MagSgn and SPP segments).
     */
    struct frwd_struct_ssse3 {
      const ui8* data; // pointer to the next byte to read (moves forwards)
      ui8 tmp[48];     // byte-aligned bit buffer; head is at tmp[0]
      ui32 bits;       // number of valid bits currently in tmp
      ui32 unstuff;    // 1 if the next byte needs unstuffing, else 0
      int size;        // number of bytes remaining in the segment
    };
591
592 //************************************************************************/
    /** @brief Reads 16 bytes forward, unstuffs them with SIMD, and appends
     *         the resulting bits to msp->tmp.
     *
     *  Bytes past the end of the segment are replaced by X (0xFF for the
     *  MagSgn segment, 0 for SPP).  After every 0xFF byte, one bit of the
     *  following byte is removed (unstuffed).
     *
     *  @tparam X fill value for exhausted bytes; must be 0xFF or 0
     *  @param [in] msp is a pointer to the frwd_struct_ssse3 structure
     */
    template<int X>
    static inline
    void frwd_read(frwd_struct_ssse3 *msp)
    {
      assert(msp->bits <= 128);

      __m128i offset, val, validity, all_xff;
      val = _mm_loadu_si128((__m128i*)msp->data);
      int bytes = msp->size >= 16 ? 16 : msp->size;
      validity = _mm_set1_epi8((char)bytes);
      msp->data += bytes;
      msp->size -= bytes;
      int bits = 128;
      offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100);
      // per-byte mask: 0xFF for bytes inside the segment, 0 beyond it
      validity = _mm_cmpgt_epi8(validity, offset);
      all_xff = _mm_set1_epi8(-1);
      if (X == 0xFF) // the compiler should remove this if statement
      {
        __m128i t = _mm_xor_si128(validity, all_xff); // complement
        val = _mm_or_si128(t, val);  // fill with 0xFF
      }
      else if (X == 0)
        val = _mm_and_si128(validity, val); // fill with zeros
      else
        assert(0);

      // build a 16-bit flag word with a 1 after every valid 0xFF byte;
      // each such position requires one stuffed bit to be removed
      __m128i ff_bytes;
      ff_bytes = _mm_cmpeq_epi8(val, all_xff);
      ff_bytes = _mm_and_si128(ff_bytes, validity);
      ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes);
      flags <<= 1; // unstuff following byte
      ui32 next_unstuff = flags >> 16; // carry into the next 16-byte read
      flags |= msp->unstuff;
      flags &= 0xFFFF;
      while (flags)
      { // bit unstuffing occurs on average once every 256 bytes
        // therefore it is not an issue if it is a bit slow
        // here we process 16 bytes
        --bits; // consuming one stuffing bit

        ui32 loc = 31 - count_leading_zeros(flags);
        flags ^= 1 << loc; // remove the handled flag

        __m128i m, t, c;
        t = _mm_set1_epi8((char)loc);
        m = _mm_cmpgt_epi8(offset, t); // mask of bytes above position loc

        t = _mm_and_si128(m, val); // keep bits at locations larger than loc
        // 128-bit right shift by 1 of the masked part, closing the gap
        // left by the removed stuffing bit
        c = _mm_srli_epi64(t, 1);  // shift each 64-bit lane right by 1
        t = _mm_srli_si128(t, 8);  // bring the upper 8 bytes down
        t = _mm_slli_epi64(t, 63); // keep the MSB only
        t = _mm_or_si128(t, c);    // combine the above 3 steps

        val = _mm_or_si128(t, _mm_andnot_si128(m, val));
      }

      // combine with earlier data
      assert(msp->bits >= 0 && msp->bits <= 128);
      int cur_bytes = msp->bits >> 3; // whole bytes already buffered
      int cur_bits = msp->bits & 7;   // leftover bits in the partial byte
      __m128i b1, b2;
      // shift the new 128 bits left by cur_bits across lane boundaries
      b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits));
      b2 = _mm_slli_si128(val, 8); // move lower lane up by 8 bytes
      b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits));
      b1 = _mm_or_si128(b1, b2);
      b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes));
      b2 = _mm_or_si128(b1, b2);   // merge with the partial byte already there
      _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2);

      // the topmost bits that did not fit in the store above go into one
      // extra byte
      int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits;
      cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up
      int upper = _mm_extract_epi16(val, 7);
      upper >>= consumed_bits - 128 + 16;
      msp->tmp[cur_bytes] = (ui8)upper; // copy byte

      msp->bits += (ui32)bits;
      msp->unstuff = next_unstuff; // next unstuff
      assert(msp->unstuff == 0 || msp->unstuff == 1);
    }
689
690 //************************************************************************/
699 template<int X>
700 static inline
701 void frwd_init(frwd_struct_ssse3 *msp, const ui8* data, int size)
702 {
703 msp->data = data;
704 _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
705 _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
706 _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());
707
708 msp->bits = 0;
709 msp->unstuff = 0;
710 msp->size = size;
711
712 frwd_read<X>(msp); // read 128 bits more
713 }
714
715 //************************************************************************/
    /** @brief Consumes num_bits from the head of the forward bit buffer by
     *         shifting the 48-byte buffer down (a 256-bit shift done in two
     *         128-bit halves, with a coarse byte step first).
     *
     *  @param [in] msp is a pointer to the frwd_struct_ssse3 structure
     *  @param [in] num_bits number of bits to consume; 0 < num_bits < 128
     *              and num_bits <= msp->bits
     */
    static inline
    void frwd_advance(frwd_struct_ssse3 *msp, ui32 num_bits)
    {
      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
      msp->bits -= num_bits;

      // coarse step: skip whole 8-byte chunks by offsetting the load pointer
      __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
      num_bits &= 63; // remaining fine shift, 0..63 bits

      __m128i v0, v1, c0, c1, t;
      v0 = _mm_loadu_si128(p);
      v1 = _mm_loadu_si128(p + 1);

      // shift right by num_bits across the 128-bit value, pulling in bits
      // from the following 128 bits
      c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
      t = _mm_srli_si128(v0, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c0 = _mm_or_si128(c0, t);
      t = _mm_slli_si128(v1, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c0 = _mm_or_si128(c0, t);

      _mm_storeu_si128((__m128i*)msp->tmp, c0);

      // shift the upper 128 bits the same way
      c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
      t = _mm_srli_si128(v1, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c1 = _mm_or_si128(c1, t);

      _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
    }
752
753 //************************************************************************/
760 template<int X>
761 static inline
762 __m128i frwd_fetch(frwd_struct_ssse3 *msp)
763 {
764 if (msp->bits <= 128)
765 {
766 frwd_read<X>(msp);
767 if (msp->bits <= 128) //need to test
768 frwd_read<X>(msp);
769 }
770 __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
771 return t;
772 }
773
774 //************************************************************************/
    /** @brief Decodes the magnitude/sign samples of one quad (4 samples)
     *         when U_q may need up to 32-bit sample precision.
     *
     *  @tparam N selects which quad (0 or 1) within inf_u_q/U_q to decode
     *  @param [in]  inf_u_q holds decoded VLC flags (e_k, e_1, rho) per quad
     *  @param [in]  U_q holds U_q values per quad
     *  @param [in]  magsgn forward reader over the MagSgn bitstream
     *  @param [in]  p is the bitplane at which the cleanup pass operates
     *  @param [out] vn accumulates magnitudes for the E (exponent) values
     *  @return one decoded row of 4 samples (sign + magnitude)
     */
    template <int N>
    static inline
    __m128i decode_one_quad32(const __m128i inf_u_q, __m128i U_q,
                              frwd_struct_ssse3* magsgn, ui32 p, __m128i& vn)
    {
      __m128i w0;    // workers
      __m128i insig; // lanes hold FF's if samples are insignificant
      __m128i flags; // lanes hold e_k, e_1, and rho
      __m128i row;   // decoded row

      row = _mm_setzero_si128();
      w0 = _mm_shuffle_epi32(inf_u_q, _MM_SHUFFLE(N, N, N, N));
      // we keep e_k, e_1, and rho in flags; one sample per 32-bit lane
      flags = _mm_and_si128(w0, _mm_set_epi32(0x8880, 0x4440, 0x2220, 0x1110));
      insig = _mm_cmpeq_epi32(flags, _mm_setzero_si128());
      if (_mm_movemask_epi8(insig) != 0xFFFF) //are all insignificant?
      {
        U_q = _mm_shuffle_epi32(U_q, _MM_SHUFFLE(N, N, N, N));
        // align each sample's flags so that they sit at fixed bit positions
        flags = _mm_mullo_epi16(flags, _mm_set_epi16(1,1,2,2,4,4,8,8));
        __m128i ms_vec = frwd_fetch<0xFF>(magsgn);

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in the
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n (number of MagSgn bits per sample)
        __m128i m_n;
        w0 = _mm_srli_epi32(flags, 15); // e_k
        m_n = _mm_sub_epi32(U_q, w0);
        m_n = _mm_andnot_si128(insig, m_n);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        __m128i inc_sum = m_n; // inclusive scan
        inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
        inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
        int total_mn = _mm_extract_epi16(inc_sum, 6);
        __m128i ex_sum = _mm_bslli_si128(inc_sum, 4); // exclusive scan

        // find the starting byte and starting bit
        __m128i byte_idx = _mm_srli_epi32(ex_sum, 3);
        __m128i bit_idx = _mm_and_si128(ex_sum, _mm_set1_epi32(7));
        byte_idx = _mm_shuffle_epi8(byte_idx,
          _mm_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
        byte_idx = _mm_add_epi32(byte_idx, _mm_set1_epi32(0x03020100));
        __m128i d0 = _mm_shuffle_epi8(ms_vec, byte_idx);
        byte_idx = _mm_add_epi32(byte_idx, _mm_set1_epi32(0x01010101));
        __m128i d1 = _mm_shuffle_epi8(ms_vec, byte_idx);

        // shift sample values to correct location; the table lookup maps
        // bit_idx to a multiplier that acts as a left shift
        bit_idx = _mm_or_si128(bit_idx, _mm_slli_epi32(bit_idx, 16));
        __m128i bit_shift = _mm_shuffle_epi8(
          _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
                       1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
        bit_shift = _mm_add_epi16(bit_shift, _mm_set1_epi16(0x0101));
        d0 = _mm_mullo_epi16(d0, bit_shift);
        d0 = _mm_srli_epi16(d0, 8); // we should have 8 bits in the LSB
        d1 = _mm_mullo_epi16(d1, bit_shift);
        d1 = _mm_and_si128(d1, _mm_set1_epi32((si32)0xFF00FF00)); // 8 in MSB
        d0 = _mm_or_si128(d0, d1);

        // find location of e_k and mask
        __m128i shift;
        __m128i ones = _mm_set1_epi32(1);
        __m128i twos = _mm_set1_epi32(2);
        __m128i U_q_m1 = _mm_sub_epi32(U_q, ones);
        U_q_m1 = _mm_and_si128(U_q_m1, _mm_set_epi32(0,0,0,0x1F));
        w0 = _mm_sub_epi32(twos, w0);
        shift = _mm_sll_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31
        ms_vec = _mm_and_si128(d0, _mm_sub_epi32(shift, ones));

        // next e_1
        w0 = _mm_and_si128(flags, _mm_set1_epi32(0x800));
        w0 = _mm_cmpeq_epi32(w0, _mm_setzero_si128());
        w0 = _mm_andnot_si128(w0, shift);  // e_1 in correct position
        ms_vec = _mm_or_si128(ms_vec, w0); // e_1
        w0 = _mm_slli_epi32(ms_vec, 31);   // sign
        ms_vec = _mm_or_si128(ms_vec, ones); // bin center
        __m128i tvn = ms_vec;
        ms_vec = _mm_add_epi32(ms_vec, twos);// + 2
        ms_vec = _mm_slli_epi32(ms_vec, (si32)p - 1);
        ms_vec = _mm_or_si128(ms_vec, w0);     // sign
        row = _mm_andnot_si128(insig, ms_vec); // significant only

        // gather the magnitudes needed for E value computation into vn
        ms_vec = _mm_andnot_si128(insig, tvn); // significant only
        if (N == 0) // the compiler should remove one of the branches
          tvn = _mm_shuffle_epi8(ms_vec,
            _mm_set_epi32(-1, -1, 0x0F0E0D0C, 0x07060504));
        else if (N == 1)
          tvn = _mm_shuffle_epi8(ms_vec,
            _mm_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1));
        else
          assert(0);
        vn = _mm_or_si128(vn, tvn);

        if (total_mn)
          frwd_advance(magsgn, (ui32)total_mn);
      }
      return row;
    }
886
887 //************************************************************************/
    /** @brief Decodes the magnitude/sign samples of two quads (8 samples)
     *         at once when 16-bit sample precision suffices.
     *
     *  @param [in]  inf_u_q holds decoded VLC flags (e_k, e_1, rho) per quad
     *  @param [in]  U_q holds U_q values per quad
     *  @param [in]  magsgn forward reader over the MagSgn bitstream
     *  @param [in]  p is the bitplane at which the cleanup pass operates
     *  @param [out] vn accumulates magnitudes for the E (exponent) values
     *  @return one decoded row of 8 samples (sign + magnitude), 16 bits each
     */
    static inline
    __m128i decode_two_quad16(const __m128i inf_u_q, __m128i U_q,
                              frwd_struct_ssse3* magsgn, ui32 p, __m128i& vn)
    {
      __m128i w0;    // workers
      __m128i insig; // lanes hold FF's if samples are insignificant
      __m128i flags; // lanes hold e_k, e_1, and rho
      __m128i row;   // decoded row

      row = _mm_setzero_si128();
      // broadcast each quad's info word to its 4 sample lanes
      w0 = _mm_shuffle_epi8(inf_u_q,
        _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504,
                      0x0100, 0x0100, 0x0100, 0x0100));
      // we keep e_k, e_1, and rho in flags; one sample per 16-bit lane
      flags = _mm_and_si128(w0,
        _mm_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
                      (si16)0x8880, 0x4440, 0x2220, 0x1110));
      insig = _mm_cmpeq_epi16(flags, _mm_setzero_si128());
      if (_mm_movemask_epi8(insig) != 0xFFFF) //are all insignificant?
      {
        U_q = _mm_shuffle_epi8(U_q,
          _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504,
                        0x0100, 0x0100, 0x0100, 0x0100));
        // align each sample's flags so that they sit at fixed bit positions
        flags = _mm_mullo_epi16(flags, _mm_set_epi16(1,2,4,8,1,2,4,8));
        __m128i ms_vec = frwd_fetch<0xFF>(magsgn);

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in the
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n (number of MagSgn bits per sample)
        __m128i m_n;
        w0 = _mm_srli_epi16(flags, 15); // e_k
        m_n = _mm_sub_epi16(U_q, w0);
        m_n = _mm_andnot_si128(insig, m_n);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        __m128i inc_sum = m_n; // inclusive scan
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 2));
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 4));
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 8));
        int total_mn = _mm_extract_epi16(inc_sum, 7);
        __m128i ex_sum = _mm_bslli_si128(inc_sum, 2); // exclusive scan

        // find the starting byte and starting bit
        __m128i byte_idx = _mm_srli_epi16(ex_sum, 3);
        __m128i bit_idx = _mm_and_si128(ex_sum, _mm_set1_epi16(7));
        byte_idx = _mm_shuffle_epi8(byte_idx,
          _mm_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
                        0x0606, 0x0404, 0x0202, 0x0000));
        byte_idx = _mm_add_epi16(byte_idx, _mm_set1_epi16(0x0100));
        __m128i d0 = _mm_shuffle_epi8(ms_vec, byte_idx);
        byte_idx = _mm_add_epi16(byte_idx, _mm_set1_epi16(0x0101));
        __m128i d1 = _mm_shuffle_epi8(ms_vec, byte_idx);

        // shift sample values to correct location; the table lookup maps
        // bit_idx to a multiplier that acts as a left shift
        __m128i bit_shift = _mm_shuffle_epi8(
          _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
                       1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
        bit_shift = _mm_add_epi16(bit_shift, _mm_set1_epi16(0x0101));
        d0 = _mm_mullo_epi16(d0, bit_shift);
        d0 = _mm_srli_epi16(d0, 8); // we should have 8 bits in the LSB
        d1 = _mm_mullo_epi16(d1, bit_shift);
        d1 = _mm_and_si128(d1, _mm_set1_epi16((si16)0xFF00)); // 8 in MSB
        d0 = _mm_or_si128(d0, d1);

        // find location of e_k and mask; the two quads may have different
        // U_q values, so each half gets its own shift amount
        __m128i shift, t0, t1, Uq0, Uq1;
        __m128i ones = _mm_set1_epi16(1);
        __m128i twos = _mm_set1_epi16(2);
        __m128i U_q_m1 = _mm_sub_epi32(U_q, ones);
        Uq0 = _mm_and_si128(U_q_m1, _mm_set_epi32(0,0,0,0x1F));
        Uq1 = _mm_bsrli_si128(U_q_m1, 14);
        w0 = _mm_sub_epi16(twos, w0);
        t0 = _mm_and_si128(w0, _mm_set_epi64x(0, -1)); // first quad's half
        t1 = _mm_and_si128(w0, _mm_set_epi64x(-1, 0)); // second quad's half
        t0 = _mm_sll_epi16(t0, Uq0);
        t1 = _mm_sll_epi16(t1, Uq1);
        shift = _mm_or_si128(t0, t1);
        ms_vec = _mm_and_si128(d0, _mm_sub_epi16(shift, ones));

        // next e_1
        w0 = _mm_and_si128(flags, _mm_set1_epi16(0x800));
        w0 = _mm_cmpeq_epi16(w0, _mm_setzero_si128());
        w0 = _mm_andnot_si128(w0, shift);  // e_1 in correct position
        ms_vec = _mm_or_si128(ms_vec, w0); // e_1
        w0 = _mm_slli_epi16(ms_vec, 15);   // sign
        ms_vec = _mm_or_si128(ms_vec, ones); // bin center
        __m128i tvn = ms_vec;
        ms_vec = _mm_add_epi16(ms_vec, twos);// + 2
        ms_vec = _mm_slli_epi16(ms_vec, (si32)p - 1);
        ms_vec = _mm_or_si128(ms_vec, w0);     // sign
        row = _mm_andnot_si128(insig, ms_vec); // significant only

        // gather the magnitudes needed for E value computation into vn
        ms_vec = _mm_andnot_si128(insig, tvn); // significant only
        w0 = _mm_shuffle_epi8(ms_vec,
          _mm_set_epi16(-1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
        vn = _mm_or_si128(vn, w0);
        w0 = _mm_shuffle_epi8(ms_vec,
          _mm_set_epi16(-1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
        vn = _mm_or_si128(vn, w0);

        if (total_mn)
          frwd_advance(magsgn, (ui32)total_mn);
      }
      return row;
    }
1005
1006
1007 //************************************************************************/
1024 bool ojph_decode_codeblock_ssse3(ui8* coded_data, ui32* decoded_data,
1025 ui32 missing_msbs, ui32 num_passes,
1026 ui32 lengths1, ui32 lengths2,
1027 ui32 width, ui32 height, ui32 stride,
1028 bool stripe_causal)
1029 {
1030 static bool insufficient_precision = false;
1031 static bool modify_code = false;
1032 static bool truncate_spp_mrp = false;
1033
1034 if (num_passes > 1 && lengths2 == 0)
1035 {
1036 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1037 "one coding pass, but zero length for "
1038 "2nd and potential 3rd pass.");
1039 num_passes = 1;
1040 }
1041
1042 if (num_passes > 3)
1043 {
1044 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1045 "This codeblocks has %d passes.",
1046 num_passes);
1047 return false;
1048 }
1049
1050 if (missing_msbs > 30) // p < 0
1051 {
1052 if (insufficient_precision == false)
1053 {
1054 insufficient_precision = true;
1055 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1056 "codeblock. This message will not be "
1057 "displayed again.");
1058 }
1059 return false;
1060 }
1061 else if (missing_msbs == 30) // p == 0
1062 { // not enough precision to decode and set the bin center to 1
1063 if (modify_code == false) {
1064 modify_code = true;
1065 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1066 "pass. The code can be modified to support "
1067 "this case. This message will not be "
1068 "displayed again.");
1069 }
1070 return false; // 32 bits are not enough to decode this
1071 }
1072 else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1073 {
1074 if (num_passes > 1) {
1075 num_passes = 1;
1076 if (truncate_spp_mrp == false) {
1077 truncate_spp_mrp = true;
1078 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1079 "nor MagRef passes; both will be skipped. "
1080 "This message will not be displayed "
1081 "again.");
1082 }
1083 }
1084 }
1085 ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1086 // There is a way to handle the case of p == 0, but a different path
1087 // is required
1088
1089 if (lengths1 < 2)
1090 {
1091 OJPH_WARN(0x00010006, "Wrong codeblock length.");
1092 return false;
1093 }
1094
1095 // read scup and fix the bytes there
1096 int lcup, scup;
1097 lcup = (int)lengths1; // length of CUP
1098 //scup is the length of MEL + VLC
1099 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1100 if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1101 return false;
1102
1103 // The temporary storage scratch holds two types of data in an
1104 // interleaved fashion. The interleaving allows us to use one
1105 // memory pointer.
1106 // We have one entry for a decoded VLC code, and one entry for UVLC.
1107 // Entries are 16 bits each, corresponding to one quad,
1108 // but since we want to use XMM registers of the SSE family
1109 // of SIMD; we allocated 16 bytes or more per quad row; that is,
1110 // the width is no smaller than 16 bytes (or 8 entries), and the
1111 // height is 512 quads
1112 // Each VLC entry contains, in the following order, starting
1113 // from MSB
1114 // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1115 // Each entry in UVLC contains u_q
1116 // One extra row to handle the case of SPP propagating downwards
1117 // when codeblock width is 4
1118 ui16 scratch[8 * 513] = {0}; // 8+ kB
1119
1120 // We need an extra two entries (one inf and one u_q) beyond
1121 // the last column.
1122 // If the block width is 4 (2 quads), then we use sstr of 8
1123 // (enough for 4 quads). If width is 8 (4 quads) we use
1124 // sstr is 16 (enough for 8 quads). For a width of 16 (8
1125 // quads), we use 24 (enough for 12 quads).
1126 ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1127
1128 assert((stride & 0x3) == 0);
1129
1130 ui32 mmsbp2 = missing_msbs + 2;
1131
1132 // The cleanup pass is decoded in two steps; in step one,
1133 // the VLC and MEL segments are decoded, generating a record that
1134 // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1135 // This information should be sufficient for the next step.
1136 // In step 2, we decode the MagSgn segment.
1137
1138 // step 1 decoding VLC and MEL segments
1139 {
1140 // init structures
1141 dec_mel_st mel;
1142 mel_init(&mel, coded_data, lcup, scup);
1143 rev_struct vlc;
1144 rev_init(&vlc, coded_data, lcup, scup);
1145
1146 int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1147 // data represented as runs of 0 events
1148 // See mel_decode description
1149
1150 ui32 vlc_val;
1151 ui32 c_q = 0;
1152 ui16 *sp = scratch;
1153 //initial quad row
1154 for (ui32 x = 0; x < width; sp += 4)
1155 {
1156 // decode VLC
1158
1159 // first quad
1160 vlc_val = rev_fetch(&vlc);
1161
1162 //decode VLC using the context c_q and the head of VLC bitstream
1163 ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1164
1165 // if context is zero, use one MEL event
1166 if (c_q == 0) //zero context
1167 {
1168 run -= 2; //subtract 2, since events number if multiplied by 2
1169
1170 // Is the run terminated in 1? if so, use decoded VLC code,
1171 // otherwise, discard decoded data, since we will decoded again
1172 // using a different context
1173 t0 = (run == -1) ? t0 : 0;
1174
1175 // is run -1 or -2? this means a run has been consumed
1176 if (run < 0)
1177 run = mel_get_run(&mel); // get another run
1178 }
1179 //run -= (c_q == 0) ? 2 : 0;
1180 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1181 //if (run < 0)
1182 // run = mel_get_run(&mel); // get another run
1183 sp[0] = t0;
1184 x += 2;
1185
1186 // prepare context for the next quad; eqn. 1 in ITU T.814
1187 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1188
1189 //remove data from vlc stream (0 bits are removed if vlc is not used)
1190 vlc_val = rev_advance(&vlc, t0 & 0x7);
1191
1192 //second quad
1193 ui16 t1 = 0;
1194
1195 //decode VLC using the context c_q and the head of VLC bitstream
1196 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1197
1198 // if context is zero, use one MEL event
1199 if (c_q == 0 && x < width) //zero context
1200 {
1201 run -= 2; //subtract 2, since events number if multiplied by 2
1202
1203 // if event is 0, discard decoded t1
1204 t1 = (run == -1) ? t1 : 0;
1205
1206 if (run < 0) // have we consumed all events in a run
1207 run = mel_get_run(&mel); // if yes, then get another run
1208 }
1209 t1 = x < width ? t1 : 0;
1210 //run -= (c_q == 0 && x < width) ? 2 : 0;
1211 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1212 //if (run < 0)
1213 // run = mel_get_run(&mel); // get another run
1214 sp[2] = t1;
1215 x += 2;
1216
1217 //prepare context for the next quad, eqn. 1 in ITU T.814
1218 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1219
1220 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1221 vlc_val = rev_advance(&vlc, t1 & 0x7);
1222
1223 // decode u
1225 // uvlc_mode is made up of u_offset bits from the quad pair
1226 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1227 if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1228 { // the MEL run of events
1229 run -= 2; //subtract 2, since events number if multiplied by 2
1230
1231 uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
1232 // is 0x40
1233
1234 if (run < 0)//if run is consumed (run is -1 or -2), get another run
1235 run = mel_get_run(&mel);
1236 }
1237 //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1238 //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1239 //if (run < 0)
1240 // run = mel_get_run(&mel); // get another run
1241
1242 //decode uvlc_mode to get u for both quads
1243 ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1244 //remove total prefix length
1245 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1246 uvlc_entry >>= 3;
1247 //extract suffixes for quad 0 and 1
1248 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1249 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1250 vlc_val = rev_advance(&vlc, len);
1251 uvlc_entry >>= 4;
1252 // quad 0 length
1253 len = uvlc_entry & 0x7; // quad 0 suffix length
1254 uvlc_entry >>= 3;
1255 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1256 sp[1] = u_q;
1257 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1258 sp[3] = u_q;
1259 }
1260 sp[0] = sp[1] = 0;
1261
1262 //non initial quad rows
1263 for (ui32 y = 2; y < height; y += 2)
1264 {
1265 c_q = 0; // context
1266 ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1267
1268 for (ui32 x = 0; x < width; sp += 4)
1269 {
1270 // decode VLC
1272
1273 // sigma_q (n, ne, nf)
1274 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1275 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1276
1277 // first quad
1278 vlc_val = rev_fetch(&vlc);
1279
1280 //decode VLC using the context c_q and the head of VLC bitstream
1281 ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1282
1283 // if context is zero, use one MEL event
1284 if (c_q == 0) //zero context
1285 {
1286 run -= 2; //subtract 2, since events number is multiplied by 2
1287
1288 // Is the run terminated in 1? if so, use decoded VLC code,
1289 // otherwise, discard decoded data, since we will decoded again
1290 // using a different context
1291 t0 = (run == -1) ? t0 : 0;
1292
1293 // is run -1 or -2? this means a run has been consumed
1294 if (run < 0)
1295 run = mel_get_run(&mel); // get another run
1296 }
1297 //run -= (c_q == 0) ? 2 : 0;
1298 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1299 //if (run < 0)
1300 // run = mel_get_run(&mel); // get another run
1301 sp[0] = t0;
1302 x += 2;
1303
1304 // prepare context for the next quad; eqn. 2 in ITU T.814
1305 // sigma_q (w, sw)
1306 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1307 // sigma_q (nw)
1308 c_q |= sp[0 - (si32)sstr] & 0x80;
1309 // sigma_q (n, ne, nf)
1310 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1311 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1312
1313 //remove data from vlc stream (0 bits are removed if vlc is unused)
1314 vlc_val = rev_advance(&vlc, t0 & 0x7);
1315
1316 //second quad
1317 ui16 t1 = 0;
1318
1319 //decode VLC using the context c_q and the head of VLC bitstream
1320 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1321
1322 // if context is zero, use one MEL event
1323 if (c_q == 0 && x < width) //zero context
1324 {
1325 run -= 2; //subtract 2, since events number if multiplied by 2
1326
1327 // if event is 0, discard decoded t1
1328 t1 = (run == -1) ? t1 : 0;
1329
1330 if (run < 0) // have we consumed all events in a run
1331 run = mel_get_run(&mel); // if yes, then get another run
1332 }
1333 t1 = x < width ? t1 : 0;
1334 //run -= (c_q == 0 && x < width) ? 2 : 0;
1335 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1336 //if (run < 0)
1337 // run = mel_get_run(&mel); // get another run
1338 sp[2] = t1;
1339 x += 2;
1340
1341 // partial c_q, will be completed when we process the next quad
1342 // sigma_q (w, sw)
1343 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1344 // sigma_q (nw)
1345 c_q |= sp[2 - (si32)sstr] & 0x80;
1346
1347 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1348 vlc_val = rev_advance(&vlc, t1 & 0x7);
1349
1350 // decode u
1352 // uvlc_mode is made up of u_offset bits from the quad pair
1353 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1354 ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1355 //remove total prefix length
1356 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1357 uvlc_entry >>= 3;
1358 //extract suffixes for quad 0 and 1
1359 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1360 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1361 vlc_val = rev_advance(&vlc, len);
1362 uvlc_entry >>= 4;
1363 // quad 0 length
1364 len = uvlc_entry & 0x7; // quad 0 suffix length
1365 uvlc_entry >>= 3;
1366 ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1367 sp[1] = u_q;
1368 u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
1369 sp[3] = u_q;
1370 }
1371 sp[0] = sp[1] = 0;
1372 }
1373 }
1374
1375 // step2 we decode magsgn
1376 // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1377 // The 32 bit path decode 16 bits data, for which one would think
1378 // 16 bits are enough, because we want to put in the center of the
1379 // bin.
1380 // If you have mmsbp2 equals 16 bit, and reversible coding, and
1381 // no bitplanes are missing, then we can decoding using the 16 bit
1382 // path, but we are not doing this here.
1383 if (mmsbp2 >= 16)
1384 {
1385 // We allocate a scratch row for storing v_n values.
1386 // We have 512 quads horizontally.
1387 // We may go beyond the last entry by up to 4 entries.
1388 // Here we allocate additional 8 entries.
1389 // There are two rows in this structure, the bottom
1390 // row is used to store processed entries.
1391 const int v_n_size = 512 + 8;
1392 ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
1393
1394 frwd_struct_ssse3 magsgn;
1395 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1396
1397 {
1398 ui16 *sp = scratch;
1399 ui32 *vp = v_n_scratch;
1400 ui32 *dp = decoded_data;
1401 vp[0] = 2; // for easy calculation of emax
1402
1403 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1404 {
1405 //here we process two quads
1406 __m128i w0, w1; // workers
1407 __m128i inf_u_q, U_q;
1408 // determine U_q
1409 {
1410 inf_u_q = _mm_loadu_si128((__m128i*)sp);
1411 U_q = _mm_srli_epi32(inf_u_q, 16);
1412
1413 w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1414 int i = _mm_movemask_epi8(w0);
1415 if (i & 0xFF) // only the lower two U_q
1416 return false;
1417 }
1418
1419 __m128i vn = _mm_set1_epi32(2);
1420 __m128i row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1421 __m128i row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1422 w0 = _mm_loadu_si128((__m128i*)vp);
1423 w0 = _mm_and_si128(w0, _mm_set_epi32(0,0,0,-1));
1424 w0 = _mm_or_si128(w0, vn);
1425 _mm_storeu_si128((__m128i*)vp, w0);
1426
1427 //interleave in ssse3 style
1428 w0 = _mm_unpacklo_epi32(row0, row1);
1429 w1 = _mm_unpackhi_epi32(row0, row1);
1430 row0 = _mm_unpacklo_epi32(w0, w1);
1431 row1 = _mm_unpackhi_epi32(w0, w1);
1432 _mm_store_si128((__m128i*)dp, row0);
1433 _mm_store_si128((__m128i*)(dp + stride), row1);
1434 }
1435 }
1436
1437 for (ui32 y = 2; y < height; y += 2)
1438 {
1439 {
1440 // perform 31 - count_leading_zeros(*vp) here
1441 ui32 *vp = v_n_scratch;
1442 const __m128i lut_lo = _mm_set_epi8(
1443 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 31
1444 );
1445 const __m128i lut_hi = _mm_set_epi8(
1446 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 31
1447 );
1448 const __m128i nibble_mask = _mm_set1_epi8(0x0F);
1449 const __m128i byte_offset8 = _mm_set1_epi16(8);
1450 const __m128i byte_offset16 = _mm_set1_epi16(16);
1451 const __m128i cc = _mm_set1_epi32(31);
1452 for (ui32 x = 0; x <= width; x += 8, vp += 4)
1453 {
1454 __m128i v, t; // workers
1455 v = _mm_loadu_si128((__m128i*)vp);
1456
1457 t = _mm_and_si128(nibble_mask, v);
1458 v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
1459 t = _mm_shuffle_epi8(lut_lo, t);
1460 v = _mm_shuffle_epi8(lut_hi, v);
1461 v = _mm_min_epu8(v, t);
1462
1463 t = _mm_srli_epi16(v, 8);
1464 v = _mm_or_si128(v, byte_offset8);
1465 v = _mm_min_epu8(v, t);
1466
1467 t = _mm_srli_epi32(v, 16);
1468 v = _mm_or_si128(v, byte_offset16);
1469 v = _mm_min_epu8(v, t);
1470
1471 v = _mm_sub_epi16(cc, v);
1472 _mm_storeu_si128((__m128i*)(vp + v_n_size), v);
1473 }
1474 }
1475
1476 ui32 *vp = v_n_scratch;
1477 ui16 *sp = scratch + (y >> 1) * sstr;
1478 ui32 *dp = decoded_data + y * stride;
1479 vp[0] = 2; // for easy calculation of emax
1480
1481 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1482 {
1483 //process two quads
1484 __m128i w0, w1; // workers
1485 __m128i inf_u_q, U_q;
1486 // determine U_q
1487 {
1488 __m128i gamma, emax, kappa, u_q; // needed locally
1489
1490 inf_u_q = _mm_loadu_si128((__m128i*)sp);
1491 gamma = _mm_and_si128(inf_u_q, _mm_set1_epi32(0xF0));
1492 w0 = _mm_sub_epi32(gamma, _mm_set1_epi32(1));
1493 gamma = _mm_and_si128(gamma, w0);
1494 gamma = _mm_cmpeq_epi32(gamma, _mm_setzero_si128());
1495
1496 emax = _mm_loadu_si128((__m128i*)(vp + v_n_size));
1497 w0 = _mm_bsrli_si128(emax, 4);
1498 emax = _mm_max_epi16(w0, emax); // no max_epi32 in ssse3
1499 emax = _mm_andnot_si128(gamma, emax);
1500
1501 kappa = _mm_set1_epi32(1);
1502 kappa = _mm_max_epi16(emax, kappa); // no max_epi32 in ssse3
1503
1504 u_q = _mm_srli_epi32(inf_u_q, 16);
1505 U_q = _mm_add_epi32(u_q, kappa);
1506
1507 w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1508 int i = _mm_movemask_epi8(w0);
1509 if (i & 0xFF) // only the lower two U_q
1510 return false;
1511 }
1512
1513 __m128i vn = _mm_set1_epi32(2);
1514 __m128i row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1515 __m128i row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1516 w0 = _mm_loadu_si128((__m128i*)vp);
1517 w0 = _mm_and_si128(w0, _mm_set_epi32(0,0,0,-1));
1518 w0 = _mm_or_si128(w0, vn);
1519 _mm_storeu_si128((__m128i*)vp, w0);
1520
1521 //interleave in ssse3 style
1522 w0 = _mm_unpacklo_epi32(row0, row1);
1523 w1 = _mm_unpackhi_epi32(row0, row1);
1524 row0 = _mm_unpacklo_epi32(w0, w1);
1525 row1 = _mm_unpackhi_epi32(w0, w1);
1526 _mm_store_si128((__m128i*)dp, row0);
1527 _mm_store_si128((__m128i*)(dp + stride), row1);
1528 }
1529 }
1530 }
1531 else
1532 {
1533 // reduce bitplane by 16 because we now have 16 bits instead of 32
1534 p -= 16;
1535
1536 // We allocate a scratch row for storing v_n values.
1537 // We have 512 quads horizontally.
1538 // We may go beyond the last entry by up to 8 entries.
1539 // Therefore we allocate additional 8 entries.
1540 // There are two rows in this structure, the bottom
1541 // row is used to store processed entries.
1542 const int v_n_size = 512 + 8;
1543 ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB
1544
1545 frwd_struct_ssse3 magsgn;
1546 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1547
1548 {
1549 ui16 *sp = scratch;
1550 ui16 *vp = v_n_scratch;
1551 ui32 *dp = decoded_data;
1552 vp[0] = 2; // for easy calculation of emax
1553
1554 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1555 {
1556 //here we process two quads
1557 __m128i w0, w1; // workers
1558 __m128i inf_u_q, U_q;
1559 // determine U_q
1560 {
1561 inf_u_q = _mm_loadu_si128((__m128i*)sp);
1562 U_q = _mm_srli_epi32(inf_u_q, 16);
1563
1564 w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1565 int i = _mm_movemask_epi8(w0);
1566 if (i & 0xFF) // only the lower two U_q
1567 return false;
1568 }
1569
1570 __m128i vn = _mm_set1_epi16(2);
1571 __m128i row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1572 w0 = _mm_loadu_si128((__m128i*)vp);
1573 w0 = _mm_and_si128(w0, _mm_set_epi16(0,0,0,0,0,0,0,-1));
1574 w0 = _mm_or_si128(w0, vn);
1575 _mm_storeu_si128((__m128i*)vp, w0);
1576
1577 //interleave in ssse3 style
1578 w0 = _mm_shuffle_epi8(row,
1579 _mm_set_epi16(0x0D0C, -1, 0x0908, -1,
1580 0x0504, -1, 0x0100, -1));
1581 _mm_store_si128((__m128i*)dp, w0);
1582 w1 = _mm_shuffle_epi8(row,
1583 _mm_set_epi16(0x0F0E, -1, 0x0B0A, -1,
1584 0x0706, -1, 0x0302, -1));
1585 _mm_store_si128((__m128i*)(dp + stride), w1);
1586 }
1587 }
1588
1589 for (ui32 y = 2; y < height; y += 2)
1590 {
1591 {
1592 // perform 15 - count_leading_zeros(*vp) here
1593 ui16 *vp = v_n_scratch;
1594 const __m128i lut_lo = _mm_set_epi8(
1595 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 15
1596 );
1597 const __m128i lut_hi = _mm_set_epi8(
1598 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 15
1599 );
1600 const __m128i nibble_mask = _mm_set1_epi8(0x0F);
1601 const __m128i byte_offset8 = _mm_set1_epi16(8);
1602 const __m128i cc = _mm_set1_epi16(15);
1603 for (ui32 x = 0; x <= width; x += 16, vp += 8)
1604 {
1605 __m128i v, t; // workers
1606 v = _mm_loadu_si128((__m128i*)vp);
1607
1608 t = _mm_and_si128(nibble_mask, v);
1609 v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
1610 t = _mm_shuffle_epi8(lut_lo, t);
1611 v = _mm_shuffle_epi8(lut_hi, v);
1612 v = _mm_min_epu8(v, t);
1613
1614 t = _mm_srli_epi16(v, 8);
1615 v = _mm_or_si128(v, byte_offset8);
1616 v = _mm_min_epu8(v, t);
1617
1618 v = _mm_sub_epi16(cc, v);
1619 _mm_storeu_si128((__m128i*)(vp + v_n_size), v);
1620 }
1621 }
1622
1623 ui16 *vp = v_n_scratch;
1624 ui16 *sp = scratch + (y >> 1) * sstr;
1625 ui32 *dp = decoded_data + y * stride;
1626 vp[0] = 2; // for easy calculation of emax
1627
1628 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1629 {
1630 //process two quads
1631 __m128i w0, w1; // workers
1632 __m128i inf_u_q, U_q;
1633 // determine U_q
1634 {
1635 __m128i gamma, emax, kappa, u_q; // needed locally
1636
1637 inf_u_q = _mm_loadu_si128((__m128i*)sp);
1638 gamma = _mm_and_si128(inf_u_q, _mm_set1_epi32(0xF0));
1639 w0 = _mm_sub_epi32(gamma, _mm_set1_epi32(1));
1640 gamma = _mm_and_si128(gamma, w0);
1641 gamma = _mm_cmpeq_epi32(gamma, _mm_setzero_si128());
1642
1643 emax = _mm_loadu_si128((__m128i*)(vp + v_n_size));
1644 w0 = _mm_bsrli_si128(emax, 2);
1645 emax = _mm_max_epi16(w0, emax); // no max_epi32 in ssse3
1646 emax = _mm_shuffle_epi8(emax,
1647 _mm_set_epi16(-1, 0x0706, -1, 0x0504,
1648 -1, 0x0302, -1, 0x0100));
1649 emax = _mm_andnot_si128(gamma, emax);
1650
1651 kappa = _mm_set1_epi32(1);
1652 kappa = _mm_max_epi16(emax, kappa); // no max_epi32 in ssse3
1653
1654 u_q = _mm_srli_epi32(inf_u_q, 16);
1655 U_q = _mm_add_epi32(u_q, kappa);
1656
1657 w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1658 int i = _mm_movemask_epi8(w0);
1659 if (i & 0xFF) // only the lower two U_q
1660 return false;
1661 }
1662
1663 __m128i vn = _mm_set1_epi16(2);
1664 __m128i row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1665 w0 = _mm_loadu_si128((__m128i*)vp);
1666 w0 = _mm_and_si128(w0, _mm_set_epi16(0,0,0,0,0,0,0,-1));
1667 w0 = _mm_or_si128(w0, vn);
1668 _mm_storeu_si128((__m128i*)vp, w0);
1669
1670 w0 = _mm_shuffle_epi8(row,
1671 _mm_set_epi16(0x0D0C, -1, 0x0908, -1,
1672 0x0504, -1, 0x0100, -1));
1673 _mm_store_si128((__m128i*)dp, w0);
1674 w1 = _mm_shuffle_epi8(row,
1675 _mm_set_epi16(0x0F0E, -1, 0x0B0A, -1,
1676 0x0706, -1, 0x0302, -1));
1677 _mm_store_si128((__m128i*)(dp + stride), w1);
1678 }
1679 }
1680
1681 // increase bitplane back by 16 because we need to process 32 bits
1682 p += 16;
1683 }
1684
1685 if (num_passes > 1)
1686 {
1687 // We use scratch again, we can divide it into multiple regions
1688 // sigma holds all the significant samples, and it cannot
1689 // be modified after it is set. it will be used during the
1690 // Magnitude Refinement Pass
1691 ui16* const sigma = scratch;
1692
1693 ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1694 // ui16 contains 4 columns
1695 mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1696
1697 // We re-arrange quad significance, where each 4 consecutive
1698 // bits represent one quad, into column significance, where,
1699 // each 4 consequtive bits represent one column of 4 rows
1700 {
1701 ui32 y;
1702
1703 const __m128i mask_3 = _mm_set1_epi32(0x30);
1704 const __m128i mask_C = _mm_set1_epi32(0xC0);
1705 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1706 for (y = 0; y < height; y += 4)
1707 {
1708 ui16* sp = scratch + (y >> 1) * sstr;
1709 ui16* dp = sigma + (y >> 2) * mstr;
1710 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1711 {
1712 __m128i s0, s1, u3, uC, t0, t1;
1713
1714 s0 = _mm_loadu_si128((__m128i*)(sp));
1715 u3 = _mm_and_si128(s0, mask_3);
1716 u3 = _mm_srli_epi32(u3, 4);
1717 uC = _mm_and_si128(s0, mask_C);
1718 uC = _mm_srli_epi32(uC, 2);
1719 t0 = _mm_or_si128(u3, uC);
1720
1721 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1722 u3 = _mm_and_si128(s1, mask_3);
1723 u3 = _mm_srli_epi32(u3, 2);
1724 uC = _mm_and_si128(s1, mask_C);
1725 t1 = _mm_or_si128(u3, uC);
1726
1727 __m128i r = _mm_or_si128(t0, t1);
1728 r = _mm_shuffle_epi8(r, shuffle_mask);
1729
1730 dp[0] = (ui16)_mm_extract_epi16(r, 0);
1731 dp[1] = (ui16)_mm_extract_epi16(r, 1);
1732 }
1733 dp[0] = 0; // set an extra entry on the right with 0
1734 }
1735 {
1736 // reset one row after the codeblock
1737 ui16* dp = sigma + (y >> 2) * mstr;
1738 __m128i zero = _mm_setzero_si128();
1739 for (ui32 x = 0; x < width; x += 32, dp += 8)
1740 _mm_storeu_si128((__m128i*)dp, zero);
1741 dp[0] = 0; // set an extra entry on the right with 0
1742 }
1743 }
1744
1745 // We perform Significance Propagation Pass here
1746 {
1747 // This stores significance information of the previous
1748 // 4 rows. Significance information in this array includes
1749 // all signicant samples in bitplane p - 1; that is,
1750 // significant samples for bitplane p (discovered during the
1751 // cleanup pass and stored in sigma) and samples that have recently
1752 // became significant (during the SPP) in bitplane p-1.
1753 // We store enough for the widest row, containing 1024 columns,
1754 // which is equivalent to 256 of ui16, since each stores 4 columns.
1755 // We add an extra 8 entries, just in case we need more
1756 ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1757
1758 frwd_struct_ssse3 sigprop;
1759 frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1760
1761 for (ui32 y = 0; y < height; y += 4)
1762 {
1763 ui32 pattern = 0xFFFFu; // a pattern needed samples
1764 if (height - y < 4) {
1765 pattern = 0x7777u;
1766 if (height - y < 3) {
1767 pattern = 0x3333u;
1768 if (height - y < 2)
1769 pattern = 0x1111u;
1770 }
1771 }
1772
1773 // prev holds sign. info. for the previous quad, together
1774 // with the rows on top of it and below it.
1775 ui32 prev = 0;
1776 ui16 *prev_sig = prev_row_sig;
1777 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1778 ui32 *dpp = decoded_data + y * stride;
1779 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1780 {
1781 // only rows and columns inside the stripe are included
1782 si32 s = (si32)x + 4 - (si32)width;
1783 s = ojph_max(s, 0);
1784 pattern = pattern >> (s * 4);
1785
1786 // We first find locations that need to be tested (potential
1787 // SPP members); these location will end up in mbr
1788 // In each iteration, we produce 16 bits because cwd can have
1789 // up to 16 bits of significance information, followed by the
1790 // corresponding 16 bits of sign information; therefore, it is
1791 // sufficient to fetch 32 bit data per loop.
1792
1793 // Althougth we are interested in 16 bits only, we load 32 bits.
1794 // For the 16 bits we are producing, we need the next 4 bits --
1795 // We need data for at least 5 columns out of 8.
1796 // Therefore loading 32 bits is easier than loading 16 bits
1797 // twice.
1798 ui32 ps = *(ui32*)prev_sig;
1799 ui32 ns = *(ui32*)(cur_sig + mstr);
1800 ui32 u = (ps & 0x88888888) >> 3; // the row on top
1801 if (!stripe_causal)
1802 u |= (ns & 0x11111111) << 3; // the row below
1803
1804 ui32 cs = *(ui32*)cur_sig;
1805 // vertical integration
1806 ui32 mbr = cs; // this sig. info.
1807 mbr |= (cs & 0x77777777) << 1; //above neighbors
1808 mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1809 mbr |= u;
1810 // horizontal integration
1811 ui32 t = mbr;
1812 mbr |= t << 4; // neighbors on the left
1813 mbr |= t >> 4; // neighbors on the right
1814 mbr |= prev >> 12; // significance of previous group
1815
1816 // remove outside samples, and already significant samples
1817 mbr &= pattern;
1818 mbr &= ~cs;
1819
1820 // find samples that become significant during the SPP
1821 ui32 new_sig = mbr;
1822 if (new_sig)
1823 {
1824 __m128i cwd_vec = frwd_fetch<0>(&sigprop);
1825 ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
1826
1827 ui32 cnt = 0;
1828 ui32 col_mask = 0xFu;
1829 ui32 inv_sig = ~cs & pattern;
1830 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1831 {
1832 if ((col_mask & new_sig) == 0)
1833 continue;
1834
1835 //scan one column
1836 ui32 sample_mask = 0x1111u & col_mask;
1837 if (new_sig & sample_mask)
1838 {
1839 new_sig &= ~sample_mask;
1840 if (cwd & 1)
1841 {
1842 ui32 t = 0x33u << i;
1843 new_sig |= t & inv_sig;
1844 }
1845 cwd >>= 1; ++cnt;
1846 }
1847
1848 sample_mask <<= 1;
1849 if (new_sig & sample_mask)
1850 {
1851 new_sig &= ~sample_mask;
1852 if (cwd & 1)
1853 {
1854 ui32 t = 0x76u << i;
1855 new_sig |= t & inv_sig;
1856 }
1857 cwd >>= 1; ++cnt;
1858 }
1859
1860 sample_mask <<= 1;
1861 if (new_sig & sample_mask)
1862 {
1863 new_sig &= ~sample_mask;
1864 if (cwd & 1)
1865 {
1866 ui32 t = 0xECu << i;
1867 new_sig |= t & inv_sig;
1868 }
1869 cwd >>= 1; ++cnt;
1870 }
1871
1872 sample_mask <<= 1;
1873 if (new_sig & sample_mask)
1874 {
1875 new_sig &= ~sample_mask;
1876 if (cwd & 1)
1877 {
1878 ui32 t = 0xC8u << i;
1879 new_sig |= t & inv_sig;
1880 }
1881 cwd >>= 1; ++cnt;
1882 }
1883 }
1884
1885 if (new_sig)
1886 {
1887 cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
1888
1889 // Spread new_sig, such that each bit is in one byte with a
1890 // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1891 __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1892 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1893 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1894 new_sig_vec = _mm_and_si128(new_sig_vec,
1895 _mm_set1_epi64x((si64)0x8040201008040201));
1896 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1897 _mm_set1_epi64x((si64)0x8040201008040201));
1898
1899 // find cumulative sums
1900 // to find which bit in cwd we should extract
1901 __m128i inc_sum = new_sig_vec; // inclusive scan
1902 inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1
1903 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1904 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1905 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1906 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1907 cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1908 // exclusive scan
1909 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1910
1911 // Spread cwd, such that each bit is in one byte
1912 // with a value of 0 or 1.
1913 cwd_vec = _mm_set1_epi16((si16)cwd);
1914 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1915 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1916 cwd_vec = _mm_and_si128(cwd_vec,
1917 _mm_set1_epi64x((si64)0x8040201008040201));
1918 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1919 _mm_set1_epi64x((si64)0x8040201008040201));
1920 cwd_vec = _mm_abs_epi8(cwd_vec);
1921
1922 // Obtain bit from cwd_vec correspondig to ex_sum
1923 // Basically, collect needed bits from cwd_vec
1924 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1925
1926 // load data and set spp coefficients
1927 __m128i m =
1928 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1929 __m128i val = _mm_set1_epi32(3 << (p - 2));
1930 ui32 *dp = dpp;
1931 for (int c = 0; c < 4; ++ c) {
1932 __m128i s0, s0_ns, s0_val;
1933 // load coefficients
1934 s0 = _mm_load_si128((__m128i*)dp);
1935
1936 // epi32 is -1 only for coefficient that
1937 // are changed during the SPP
1938 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1939 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1940
1941 // obtain sign for coefficients in SPP
1942 s0_val = _mm_shuffle_epi8(v, m);
1943 s0_val = _mm_slli_epi32(s0_val, 31);
1944 s0_val = _mm_or_si128(s0_val, val);
1945 s0_val = _mm_and_si128(s0_val, s0_ns);
1946
1947 // update vector
1948 s0 = _mm_or_si128(s0, s0_val);
1949 // store coefficients
1950 _mm_store_si128((__m128i*)dp, s0);
1951 // prepare for next row
1952 dp += stride;
1953 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1954 }
1955 }
1956 frwd_advance(&sigprop, cnt);
1957 }
1958
1959 new_sig |= cs;
1960 *prev_sig = (ui16)(new_sig);
1961
1962 // vertical integration for the new sig. info.
1963 t = new_sig;
1964 new_sig |= (t & 0x7777) << 1; //above neighbors
1965 new_sig |= (t & 0xEEEE) >> 1; //below neighbors
1966 // add sig. info. from the row on top and below
1967 prev = new_sig | u;
1968 // we need only the bits in 0xF000
1969 prev &= 0xF000;
1970 }
1971 }
1972 }
1973
1974 // We perform Magnitude Refinement Pass here
1975 if (num_passes > 2)
1976 {
1977 rev_struct magref;
1978 rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
1979
1980 for (ui32 y = 0; y < height; y += 4)
1981 {
1982 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1983 ui32 *dpp = decoded_data + y * stride;
1984 for (ui32 i = 0; i < width; i += 4, dpp += 4)
1985 {
1986 //Process one entry from sigma array at a time
1987 // Each nibble (4 bits) in the sigma array represents 4 rows,
1988 ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
1989 ui16 sig = *cur_sig++; // 16 bit that will be processed now
1990 int total_bits = 0;
1991 if (sig) // if any of the 32 bits are set
1992 {
1993 // We work on 4 rows, with 4 samples each, since
1994 // data is 32 bit (4 bytes)
1995
1996 // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
1997 __m128i sig_vec = _mm_set1_epi16((si16)sig);
1998 sig_vec = _mm_shuffle_epi8(sig_vec,
1999 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
2000 sig_vec = _mm_and_si128(sig_vec,
2001 _mm_set1_epi64x((si64)0x8040201008040201));
2002 sig_vec = _mm_cmpeq_epi8(sig_vec,
2003 _mm_set1_epi64x((si64)0x8040201008040201));
2004 sig_vec = _mm_abs_epi8(sig_vec);
2005
2006 // find cumulative sums
2007 // to find which bit in cwd we should extract
2008 __m128i inc_sum = sig_vec; // inclusive scan
2009 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
2010 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
2011 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
2012 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
2013 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
2014 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan
2015
2016 // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
2017 // cwd_vec. Then, convert these to a form suitable
2018 // for coefficient modifications; in particular, a value
2019 // of 0 is presented as binary 11, and a value of 1 is
2020 // represented as binary 01
2021 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
2022 cwd_vec = _mm_shuffle_epi8(cwd_vec,
2023 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
2024 cwd_vec = _mm_and_si128(cwd_vec,
2025 _mm_set1_epi64x((si64)0x8040201008040201));
2026 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
2027 _mm_set1_epi64x((si64)0x8040201008040201));
2028 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
2029 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
2030 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
2031
2032 // load data and insert the mrp bit
2033 __m128i m =
2034 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
2035 ui32 *dp = dpp;
2036 for (int c = 0; c < 4; ++c) {
2037 __m128i s0, s0_sig, s0_idx, s0_val;
2038 // load coefficients
2039 s0 = _mm_load_si128((__m128i*)dp);
2040 // find significant samples in this row
2041 s0_sig = _mm_shuffle_epi8(sig_vec, m);
2042 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
2043 // get MRP bit index, and MRP pattern
2044 s0_idx = _mm_shuffle_epi8(ex_sum, m);
2045 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
2046 // keep data from significant samples only
2047 s0_val = _mm_andnot_si128(s0_sig, s0_val);
2048 // move mrp bits to correct position, and employ
2049 s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
2050 s0 = _mm_xor_si128(s0, s0_val);
2051 // store coefficients
2052 _mm_store_si128((__m128i*)dp, s0);
2053 // prepare for next row
2054 dp += stride;
2055 m = _mm_add_epi32(m, _mm_set1_epi32(1));
2056 }
2057 }
2058 // consume data according to the number of bits set
2059 rev_advance_mrp(&magref, (ui32)total_bits);
2060 }
2061 }
2062 }
2063 }
2064
2065 return true;
2066 }
2067 }
2068}
2069
2070#endif
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes rev_struct structure for MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
bool ojph_decode_codeblock_ssse3(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
static void frwd_advance(frwd_struct32 *msp, ui32 num_bits)
Consume num_bits bits from the bitstream of frwd_struct32.
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static void frwd_read(frwd_struct32 *msp)
Reads and unstuffs 32 bits from a forward-growing bitstream.
static void rev_read(rev_struct *vlcp)
Reads and unstuffs data from a backwardly-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored, the MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initiates the rev_struct structure and reads a few bytes to move the read address to multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static v128_t decode_one_quad32(const v128_t inf_u_q, v128_t U_q, frwd_struct *magsgn, ui32 p, v128_t &vn)
decodes one quad, using 32 bit data
static ui32 frwd_fetch(frwd_struct32 *msp)
Fetches 32 bits from the frwd_struct32 bitstream.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static v128_t decode_two_quad16(const v128_t inf_u_q, v128_t U_q, frwd_struct *magsgn, ui32 p, v128_t &vn)
decodes two consecutive quads (one octet), using 16 bit data
static void frwd_init(frwd_struct32 *msp, const ui8 *data, int size)
Initializes the frwd_struct32 struct and reads some bytes.
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
static ui32 count_leading_zeros(ui32 val)
Definition ojph_arch.h:173
int32_t si32
Definition ojph_defs.h:55
int16_t si16
Definition ojph_defs.h:53
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define OJPH_WARN(t,...)
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data