OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_block_encoder_avx2.cpp
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2024, Intel Corporation
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//***************************************************************************/
33// This file is part of the OpenJPH software implementation.
34// File: ojph_block_encoder_avx2.cpp
35//***************************************************************************/
36
37#include <cassert>
38#include <cstring>
39#include <cstdint>
40#include <climits>
41#include <immintrin.h>
42
43#include "ojph_mem.h"
44#include "ojph_arch.h"
45#include "ojph_block_encoder.h"
46#include "ojph_message.h"
47
48#ifdef OJPH_COMPILER_MSVC
49 #define likely(x) (x)
50 #define unlikely(x) (x)
51#else
52 #define likely(x) __builtin_expect((x), 1)
53 #define unlikely(x) __builtin_expect((x), 0)
54#endif
55
56namespace ojph {
57 namespace local {
58
60 // tables
62
63 //VLC encoding
64 // index is (c_q << 8) + (rho << 4) + eps
65 // data is (cwd << 8) + (cwd_len << 4) + eps
66 // table 0 is for the initial line of quads
67 static ui32 vlc_tbl0[2048];
68 static ui32 vlc_tbl1[2048];
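    // Illustrative sketch (not part of the encoder) of how an entry of these
    // tables is unpacked; the same fields are read back by cal_tuple() and
    // proc_vlc_encode1/2() further down:
    //   ui32 entry   = vlc_tbl0[(c_q << 8) + (rho << 4) + eps];
    //   ui32 cwd     = entry >> 8;        // VLC codeword
    //   ui32 cwd_len = (entry >> 4) & 7;  // codeword length in bits
    //   ui32 e_k     = entry & 0xF;       // EMB pattern, as stored below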
69
70 //UVLC encoding
71 static ui32 ulvc_cwd_pre[33];
72 static int ulvc_cwd_pre_len[33];
73 static ui32 ulvc_cwd_suf[33];
74 static int ulvc_cwd_suf_len[33];
75
76    //! Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h
77    static bool vlc_init_tables()
78 {
79 struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
80 vlc_src_table tbl0[] = {
81 #include "table0.h"
82 };
83 size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);
84
85 si32 pattern_popcnt[16];
86 for (ui32 i = 0; i < 16; ++i)
87 pattern_popcnt[i] = (si32)population_count(i);
88
89 vlc_src_table* src_tbl = tbl0;
90 ui32 *tgt_tbl = vlc_tbl0;
91 size_t tbl_size = tbl0_size;
92 for (int i = 0; i < 2048; ++i)
93 {
94 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
95 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
96 tgt_tbl[i] = 0;
97 else
98 {
99 vlc_src_table *best_entry = NULL;
100 if (emb) // u_off = 1
101 {
102 int best_e_k = -1;
103 for (size_t j = 0; j < tbl_size; ++j)
104 {
105 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
106 if (src_tbl[j].u_off == 1)
107 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
108 {
109 //now we need to find the smallest cwd with the highest
110 // number of bits set in e_k
111 int ones_count = pattern_popcnt[src_tbl[j].e_k];
112 if (ones_count >= best_e_k)
113 {
114 best_entry = src_tbl + j;
115 best_e_k = ones_count;
116 }
117 }
118 }
119 }
120 else // u_off = 0
121 {
122 for (size_t j = 0; j < tbl_size; ++j)
123 {
124 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
125 if (src_tbl[j].u_off == 0)
126 {
127 best_entry = src_tbl + j;
128 break;
129 }
130 }
131 }
132 assert(best_entry);
133 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
134 + best_entry->e_k);
135 }
136 }
137
138 vlc_src_table tbl1[] = {
139 #include "table1.h"
140 };
141 size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);
142
143 src_tbl = tbl1;
144 tgt_tbl = vlc_tbl1;
145 tbl_size = tbl1_size;
146 for (int i = 0; i < 2048; ++i)
147 {
148 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
149 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
150 tgt_tbl[i] = 0;
151 else
152 {
153 vlc_src_table *best_entry = NULL;
154 if (emb) // u_off = 1
155 {
156 int best_e_k = -1;
157 for (size_t j = 0; j < tbl_size; ++j)
158 {
159 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
160 if (src_tbl[j].u_off == 1)
161 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
162 {
163 //now we need to find the smallest cwd with the highest
164 // number of bits set in e_k
165 int ones_count = pattern_popcnt[src_tbl[j].e_k];
166 if (ones_count >= best_e_k)
167 {
168 best_entry = src_tbl + j;
169 best_e_k = ones_count;
170 }
171 }
172 }
173 }
174 else // u_off = 0
175 {
176 for (size_t j = 0; j < tbl_size; ++j)
177 {
178 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
179 if (src_tbl[j].u_off == 0)
180 {
181 best_entry = src_tbl + j;
182 break;
183 }
184 }
185 }
186 assert(best_entry);
187 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
188 + best_entry->e_k);
189 }
190 }
191
192
193 return true;
194 }
195
196    //! Initializes the UVLC encoding tables
197    static bool uvlc_init_tables()
198 {
199 //code goes from 0 to 31, extension and 32 are not supported here
200 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
201 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
202 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
203 ulvc_cwd_pre_len[2] = 2;
204 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
205 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
206 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
207 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
208 ulvc_cwd_suf_len[2] = 0;
209 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
210 for (int i = 5; i < 33; ++i)
211 {
212 ulvc_cwd_pre[i] = 0;
213 ulvc_cwd_pre_len[i] = 3;
214 ulvc_cwd_suf[i] = (ui32)(i-5);
215 ulvc_cwd_suf_len[i] = 5;
216 }
217 return true;
218 }
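    // Worked example (descriptive only) of the tables built above: a value
    // u = 7 falls in the i >= 5 range, so it is coded as the 3-bit prefix
    // ulvc_cwd_pre[7] = 0 followed by the 5-bit suffix ulvc_cwd_suf[7] = 2,
    // 8 bits in total, while u = 0..4 use the shorter dedicated codewords
    // initialized explicitly at the top of this function.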
219
221    static bool tables_initialized = false;
222
223    bool initialize_block_encoder_tables_avx2()
224    {
225      if (!tables_initialized) {
226        memset(vlc_tbl0, 0, 2048 * sizeof(ui32));
227        memset(vlc_tbl1, 0, 2048 * sizeof(ui32));
228        tables_initialized = vlc_init_tables();
229        tables_initialized = tables_initialized && uvlc_init_tables();
230      }
231 return tables_initialized;
232 }
233
235 //
237 struct mel_struct {
238 //storage
239 ui8* buf; //pointer to data buffer
240 ui32 pos; //position of next writing within buf
241 ui32 buf_size; //size of buffer, which we must not exceed
242
243 // all these can be replaced by bytes
244 int remaining_bits; //number of empty bits in tmp
245 int tmp; //temporary storage of coded bits
246 int run; //number of 0 run
247 int k; //state
248 int threshold; //threshold where one bit must be coded
249 };
250
252 static inline void
253 mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
254 {
255 melp->buf = data;
256 melp->pos = 0;
257 melp->buf_size = buffer_size;
258 melp->remaining_bits = 8;
259 melp->tmp = 0;
260 melp->run = 0;
261 melp->k = 0;
262 melp->threshold = 1; // this is 1 << mel_exp[melp->k];
263 }
264
266    static inline void
267    mel_emit_bit(mel_struct* melp, int v)
268    {
269 melp->tmp = (melp->tmp << 1) + v;
270 melp->remaining_bits--;
271 if (melp->remaining_bits == 0) {
272 melp->buf[melp->pos++] = (ui8)melp->tmp;
273 melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8);
274 melp->tmp = 0;
275 }
276 }
277
279 static inline void
280 mel_encode(mel_struct* melp, bool bit)
281 {
282 //MEL exponent
283 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
284
285 if (bit == false) {
286 ++melp->run;
287 if (melp->run >= melp->threshold) {
288 mel_emit_bit(melp, 1);
289 melp->run = 0;
290 melp->k = ojph_min(12, melp->k + 1);
291 melp->threshold = 1 << mel_exp[melp->k];
292 }
293 } else {
294 mel_emit_bit(melp, 0);
295 int t = mel_exp[melp->k];
296 while (t > 0) {
297 mel_emit_bit(melp, (melp->run >> --t) & 1);
298 }
299 melp->run = 0;
300 melp->k = ojph_max(0, melp->k - 1);
301 melp->threshold = 1 << mel_exp[melp->k];
302 }
303 }
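    // Worked example (descriptive only): while melp->k is 6, mel_exp[6] = 2
    // and the threshold is 4, so four consecutive 'false' symbols collapse
    // into a single 1 bit; a 'true' symbol is coded as a 0 followed by 2 bits
    // holding the length of the run it interrupted, after which k drops to 5.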
304
306 //
308    struct vlc_struct_avx2 {
309      //storage
310 ui8* buf; //pointer to data buffer
311 ui32 pos; //position of next writing within buf
312 ui32 buf_size; //size of buffer, which we must not exceed
313
314 int used_bits; //number of occupied bits in tmp
315 ui64 tmp; //temporary storage of coded bits
316      bool last_greater_than_8F; //true if last byte is greater than 0x8F
317 };
318
320 static inline void
321 vlc_init(vlc_struct_avx2* vlcp, ui32 buffer_size, ui8* data)
322 {
323 vlcp->buf = data + buffer_size - 1; //points to last byte
324 vlcp->pos = 1; //locations will be all -pos
325 vlcp->buf_size = buffer_size;
326
327 vlcp->buf[0] = 0xFF;
328 vlcp->used_bits = 4;
329 vlcp->tmp = 0xF;
330 vlcp->last_greater_than_8F = true;
331 }
332
334 static inline void
335 vlc_encode(vlc_struct_avx2* vlcp, ui32 cwd, int cwd_len)
336 {
337 vlcp->tmp |= (ui64)cwd << vlcp->used_bits;
338 vlcp->used_bits += cwd_len;
339
340 while (vlcp->used_bits >= 8) {
341 ui8 tmp;
342
343 if (unlikely(vlcp->last_greater_than_8F)) {
344 tmp = vlcp->tmp & 0x7F;
345
346 if (likely(tmp != 0x7F)) {
347 tmp = vlcp->tmp & 0xFF;
348 *(vlcp->buf - vlcp->pos) = tmp;
349 vlcp->last_greater_than_8F = tmp > 0x8F;
350 vlcp->tmp >>= 8;
351 vlcp->used_bits -= 8;
352 } else {
353 *(vlcp->buf - vlcp->pos) = tmp;
354 vlcp->last_greater_than_8F = false;
355 vlcp->tmp >>= 7;
356 vlcp->used_bits -= 7;
357 }
358
359 } else {
360 tmp = vlcp->tmp & 0xFF;
361 *(vlcp->buf - vlcp->pos) = tmp;
362 vlcp->last_greater_than_8F = tmp > 0x8F;
363 vlcp->tmp >>= 8;
364 vlcp->used_bits -= 8;
365 }
366
367 vlcp->pos++;
368 }
369 }
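    // Worked example (descriptive only) of the special case above: if the
    // byte most recently written was 0xFF (greater than 0x8F) and the 7
    // pending LSBs are all ones, only 0x7F is written and just 7 bits are
    // consumed; the eighth bit stays in vlcp->tmp. This mirrors the bit
    // un-stuffing applied on the decoder side after any VLC byte above 0x8F.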
370
372 //
374    static inline void
375    terminate_mel_vlc(mel_struct* melp, vlc_struct_avx2* vlcp)
376    {
377 if (melp->run > 0)
378 mel_emit_bit(melp, 1);
379
380 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
381 *(vlcp->buf - vlcp->pos) = 0x7f;
382 vlcp->pos++;
383 vlcp->tmp >>= 7;
384 vlcp->used_bits -= 7;
385 }
386
387 melp->tmp = melp->tmp << melp->remaining_bits;
388 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
389 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
390 if ((mel_mask | vlc_mask) == 0)
391 return; //last mel byte cannot be 0xFF, since then
392 //melp->remaining_bits would be < 8
393 if (melp->pos >= melp->buf_size)
394 OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
395 ui8 vlcp_tmp = (ui8)vlcp->tmp;
396 int fuse = melp->tmp | vlcp_tmp;
397 if ( ( ((fuse ^ melp->tmp) & mel_mask)
398 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
399 && (fuse != 0xFF) && vlcp->pos > 1)
400 {
401 melp->buf[melp->pos++] = (ui8)fuse;
402 }
403 else
404 {
405 if (vlcp->pos >= vlcp->buf_size)
406 OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
407 melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
408 *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
409 vlcp->pos++;
410 }
411 }
412
414//
416 struct ms_struct {
417 //storage
418 ui8* buf; //pointer to data buffer
419 ui32 pos; //position of next writing within buf
420 ui32 buf_size; //size of buffer, which we must not exceed
421
422      int max_bits; //maximum number of bits that can be stored in tmp
423 int used_bits; //number of occupied bits in tmp
424 ui32 tmp; //temporary storage of coded bits
425 };
426
428 static inline void
429 ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
430 {
431 msp->buf = data;
432 msp->pos = 0;
433 msp->buf_size = buffer_size;
434 msp->max_bits = 8;
435 msp->used_bits = 0;
436 msp->tmp = 0;
437 }
438
440 static inline void
441 ms_encode(ms_struct* msp, ui64 cwd, int cwd_len)
442 {
443 while (cwd_len > 0)
444 {
445 if (msp->pos >= msp->buf_size)
446 OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
447 int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len);
448 msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
449 msp->used_bits += t;
450 cwd >>= t;
451 cwd_len -= t;
452 if (msp->used_bits >= msp->max_bits)
453 {
454 msp->buf[msp->pos++] = (ui8)msp->tmp;
455 msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
456 msp->tmp = 0;
457 msp->used_bits = 0;
458 }
459 }
460 }
461
463    static inline void
464    ms_terminate(ms_struct* msp)
465    {
466 if (msp->used_bits)
467 {
468 int t = msp->max_bits - msp->used_bits; //unused bits
469 msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
470 msp->used_bits += t;
471 if (msp->tmp != 0xFF)
472 {
473 if (msp->pos >= msp->buf_size)
474 OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
475 msp->buf[msp->pos++] = (ui8)msp->tmp;
476 }
477 }
478 else if (msp->max_bits == 7)
479 msp->pos--;
480 }
481
482#define ZERO _mm256_setzero_si256()
483#define ONE _mm256_set1_epi32(1)
484
485// https://stackoverflow.com/a/58827596
486inline __m256i avx2_lzcnt_epi32(__m256i v) {
487 // prevent value from being rounded up to the next power of two
488 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB
489
490  v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert each int32 lane to float
491 v = _mm256_srli_epi32(v, 23); // shift down the exponent
492 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias
493 v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32
494
495 return v;
496}
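// Worked example (a sketch of the trick above): for a lane holding
// 0x00010000, the float conversion yields an exponent field of 127 + 16 =
// 143, so 158 - 143 = 15, the leading-zero count of 0x00010000; a zero lane
// gives 158 - 0 = 158 and is then clamped to 32.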
497
498inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) {
499 return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32((int32_t)0xffffffff));
500}
501
502static void proc_pixel(__m256i *src_vec, ui32 p,
503 __m256i *eq_vec, __m256i *s_vec,
504 __m256i &rho_vec, __m256i &e_qmax_vec)
505{
506 __m256i val_vec[4];
507 __m256i _eq_vec[4];
508 __m256i _s_vec[4];
509 __m256i _rho_vec[4];
510
511 for (ui32 i = 0; i < 4; ++i) {
512 /* val = t + t; //multiply by 2 and get rid of sign */
513 val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
514
515 /* val >>= p; // 2 \mu_p + x */
516 val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p);
517
518 /* val &= ~1u; // 2 \mu_p */
519 val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u));
520
521 /* if (val) { */
522 const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
523
524 /* rho[i] = 1 << i;
525 * rho is processed below.
526 */
527
528    /* e_q[i] = 32 - (int)count_leading_zeros(--val); //2\mu_p - 1 */
529 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
530 _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
531 _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
532
533 /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
534 * e_qmax is processed below
535 */
536
537 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
538 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
539 _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
540 _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
541
542 _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
543 _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
544 val_vec[i] = _mm256_srli_epi32(val_notmask, 31);
545 /* } */
546 }
547
548 const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
549
550 /* Reorder from
551 * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7]
552    * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7]
553 * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15]
554 * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15]
555 * to
556 * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14]
557 * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14]
558 * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15]
559 * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15]
560 */
561 __m256i tmp1, tmp2;
562 for (ui32 i = 0; i < 2; ++i) {
563 tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
564 tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
565 eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
566 eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
567
568 tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
569 tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
570 s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
571 s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
572
573 tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
574 tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
575 _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
576 _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
577 }
578
579 e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
580 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
581 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
582 _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
583 _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
584 _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
585 rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
586 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
587 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
588}
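// Recap (descriptive only) of what proc_pixel leaves in its outputs:
//   eq_vec[0..3]  : per-sample exponents E_q = 32 - lzcnt(2*mu_p - 1),
//                   zeroed for insignificant samples
//   s_vec[0..3]   : per-sample value/sign words 2*(mu_p - 1) + sign,
//                   later consumed by proc_ms_encode()
//   rho_vec       : the 4-bit significance pattern of each quad
//   e_qmax_vec    : the per-quad maximum of the four exponents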
589
590/* from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...]
591 * [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...]
592 * [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...]
593 * [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...]
594 *
595 * to [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31,
596 * 0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
597 *
598 * [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35,
599 * 0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
600 *
601 * [..]
602 */
603static void rotate_matrix(__m256i *matrix)
604{
605 __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
606 __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
607 __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
608 __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);
609
610 matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2);
611 matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4);
612 matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2);
613 matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4);
614
615 tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20);
616 matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31);
617 matrix[0] = tmp1;
618
619 tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20);
620 matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31);
621 matrix[1] = tmp1;
622}
623
624static void proc_ms_encode(ms_struct *msp,
625 __m256i &tuple_vec,
626 __m256i &uq_vec,
627 __m256i &rho_vec,
628 __m256i *s_vec)
629{
630 __m256i m_vec[4];
631
632 /* Prepare parameters for ms_encode */
633 /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
634 auto tmp = _mm256_and_si256(tuple_vec, ONE);
635 tmp = _mm256_sub_epi32(uq_vec, tmp);
636 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
637 auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
638 m_vec[0] = _mm256_and_si256(mask, tmp);
639
640 /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
641 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
642 tmp = _mm256_srli_epi32(tmp, 1);
643 tmp = _mm256_sub_epi32(uq_vec, tmp);
644 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
645 mask = avx2_cmpneq_epi32(tmp1, ZERO);
646 m_vec[1] = _mm256_and_si256(mask, tmp);
647
648 /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
649 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
650 tmp = _mm256_srli_epi32(tmp, 2);
651 tmp = _mm256_sub_epi32(uq_vec, tmp);
652 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
653 mask = avx2_cmpneq_epi32(tmp1, ZERO);
654 m_vec[2] = _mm256_and_si256(mask, tmp);
655
656 /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
657 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
658 tmp = _mm256_srli_epi32(tmp, 3);
659 tmp = _mm256_sub_epi32(uq_vec, tmp);
660 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
661 mask = avx2_cmpneq_epi32(tmp1, ZERO);
662 m_vec[3] = _mm256_and_si256(mask, tmp);
663
664 rotate_matrix(m_vec);
665 /* s_vec from
666 * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30]
667 * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30]
668 * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31]
669 * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... [1,31]
670 * to
671 * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7]
672 * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15]
673 * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23]
674 * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31]
675 */
676 rotate_matrix(s_vec);
677
678 ui32 cwd[8];
679 int cwd_len[8];
680 ui64 _cwd = 0;
681 int _cwd_len = 0;
682
683 /* Each iteration process 8 bytes * 2 lines */
684 for (ui32 i = 0; i < 4; ++i) {
685 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
686 * cwd_len = m
687 */
688 _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
689 tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
690 tmp = _mm256_sub_epi32(tmp, ONE);
691 tmp = _mm256_and_si256(tmp, s_vec[i]);
692 _mm256_storeu_si256((__m256i*)cwd, tmp);
693
694 for (ui32 j = 0; j < 4; ++j) {
695 ui32 idx = j * 2;
696 _cwd = cwd[idx];
697 _cwd_len = cwd_len[idx];
698 _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len;
699 _cwd_len += cwd_len[idx + 1];
700 ms_encode(msp, _cwd, _cwd_len);
701 }
702 }
703}
704
705static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
706 __m256i &e_qmax_vec)
707{
708 /* if (u_q[i] > 0) {
709 * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
710 * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
711 * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
712 * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
713 * }
714 */
715 auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
716
717 auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
718 auto eps_vec = _mm256_srli_epi32(mask, 31);
719
720 mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
721 auto tmp = _mm256_srli_epi32(mask, 31);
722 tmp = _mm256_slli_epi32(tmp, 1);
723 eps_vec = _mm256_or_si256(eps_vec, tmp);
724
725 mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
726 tmp = _mm256_srli_epi32(mask, 31);
727 tmp = _mm256_slli_epi32(tmp, 2);
728 eps_vec = _mm256_or_si256(eps_vec, tmp);
729
730 mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
731 tmp = _mm256_srli_epi32(mask, 31);
732 tmp = _mm256_slli_epi32(tmp, 3);
733 eps_vec = _mm256_or_si256(eps_vec, tmp);
734
735 return _mm256_and_si256(u_q_mask, eps_vec);
736}
737
738static void update_lep(ui32 x, __m256i &prev_e_val_vec,
739 __m256i *eq_vec, __m256i *e_val_vec,
740 const __m256i left_shift)
741{
742 /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
743 * lep[0] = (ui8)e_q[3];
744    * Compare e_q[1] with e_q[3] of the previous round.
745 */
746 auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
747 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
748 prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
749 e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
750}
751
752
753static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec,
754 __m256i &rho_vec, __m256i *cx_val_vec,
755 const __m256i left_shift)
756{
757 /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
758 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
759 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
760 */
761 auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
762 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
763 prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
764
765 tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
766 tmp = _mm256_srli_epi32(tmp, 3);
767
768 auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
769 tmp1 = _mm256_srli_epi32(tmp1, 1);
770 cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
771}
772
773static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
774 __m256i &eps_vec, ui32 *vlc_tbl)
775{
776 /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
777 auto tmp = _mm256_slli_epi32(cq_vec, 8);
778 auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
779 tmp = _mm256_add_epi32(tmp, tmp1);
780 tmp = _mm256_add_epi32(tmp, eps_vec);
781 return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4);
782}
783
784static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
785 const __m256i right_shift)
786{
787 ojph_unused(x);
788 ojph_unused(cx_val_vec);
789 ojph_unused(right_shift);
790
791 /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
792 auto tmp = _mm256_srli_epi32(rho_vec, 1);
793 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
794 return _mm256_or_si256(tmp, tmp1);
795}
796
797static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
798 const __m256i right_shift)
799{
800 // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
801 // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
802 auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
803 auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
804
805 tmp = _mm256_insert_epi64(tmp, _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
806 tmp = _mm256_slli_epi32(tmp, 2);
807 auto tmp1 = _mm256_insert_epi32(lcxp1_vec, _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
808 tmp = _mm256_add_epi32(tmp1, tmp);
809
810 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
811 tmp1 = _mm256_srli_epi32(tmp1, 1);
812 tmp = _mm256_or_si256(tmp, tmp1);
813
814 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
815 tmp1 = _mm256_srli_epi32(tmp1, 2);
816
817 return _mm256_or_si256(tmp, tmp1);
818}
819
820using fn_proc_cq = __m256i (*)(ui32, __m256i *, __m256i &, const __m256i);
821
822static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec,
823 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
824 const __m256i right_shift)
825{
826 int32_t mel_need_encode[8];
827 int32_t mel_need_encode2[8];
828 int32_t mel_bit[8];
829 int32_t mel_bit2[8];
830 /* Prepare mel_encode params */
831 /* if (c_q[i] == 0) { */
832 _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
833 /* mel_encode(&mel, rho[i] != 0); */
834 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
835 /* } */
836
837 /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
838 auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
839 auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
840 _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
841
842 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
843 auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
844 _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
845
846 ui32 i_max = 8 - (ignore / 2);
847
848 for (ui32 i = 0; i < i_max; i += 2) {
849 if (mel_need_encode[i]) {
850 mel_encode(melp, mel_bit[i]);
851 }
852
853 if (i + 1 < i_max) {
854 if (mel_need_encode[i + 1]) {
855 mel_encode(melp, mel_bit[i + 1]);
856 }
857 }
858
859 if (mel_need_encode2[i]) {
860 mel_encode(melp, mel_bit2[i]);
861 }
862 }
863}
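// Note (descriptive only): for each pair of quads (i, i + 1) the loop above
// emits up to three MEL symbols in the same order as the scalar encoder:
// the "rho != 0" significance bit for quad i when c_q[i] == 0, the same bit
// for quad i + 1, and then, when both u_q values are non-zero, the bit that
// tells the decoder whether min(u_q[i], u_q[i + 1]) exceeds 2.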
864
865static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec,
866 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
867 const __m256i right_shift)
868{
869 ojph_unused(u_q_vec);
870 ojph_unused(right_shift);
871 int32_t mel_need_encode[8];
872 int32_t mel_bit[8];
873
874 /* Prepare mel_encode params */
875 /* if (c_q[i] == 0) { */
876 _mm256_storeu_si256((__m256i*)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
877 /* mel_encode(&mel, rho[i] != 0); */
878 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
879 /* } */
880
881 ui32 i_max = 8 - (ignore / 2);
882
883 for (ui32 i = 0; i < i_max; ++i) {
884 if (mel_need_encode[i]) {
885 mel_encode(melp, mel_bit[i]);
886 }
887 }
888}
889
890using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &,
891 __m256i, ui32, const __m256i);
892
893static void proc_vlc_encode1(vlc_struct_avx2 *vlcp, ui32 *tuple,
894 ui32 *u_q, ui32 ignore)
895{
896 ui32 i_max = 8 - (ignore / 2);
897
898 for (ui32 i = 0; i < i_max; i += 2) {
899 /* 7 bits */
900 ui32 val = tuple[i + 0] >> 4;
901 int size = tuple[i + 0] & 7;
902
903 if (i + 1 < i_max) {
904 /* 7 bits */
905 val |= (tuple[i + 1] >> 4) << size;
906 size += tuple[i + 1] & 7;
907 }
908
909 if (u_q[i] > 2 && u_q[i + 1] > 2) {
910 /* 3 bits */
911 val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
912 size += ulvc_cwd_pre_len[u_q[i] - 2];
913
914 /* 3 bits */
915 val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
916 size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
917
918 /* 5 bits */
919 val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
920 size += ulvc_cwd_suf_len[u_q[i] - 2];
921
922 /* 5 bits */
923 val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
924 size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
925
926 } else if (u_q[i] > 2 && u_q[i + 1] > 0) {
927 /* 3 bits */
928 val |= (ulvc_cwd_pre[u_q[i]]) << size;
929 size += ulvc_cwd_pre_len[u_q[i]];
930
931 /* 1 bit */
932 val |= (u_q[i + 1] - 1) << size;
933 size += 1;
934
935 /* 5 bits */
936 val |= (ulvc_cwd_suf[u_q[i]]) << size;
937 size += ulvc_cwd_suf_len[u_q[i]];
938
939 } else {
940 /* 3 bits */
941 val |= (ulvc_cwd_pre[u_q[i]]) << size;
942 size += ulvc_cwd_pre_len[u_q[i]];
943
944 /* 3 bits */
945 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
946 size += ulvc_cwd_pre_len[u_q[i + 1]];
947
948 /* 5 bits */
949 val |= (ulvc_cwd_suf[u_q[i]]) << size;
950 size += ulvc_cwd_suf_len[u_q[i]];
951
952 /* 5 bits */
953 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
954 size += ulvc_cwd_suf_len[u_q[i + 1]];
955 }
956
957 vlc_encode(vlcp, val, size);
958 }
959}
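// Note (descriptive only): the three branches above mirror the scalar
// encoder's handling of a u_q pair on the initial row of quads. When both
// values exceed 2 (the case signalled by the extra MEL bit emitted in
// proc_mel_encode1), each is coded as (u_q - 2) with the prefix/suffix
// tables; the middle branch covers u_q[i] > 2 with 0 < u_q[i + 1] <= 2,
// where u_q[i + 1] is sent as a single bit.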
960
961static void proc_vlc_encode2(vlc_struct_avx2 *vlcp, ui32 *tuple,
962 ui32 *u_q, ui32 ignore)
963{
964 ui32 i_max = 8 - (ignore / 2);
965
966 for (ui32 i = 0; i < i_max; i += 2) {
967 /* 7 bits */
968 ui32 val = tuple[i + 0] >> 4;
969 int size = tuple[i + 0] & 7;
970
971 if (i + 1 < i_max) {
972 /* 7 bits */
973 val |= (tuple[i + 1] >> 4) << size;
974 size += tuple[i + 1] & 7;
975 }
976
977 /* 3 bits */
978 val |= ulvc_cwd_pre[u_q[i]] << size;
979 size += ulvc_cwd_pre_len[u_q[i]];
980
981 /* 3 bits */
982 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
983 size += ulvc_cwd_pre_len[u_q[i + 1]];
984
985 /* 5 bits */
986 val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
987 size += ulvc_cwd_suf_len[u_q[i + 0]];
988
989 /* 5 bits */
990 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
991 size += ulvc_cwd_suf_len[u_q[i + 1]];
992
993 vlc_encode(vlcp, val, size);
994 }
995}
996
997using fn_proc_vlc_encode = void (*)(vlc_struct_avx2 *, ui32 *, ui32 *, ui32);
998
999void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs,
1000 ui32 num_passes, ui32 _width, ui32 height,
1001                                ui32 stride, ui32* lengths,
1002                                ojph::mem_elastic_allocator* elastic,
1003                                ojph::coded_lists *& coded)
1004{
1005 ojph_unused(num_passes); //currently not used
1006
1007 ui32 width = (_width + 15) & ~15u;
1008 ui32 ignore = width - _width;
1009 const int ms_size = (16384 * 16 + 14) / 15; //more than enough
1010 const int mel_vlc_size = 3072; //more than enough
1011 const int mel_size = 192;
1012 const int vlc_size = mel_vlc_size - mel_size;
1013
1014 ui8 ms_buf[ms_size];
1015 ui8 mel_vlc_buf[mel_vlc_size];
1016 ui8 *mel_buf = mel_vlc_buf;
1017 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1018
1019 mel_struct mel;
1020 mel_init(&mel, mel_size, mel_buf);
1021 vlc_struct_avx2 vlc;
1022 vlc_init(&vlc, vlc_size, vlc_buf);
1023 ms_struct ms;
1024 ms_init(&ms, ms_size, ms_buf);
1025
1026 const ui32 p = 30 - missing_msbs;
1027
1028    //e_val: E values for a line (the position of the highest set bit)
1029    //cx_val: the context values
1030    //Each byte stores the info for 2 samples. For E, it is the maximum
1031    // of the two samples, while for cx, it is the OR of these two samples.
1032    //The maximum is between the pixel at the bottom left of one quad
1033    // and the bottom right of the earlier quad. The same is true for cx.
1034    //For 1024 pixels, we need 512 bytes, plus 2 extra:
1035    // one for the non-existing earlier quad, and one for
1036    // beyond the end.
1037 const __m256i right_shift = _mm256_set_epi32(
1038 0, 7, 6, 5, 4, 3, 2, 1
1039 );
1040
1041 const __m256i left_shift = _mm256_set_epi32(
1042 6, 5, 4, 3, 2, 1, 0, 7
1043 );
1044
1045 ui32 n_loop = (width + 15) / 16;
1046
1047 __m256i e_val_vec[65];
1048    for (ui32 i = 0; i < ojph_min(64, n_loop); ++i) {
1049 e_val_vec[i] = ZERO;
1050 }
1051 __m256i prev_e_val_vec = ZERO;
1052
1053 __m256i cx_val_vec[65];
1054 __m256i prev_cx_val_vec = ZERO;
1055
1056 ui32 prev_cq = 0;
1057
1058 __m256i eq_vec[4];
1059 __m256i s_vec[4];
1060 __m256i src_vec[4];
1061
1062 ui32 *vlc_tbl = vlc_tbl0;
1063 fn_proc_cq proc_cq = proc_cq1;
1064 fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
1065 fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
1066
1067 /* 2 lines per iteration */
1068 for (ui32 y = 0; y < height; y += 2)
1069 {
1070 e_val_vec[n_loop] = prev_e_val_vec;
1071 /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
1072 __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
1073 cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
1074
1075 prev_e_val_vec = ZERO;
1076 prev_cx_val_vec = ZERO;
1077
1078 ui32 *sp = buf + y * stride;
1079
1080 /* 16 bytes per iteration */
1081 for (ui32 x = 0; x < n_loop; ++x) {
1082
1083 /* t = sp[i]; */
1084 if ((x == (n_loop - 1)) && (_width % 16)) {
1085 ui32 tmp_buf[16] = { 0 };
1086 memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32));
1087 src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1088 src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1089 if (y + 1 < height) {
1090 memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32));
1091 src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1092 src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1093 }
1094 else {
1095 src_vec[1] = ZERO;
1096 src_vec[3] = ZERO;
1097 }
1098 }
1099 else {
1100 src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
1101 src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
1102
1103 if (y + 1 < height) {
1104 src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
1105 src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
1106 }
1107 else {
1108 src_vec[1] = ZERO;
1109 src_vec[3] = ZERO;
1110 }
1111 sp += 16;
1112 }
1113
1114 /* src_vec layout:
1115    * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5],[0, 6],[0, 7]
1116    * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5],[1, 6],[1, 7]
1117    * src_vec[2]:[0, 8],[0, 9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15]
1118    * src_vec[3]:[1, 8],[1, 9],[1,10],[1,11],[1,12],[1,13],[1,14],[1,15]
1119 */
1120 __m256i rho_vec, e_qmax_vec;
1121 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1122
1123 // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
1124 tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
1125 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
1126
1127 auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
1128 max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
1129
1130 // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
1131 tmp = _mm256_max_epi32(max_e_vec, ONE);
1132 __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE);
1133 tmp1 = _mm256_and_si256(rho_vec, tmp1);
1134
1135 auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
1136 auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
1137 auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
1138 const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
1139
1140 /* cq[1 - 16] = cq_vec
1141 * cq[0] = prev_cq_vec[0]
1142 */
1143 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1144
1145 auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
1146 cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
1147 prev_cq = (ui32)_mm256_extract_epi32(tmp, 7);
1148
1149 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1150 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1151
1152 /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
1153 /* u_q[i] = Uq[i] - kappa[i]; */
1154 auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
1155 auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
1156
1157 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1158 __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
1159 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1160
1161 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1162 right_shift);
1163
1164 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1165
1166 // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7);
1167 // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7);
1168 ui32 u_q[8];
1169 ui32 tuple[8];
1170      /* The tuple is pre-shifted right by 4 below, so that proc_vlc_encode
1171       * only needs (tuple >> 4) for the codeword and (tuple & 7) for its
1172       * length; the scalar reference is
1173       * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true); */
1174 tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
1175 _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
1176 _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
1177
1178 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
1179 }
1180
1181 tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
1182 tmp = _mm256_slli_epi32(tmp, 2);
1183 tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
1184 prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
1185
1186 proc_cq = proc_cq2;
1187 vlc_tbl = vlc_tbl1;
1188 proc_mel_encode = proc_mel_encode2;
1189 proc_vlc_encode = proc_vlc_encode2;
1190 }
1191
1192 ms_terminate(&ms);
1193 terminate_mel_vlc(&mel, &vlc);
1194
1195 //copy to elastic
1196 lengths[0] = mel.pos + vlc.pos + ms.pos;
1197 elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1198 memcpy(coded->buf, ms.buf, ms.pos);
1199 memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
1200 memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1201
1202 // put in the interface locator word
1203 ui32 num_bytes = mel.pos + vlc.pos;
1204 coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
1205 coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
1206 coded->buf[lengths[0]-2] =
1207 (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
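  // Worked example (descriptive only): if mel.pos + vlc.pos were 0x123, the
  // last byte of the code-block would hold 0x12 and the low nibble of the
  // byte before it 0x3, which together form the interface locator word read
  // back by the decoder.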
1208
1209 coded->avail_size -= lengths[0];
1210}
1211
1212} /* namespace local */
1213} /* namespace ojph */
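// Illustrative call sequence (a sketch only; the buffer names are
// hypothetical and the real call site lives elsewhere in OpenJPH):
//   ojph::local::initialize_block_encoder_tables_avx2(); // build tables once
//   ojph::local::ojph_encode_codeblock_avx2(buf, missing_msbs, num_passes,
//                                           width, height, stride, lengths,
//                                           elastic, coded);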