// One row of a VLC source table, made of seven int fields.
// The table-building loops in this file select rows by c_q, rho and u_off,
// test (emb & e_k) == e_1, and pack cwd and cwd_len into the target entry.
79 struct vlc_src_table {
int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
// Source rows for tbl0; tbl0_size is the row count (array bytes / row bytes).
80 vlc_src_table tbl0[] = {
83 size_t tbl0_size =
sizeof(tbl0) /
sizeof(vlc_src_table);
// Lookup with one slot per 4-bit pattern; it is indexed by a row's e_k below.
// NOTE(review): presumably holds the number of set bits of each pattern,
// filled by the 16-iteration loop that follows -- confirm.
85 si32 pattern_popcnt[16];
86 for (
ui32 i = 0; i < 16; ++i)
// Make tbl0 the active source table for the pass below.
89 vlc_src_table* src_tbl = tbl0;
91 size_t tbl_size = tbl0_size;
// Visit every 11-bit index (0..2047) of the target table.
92 for (
int i = 0; i < 2048; ++i)
// Unpack the index: c_q = bits 8 and above, rho = bits 4..7, emb = bits 0..3.
94 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
// True when emb has a bit that rho lacks, or when rho and c_q are both zero.
95 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
99 vlc_src_table *best_entry = NULL;
// Scan rows with u_off == 1 that match (c_q, rho) and whose e_k-masked emb
// equals e_1. Among them keep the row whose e_k scores highest in
// pattern_popcnt; ">=" lets a later row replace an earlier one on a tie.
103 for (
size_t j = 0; j < tbl_size; ++j)
105 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
106 if (src_tbl[j].u_off == 1)
107 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
111 int ones_count = pattern_popcnt[src_tbl[j].e_k];
112 if (ones_count >= best_e_k)
114 best_entry = src_tbl + j;
115 best_e_k = ones_count;
// Scan rows with u_off == 0 that match (c_q, rho) and record the match.
122 for (
size_t j = 0; j < tbl_size; ++j)
124 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
125 if (src_tbl[j].u_off == 0)
127 best_entry = src_tbl + j;
// Pack the chosen row into a ui16: cwd from bit 8 up, cwd_len from bit 4 up.
// NOTE(review): best_entry starts as NULL -- confirm a matching row always
// exists before this dereference.
133 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
// Source rows for tbl1; tbl1_size is the row count (array bytes / row bytes).
138 vlc_src_table tbl1[] = {
141 size_t tbl1_size =
sizeof(tbl1) /
sizeof(vlc_src_table);
// Switch the active row count to tbl1's before the second pass.
145 tbl_size = tbl1_size;
// Visit every 11-bit index (0..2047) of the target table.
146 for (
int i = 0; i < 2048; ++i)
// Unpack the index: c_q = bits 8 and above, rho = bits 4..7, emb = bits 0..3.
148 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
// True when emb has a bit that rho lacks, or when rho and c_q are both zero.
149 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
153 vlc_src_table *best_entry = NULL;
// Scan rows with u_off == 1 that match (c_q, rho) and whose e_k-masked emb
// equals e_1. Among them keep the row whose e_k scores highest in
// pattern_popcnt; ">=" lets a later row replace an earlier one on a tie.
157 for (
size_t j = 0; j < tbl_size; ++j)
159 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
160 if (src_tbl[j].u_off == 1)
161 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
165 int ones_count = pattern_popcnt[src_tbl[j].e_k];
166 if (ones_count >= best_e_k)
168 best_entry = src_tbl + j;
169 best_e_k = ones_count;
// Scan rows with u_off == 0 that match (c_q, rho) and record the match.
176 for (
size_t j = 0; j < tbl_size; ++j)
178 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
179 if (src_tbl[j].u_off == 0)
181 best_entry = src_tbl + j;
// Pack the chosen row into a ui16: cwd from bit 8 up, cwd_len from bit 4 up.
// NOTE(review): best_entry starts as NULL -- confirm a matching row always
// exists before this dereference.
187 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
// Outputs: eq_vec[i] and s_vec[i] are written for i = 0..3 below; rho_vec and
// e_qmax_vec are references that receive the combined results.
// ZERO and ONE are defined outside this block; presumably vectors with every
// 32-bit lane equal to 0 and 1 respectively -- confirm.
501 __m512i *eq_vec, __m512i *s_vec,
502 __m512i &rho_vec, __m512i &e_qmax_vec)
// Process each of the four source vectors independently.
510 for (
ui32 i = 0; i < 4; ++i) {
// Add each lane to itself, i.e. shift left by one; bit 31 falls off the top.
512 val_vec[i] = _mm512_add_epi32(src_vec[i], src_vec[i]);
// Logical right shift of every lane by p bits.
515 val_vec[i] = _mm512_srli_epi32(val_vec[i], p);
// Clear bit 0 of every lane.
518 val_vec[i] = _mm512_and_epi32(val_vec[i], _mm512_set1_epi32((
int)~1u));
// val_mask[i] has one bit per lane, set where the lane is still non-zero.
521 val_mask[i] = _mm512_cmpneq_epi32_mask(val_vec[i],
ZERO);
// In non-zero lanes subtract ONE; the remaining lanes take ZERO's value.
528 val_vec[i] = _mm512_mask_sub_epi32(
ZERO, val_mask[i], val_vec[i],
ONE);
// 32 minus the leading-zero count, i.e. the bit length of the lane value;
// lanes outside val_mask[i] take ZERO's value.
529 _eq_vec[i] = _mm512_mask_lzcnt_epi32(
ZERO, val_mask[i], val_vec[i]);
530 _eq_vec[i] = _mm512_mask_sub_epi32(
ZERO, val_mask[i],
531 _mm512_set1_epi32(32), _eq_vec[i]);
// Subtract ONE a second time in the non-zero lanes.
538 val_vec[i] = _mm512_mask_sub_epi32(
ZERO, val_mask[i], val_vec[i],
ONE);
// Bit 31 of the original source lane, kept only for non-zero lanes.
539 _s_vec[i] = _mm512_mask_srli_epi32(
ZERO, val_mask[i], src_vec[i], 31);
// Masked sum of that bit and the adjusted value.
541 _mm512_mask_add_epi32(
ZERO, val_mask[i], _s_vec[i], val_vec[i]);
// Turn each val_vec into a per-lane flag: ONE where the lane was non-zero,
// ZERO's value elsewhere.
545 val_vec[0] = _mm512_mask_mov_epi32(
ZERO, val_mask[0],
ONE);
546 val_vec[1] = _mm512_mask_mov_epi32(
ZERO, val_mask[1],
ONE);
547 val_vec[2] = _mm512_mask_mov_epi32(
ZERO, val_mask[2],
ONE);
548 val_vec[3] = _mm512_mask_mov_epi32(
ZERO, val_mask[3],
ONE);
// Permutation index vectors (arguments are listed from lane 15 down to lane 0):
// idx[0] picks the even source lanes 0,2,...,14 and idx[1] the odd source lanes
// 1,3,...,15; each pattern is repeated in the lower and upper eight lanes.
551 const __m512i idx[2] = {
552 _mm512_set_epi32(14, 12, 10, 8, 6, 4, 2, 0, 14, 12, 10, 8, 6, 4, 2, 0),
553 _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 15, 13, 11, 9, 7, 5, 3, 1),
// Regroup lanes for each of the four outputs.
567 for (
ui32 i = 0; i < 4; ++i) {
// o_idx alternates 0, 1, 0, 1 as i goes 0..3.
569 ui32 o_idx = i & 0x1;
// Permute the whole vector first, then overwrite the upper eight lanes
// (mask 0xFF00) with a second masked permute.
571 eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
572 eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
576 s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
577 s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
581 _rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
582 _rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
// Move the flag to bit position i so the four flags occupy distinct bits.
585 _rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
// Running lane-wise (signed) maximum over eq_vec[0..3].
587 e_qmax_vec = _mm512_max_epi32(e_qmax_vec, eq_vec[i]);
// OR the four shifted flag vectors into one value per lane.
590 rho_vec = _mm512_or_epi32(_rho_vec[0], _rho_vec[1]);
591 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[2]);
592 rho_vec = _mm512_or_epi32(rho_vec, _rho_vec[3]);
// Four-stage lane rearrangement of matrix[0..3], using _matrix[0..3] as scratch.
// NOTE(review): the unpack/shuffle pattern looks like a transposition of the
// four vectors -- confirm against the caller's expected layout.
// Stage 1: interleave 32-bit elements of the pairs (0,1) and (2,3).
611 _matrix[0] = _mm512_unpacklo_epi32(matrix[0], matrix[1]);
612 _matrix[1] = _mm512_unpackhi_epi32(matrix[0], matrix[1]);
613 _matrix[2] = _mm512_unpacklo_epi32(matrix[2], matrix[3]);
614 _matrix[3] = _mm512_unpackhi_epi32(matrix[2], matrix[3]);
// Stage 2: interleave 64-bit elements of the stage-1 results.
616 matrix[0] = _mm512_unpacklo_epi64(_matrix[0], _matrix[2]);
617 matrix[1] = _mm512_unpackhi_epi64(_matrix[0], _matrix[2]);
618 matrix[2] = _mm512_unpacklo_epi64(_matrix[1], _matrix[3]);
619 matrix[3] = _mm512_unpackhi_epi64(_matrix[1], _matrix[3]);
// Stage 3: 128-bit block shuffle; 0x88 takes blocks 0 and 2 of each operand,
// 0xDD takes blocks 1 and 3 of each operand.
621 _matrix[0] = _mm512_shuffle_i32x4(matrix[0], matrix[1], 0x88);
622 _matrix[1] = _mm512_shuffle_i32x4(matrix[2], matrix[3], 0x88);
623 _matrix[2] = _mm512_shuffle_i32x4(matrix[0], matrix[1], 0xDD);
624 _matrix[3] = _mm512_shuffle_i32x4(matrix[2], matrix[3], 0xDD);
// Stage 4: the same two block selections applied to the stage-3 results.
626 matrix[0] = _mm512_shuffle_i32x4(_matrix[0], _matrix[1], 0x88);
627 matrix[1] = _mm512_shuffle_i32x4(_matrix[2], _matrix[3], 0x88);
628 matrix[2] = _mm512_shuffle_i32x4(_matrix[0], _matrix[1], 0xDD);
629 matrix[3] = _mm512_shuffle_i32x4(_matrix[2], _matrix[3], 0xDD);
// For k = 0..3: m_vec[k] = uq_vec minus bit k of tuple_vec, in the lanes where
// bit k of rho_vec is set; all other lanes take ZERO's value.
// ZERO and ONE are defined outside this block; presumably vectors with every
// 32-bit lane equal to 0 and 1 respectively -- confirm.
// k = 0: bit 0 needs no shift.
642 auto tmp = _mm512_and_epi32(tuple_vec,
ONE);
643 tmp = _mm512_sub_epi32(uq_vec, tmp);
644 auto tmp1 = _mm512_and_epi32(rho_vec,
ONE);
645 auto mask = _mm512_cmpneq_epi32_mask(tmp1,
ZERO);
646 m_vec[0] = _mm512_mask_mov_epi32(
ZERO, mask, tmp);
// k = 1: isolate bit 1 and shift it down to bit 0 before subtracting.
649 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(2));
650 tmp = _mm512_srli_epi32(tmp, 1);
651 tmp = _mm512_sub_epi32(uq_vec, tmp);
652 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(2));
653 mask = _mm512_cmpneq_epi32_mask(tmp1,
ZERO);
654 m_vec[1] = _mm512_mask_mov_epi32(
ZERO, mask, tmp);
// k = 2: same steps with bit 2.
657 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(4));
658 tmp = _mm512_srli_epi32(tmp, 2);
659 tmp = _mm512_sub_epi32(uq_vec, tmp);
660 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
661 mask = _mm512_cmpneq_epi32_mask(tmp1,
ZERO);
662 m_vec[2] = _mm512_mask_mov_epi32(
ZERO, mask, tmp);
// k = 3: same steps with bit 3.
665 tmp = _mm512_and_epi32(tuple_vec, _mm512_set1_epi32(8));
666 tmp = _mm512_srli_epi32(tmp, 3);
667 tmp = _mm512_sub_epi32(uq_vec, tmp);
668 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
669 mask = _mm512_cmpneq_epi32_mask(tmp1,
ZERO);
670 m_vec[3] = _mm512_mask_mov_epi32(
ZERO, mask, tmp);
// For each of the four m_vec / s_vec pairs, spill lengths and values to the
// cwd_len and cwd arrays, then merge entries two at a time.
692 for (
ui32 i = 0; i < 4; ++i) {
// Aligned store of the 16 per-lane bit counts.
// NOTE(review): _mm512_store_epi32 expects a 64-byte aligned destination --
// confirm how cwd_len and cwd are declared.
696 _mm512_store_epi32(cwd_len, m_vec[i]);
// (1 << m) - 1 is a mask of the m low bits; keep only those bits of s_vec[i].
697 tmp = _mm512_sllv_epi32(
ONE, m_vec[i]);
698 tmp = _mm512_sub_epi32(tmp,
ONE);
699 tmp = _mm512_and_epi32(tmp, s_vec[i]);
700 _mm512_store_epi32(cwd, tmp);
// Eight iterations, each combining the entries at idx and idx + 1.
702 for (
ui32 j = 0; j < 8; ++j) {
// Append the second value above the first one's bits and add up the lengths.
705 _cwd_len = cwd_len[idx];
706 _cwd |= ((
ui64)cwd[idx + 1]) << _cwd_len;
707 _cwd_len += cwd_len[idx + 1];
// Builds a 4-bit value per lane: bit k is set when eq_vec[k] equals e_qmax_vec
// in that lane. Lanes where u_q_vec is not greater than zero (signed compare)
// take ZERO's value instead.
// ZERO and ONE are defined outside this block; presumably vectors with every
// 32-bit lane equal to 0 and 1 respectively -- confirm.
723 auto u_q_mask = _mm512_cmpgt_epi32_mask(u_q_vec,
ZERO);
// Bit 0: eq_vec[0] == e_qmax_vec.
725 auto mask = _mm512_cmpeq_epi32_mask(eq_vec[0], e_qmax_vec);
726 auto tmp = _mm512_mask_mov_epi32(
ZERO, mask,
ONE);
727 auto eps_vec = _mm512_mask_mov_epi32(
ZERO, u_q_mask, tmp);
// Bit 1: eq_vec[1] == e_qmax_vec.
729 mask = _mm512_cmpeq_epi32_mask(eq_vec[1], e_qmax_vec);
730 tmp = _mm512_mask_mov_epi32(
ZERO, mask,
ONE);
731 tmp = _mm512_slli_epi32(tmp, 1);
732 eps_vec = _mm512_mask_or_epi32(
ZERO, u_q_mask, eps_vec, tmp);
// Bit 2: eq_vec[2] == e_qmax_vec.
734 mask = _mm512_cmpeq_epi32_mask(eq_vec[2], e_qmax_vec);
735 tmp = _mm512_mask_mov_epi32(
ZERO, mask,
ONE);
736 tmp = _mm512_slli_epi32(tmp, 2);
737 eps_vec = _mm512_mask_or_epi32(
ZERO, u_q_mask, eps_vec, tmp);
// Bit 3: eq_vec[3] == e_qmax_vec; the final masked OR is the return value.
739 mask = _mm512_cmpeq_epi32_mask(eq_vec[3], e_qmax_vec);
740 tmp = _mm512_mask_mov_epi32(
ZERO, mask,
ONE);
741 tmp = _mm512_slli_epi32(tmp, 3);
743 return _mm512_mask_or_epi32(
ZERO, u_q_mask, eps_vec, tmp);
// right_shift: index vector used for every lane permutation below.
808 const __m512i right_shift)
// Permute the entries at x and x + 1 of cx_val_vec by right_shift.
812 auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
813 auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
// Permute lcxp1_vec once more; lanes 14 and 15 (mask 0xC000) are taken from
// the permuted lcxp2_vec instead. The result is then multiplied by 4.
814 auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
815 tmp = _mm512_mask_permutexvar_epi32(tmp, 0xC000, right_shift, lcxp2_vec);
816 tmp = _mm512_slli_epi32(tmp, 2);
// lcxp1_vec with lane 15 (mask 0x8000) replaced by lcxp2_vec's lane 15,
// added to the scaled value above.
817 auto tmp1 = _mm512_mask_mov_epi32(lcxp1_vec, 0x8000, lcxp2_vec);
818 tmp = _mm512_add_epi32(tmp1, tmp);
// Bit 2 of rho_vec moved down to bit 1 and ORed in.
820 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(4));
821 tmp1 = _mm512_srli_epi32(tmp1, 1);
822 tmp = _mm512_or_epi32(tmp, tmp1);
// Bit 3 of rho_vec also moved down to bit 1 and ORed in.
824 tmp1 = _mm512_and_epi32(rho_vec, _mm512_set1_epi32(8));
825 tmp1 = _mm512_srli_epi32(tmp1, 2);
827 return _mm512_or_epi32(tmp, tmp1);
// rho_vec and u_q_vec are tested lane by lane below; ignore shortens the lane
// loop (see i_max); right_shift is the index vector for the lane permutation.
// ZERO is defined outside this block; presumably an all-zero vector -- confirm.
833 __m512i &rho_vec, __m512i u_q_vec,
ui32 ignore,
834 const __m512i right_shift)
// One mask bit per lane: set where cq_vec is zero.
838 auto mel_need_encode = _mm512_cmpeq_epi32_mask(cq_vec,
ZERO);
// One mask bit per lane: set where rho_vec is non-zero.
840 auto mel_bit = _mm512_cmpneq_epi32_mask(rho_vec,
ZERO);
// tmp is u_q_vec permuted by right_shift; tmp1 is the lane-wise minimum of
// u_q_vec and that permuted copy.
844 auto tmp = _mm512_permutexvar_epi32(right_shift, u_q_vec);
845 auto tmp1 = _mm512_min_epi32(u_q_vec, tmp);
// Lanes where that minimum is greater than 2 (signed compare).
846 auto mel_bit2 = (
ui16)_mm512_cmpgt_epi32_mask(tmp1, _mm512_set1_epi32(2));
// Lanes where u_q_vec is greater than zero ...
849 auto mel_need_encode2 = (
ui16)_mm512_cmpgt_epi32_mask(u_q_vec,
ZERO);
// ... ANDed with the lanes where the permuted copy is greater than zero.
851 mel_need_encode2 & (
ui16)_mm512_cmpgt_epi32_mask(tmp,
ZERO);
// Number of lanes to visit: 16 minus half of ignore.
853 ui32 i_max = 16 - (ignore / 2);
// Lanes are handled in pairs (i and i + 1), testing the mask bits above.
855 for (
ui32 i = 0; i < i_max; i += 2) {
857 if (0 != (mel_need_encode & mask)) {
// Mask bit for the second lane of the pair.
862 auto mask = 1 << (i + 1);
863 if (0 != (mel_need_encode & mask)) {
868 if (0 != (mel_need_encode2 & mask)) {
// Round _width up to a multiple of 32; ignore is the number of padding columns.
// ZERO and ONE are defined outside this block; presumably vectors with every
// 32-bit lane equal to 0 and 1 respectively -- confirm.
1015 ui32 width = (_width + 31) & ~31u;
1016 ui32 ignore = width - _width;
// Stack buffer sizes in bytes. vlc_size is what remains of mel_vlc_size after
// the first mel_size bytes.
1017 const int ms_size = (16384 * 16 + 14) / 15;
1018 const int mel_vlc_size = 3072;
1019 const int mel_size = 192;
1020 const int vlc_size = mel_vlc_size - mel_size;
// mel_buf and vlc_buf share one array: mel_buf is its start and vlc_buf begins
// mel_size bytes in.
1022 ui8 ms_buf[ms_size];
1023 ui8 mel_vlc_buf[mel_vlc_size];
1024 ui8 *mel_buf = mel_vlc_buf;
1025 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1032 ms_init(&ms, ms_size, ms_buf);
// Shift amount handed to proc_pixel.
1034 ui32 p = 30 - missing_msbs;
// Permutation index vectors (arguments are listed from lane 15 down to lane 0).
// right_shift: lane i reads source lane i + 1, and lane 15 reads lane 0.
1045 const __m512i right_shift = _mm512_set_epi32(
1046 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
// left_shift: lane i reads source lane i - 1, and lane 0 reads lane 15.
1049 const __m512i left_shift = _mm512_set_epi32(
1050 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15
// 33 slots, of which the first 32 are cleared here; slot n_loop is assigned at
// the top of every row-pair iteration.
// NOTE(review): with 33 slots n_loop must not exceed 32 (width <= 1024) --
// confirm the caller guarantees this.
1053 __m512i e_val_vec[33];
1054 for (
ui32 i = 0; i < 32; ++i) {
1055 e_val_vec[i] =
ZERO;
1057 __m512i prev_e_val_vec =
ZERO;
1059 __m512i cx_val_vec[33];
1060 __m512i prev_cx_val_vec =
ZERO;
1062 __m512i prev_cq_vec =
ZERO;
// Number of 32-column groups per row.
1074 ui32 n_loop = (width + 31) / 32;
// Rows are consumed two at a time.
1082 for (
ui32 y = 0; y < height; y += 2)
// Store the carried-over values in slot n_loop; for cx_val_vec only bit 3 of
// prev_cx_val_vec is kept, moved down to bit 0.
1084 e_val_vec[n_loop] = prev_e_val_vec;
1086 tmp = _mm512_and_epi32(prev_cx_val_vec, _mm512_set1_epi32(8));
1087 tmp = _mm512_srli_epi32(tmp, 3);
1088 cx_val_vec[n_loop] = tmp;
// Reset the carried-over state for this row pair.
1090 prev_e_val_vec =
ZERO;
1091 prev_cx_val_vec =
ZERO;
// First sample of row y.
1093 ui32 *sp = buf + y * stride;
1096 for (
ui32 x = 0; x < n_loop; ++x) {
// Build a 32-bit load mask: when the group runs past _width, the top
// `entries` bits are cleared so those columns are not loaded.
// NOTE(review): a shift by 32 or more would be undefined -- confirm that
// true_x is always below _width.
1100 ui32 mask32 = 0xFFFFFFFFu;
1101 si32 entries = true_x + 32 - (
si32)_width;
1102 mask32 >>= ((entries >= 0) ? entries : 0);
// Low 16 mask bits cover columns 0..15, high 16 bits cover columns 16..31.
1103 __mmask16 load_mask0 = _cvtu32_mask16(mask32);
1104 __mmask16 load_mask1 = _cvtu32_mask16(mask32 >> 16);
// Masked unaligned loads from row y; masked-off lanes are zeroed.
1107 src_vec[0] = _mm512_maskz_loadu_epi32(load_mask0, sp);
1108 src_vec[2] = _mm512_maskz_loadu_epi32(load_mask1, sp + 16);
// The second row (sp + stride) is loaded only when it exists.
1110 if (y + 1 < height) {
1111 src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
1113 _mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
1126 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
// tmp is e_val_vec[x] permuted by right_shift, with lane 15 (mask 0x8000)
// supplied by a second masked permute.
1129 tmp = _mm512_permutexvar_epi32(right_shift, e_val_vec[x]);
1130 tmp = _mm512_mask_permutexvar_epi32(tmp, 0x8000, right_shift,
// max_e_vec = lane-wise larger of e_val_vec[x] and tmp, minus ONE.
1132 auto mask = _mm512_cmpgt_epi32_mask(e_val_vec[x], tmp);
1133 auto max_e_vec = _mm512_mask_mov_epi32(tmp, mask, e_val_vec[x]);
1134 max_e_vec = _mm512_sub_epi32(max_e_vec,
ONE);
// rho & (rho - 1) is non-zero only when rho has more than one bit set; in those
// lanes kappa_vec = max(max_e_vec, ONE), and in all other lanes it is ONE.
1137 tmp = _mm512_max_epi32(max_e_vec,
ONE);
1138 tmp1 = _mm512_sub_epi32(rho_vec,
ONE);
1139 tmp1 = _mm512_and_epi32(rho_vec, tmp1);
1140 mask = _mm512_cmpneq_epi32_mask(tmp1,
ZERO);
1141 kappa_vec = _mm512_mask_mov_epi32(
ONE, mask, tmp);
// cq_vec: lanes 1..15 (mask 0xFFFE) come from a permutation, lane 0 is kept
// from prev_cq_vec.
1146 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1147 auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
// New prev_cq_vec: only lane 0 (mask 0x1) is taken from the left_shift
// permutation; the other lanes take ZERO's value.
1149 prev_cq_vec = _mm512_mask_permutexvar_epi32(
ZERO, 0x1, left_shift,
1152 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1153 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
// uq_vec = max(kappa_vec, e_qmax_vec); u_q_vec = uq_vec - kappa_vec, which is
// therefore never negative.
1157 auto uq_vec = _mm512_max_epi32(kappa_vec, e_qmax_vec);
1158 auto u_q_vec = _mm512_sub_epi32(uq_vec, kappa_vec);
1160 auto eps_vec =
cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1161 __m512i tuple_vec =
cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
// Padding columns only matter in the last group of the row.
1162 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1164 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
// Drop the low four bits of each tuple, then spill both vectors to the tuple
// and u_q arrays for the scalar VLC encoder.
// NOTE(review): _mm512_store_epi32 expects 64-byte aligned destinations --
// confirm how tuple and u_q are declared.
1177 tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
1178 _mm512_store_epi32(tuple, tuple_vec);
1179 _mm512_store_epi32(u_q, u_q_vec);
1180 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
// Rebuild prev_cq_vec from cx_val_vec[0]: only lane 0 (mask 0x1) receives
// 4 * permuted value + cx_val_vec[0]; every other lane is zeroed.
1183 tmp = _mm512_permutexvar_epi32(right_shift, cx_val_vec[0]);
1184 tmp = _mm512_slli_epi32(tmp, 2);
1185 prev_cq_vec = _mm512_maskz_add_epi32(0x1, tmp, cx_val_vec[0]);
// Total length is the sum of the mel, vlc and ms positions.
1197 lengths[0] = mel.
pos + vlc.
pos + ms.
pos;
// Write num_bytes into the last two bytes: bits 4 and above go to the final
// byte, and the low four bits replace the low nibble of the byte before it,
// whose high nibble is preserved.
1205 coded->
buf[lengths[0]-1] = (
ui8)(num_bytes >> 4);
1206 coded->
buf[lengths[0]-2] = coded->
buf[lengths[0]-2] & 0xF0;
1207 coded->
buf[lengths[0]-2] =
1208 (
ui8)(coded->
buf[lengths[0]-2] | (num_bytes & 0xF));