#include <climits>        // INT_MIN / INT_MAX, used for the clipping limits
#include <cassert>        // assert
#include <wasm_simd128.h>
// The project headers that define ui32/si32/si64, line_buf, and the
// ALPHA_*/BETA_*/GAMMA_* colour-transform constants are assumed to be
// included before this point.
//////////////////////////////////////////////////////////////////////////
// Rounds each float lane to the nearest integer, ties away from zero,
// then converts to signed 32-bit integers with saturation.
static v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half)
{
  v128_t c = wasm_f32x4_ge(a, zero);    // lane mask: all ones where a >= 0
  v128_t p = wasm_f32x4_add(a, half);   // candidate for non-negative lanes
  v128_t n = wasm_f32x4_sub(a, half);   // candidate for negative lanes
  v128_t d = wasm_v128_and(c, p);       // keep p where a >= 0
  v128_t e = wasm_v128_andnot(n, c);    // keep n where a < 0
  v128_t v = wasm_v128_or(d, e);        // branchless select of p or n
  return wasm_i32x4_trunc_sat_f32x4(v);
}
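// Scalar equivalent of the helper above (a sketch for clarity; it ignores
// the saturation that wasm_i32x4_trunc_sat_f32x4 provides):
//   si32 round_away(float a) {
//     return (si32)(a >= 0.0f ? a + 0.5f : a - 0.5f); // trunc toward zero
//   }
// The and/andnot/or triple is the usual SIMD idiom for `mask ? p : n`.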
//////////////////////////////////////////////////////////////////////////
// Adds `shift` to every sample while copying a line, covering the three
// 32/64-bit source/destination combinations.
void wasm_rev_convert(const line_buf *src_line,
                      const ui32 src_line_offset,
                      line_buf *dst_line,
                      const ui32 dst_line_offset,
                      si64 shift, ui32 width)
{
  if (src_line->flags & line_buf::LFT_32BIT)   // branch tests reconstructed;
  {                                            // the fragment elides them
    if (dst_line->flags & line_buf::LFT_32BIT)
    {
      // 32-bit source, 32-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si32 *dp = dst_line->i32 + dst_line_offset;
      v128_t sh = wasm_i32x4_splat((si32)shift);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        v128_t s = wasm_v128_load(sp);
        s = wasm_i32x4_add(s, sh);
        wasm_v128_store(dp, s);
      }
    }
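    // Scalar equivalent of the 32-bit path (sketch): for each k in
    // [0, width):  dp[k] = sp[k] + (si32)shift;
    // Four samples are processed per iteration and width is rounded up,
    // so line buffers are expected to be padded to a multiple of four.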
    else
    {
      // 32-bit source, 64-bit destination: widen, then add
      const si32 *sp = src_line->i32 + src_line_offset;
      si64 *dp = dst_line->i64 + dst_line_offset;
      v128_t sh = wasm_i64x2_splat(shift);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        v128_t s, t;
        s = wasm_v128_load(sp);

        t = wasm_i64x2_extend_low_i32x4(s);   // lower two lanes -> si64
        t = wasm_i64x2_add(t, sh);
        wasm_v128_store(dp, t);

        t = wasm_i64x2_extend_high_i32x4(s);  // upper two lanes -> si64
        t = wasm_i64x2_add(t, sh);
        wasm_v128_store(dp + 2, t);
      }
    }
  }
  else
  {
    // 64-bit source, 32-bit destination: add, then narrow
    const si64 *sp = src_line->i64 + src_line_offset;
    si32 *dp = dst_line->i32 + dst_line_offset;
    v128_t sh = wasm_i64x2_splat(shift);
    for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
    {
      v128_t s0, s1;
      s0 = wasm_v128_load(sp);
      s0 = wasm_i64x2_add(s0, sh);
      s1 = wasm_v128_load(sp + 2);
      s1 = wasm_i64x2_add(s1, sh);
      // keep the low 32 bits of each 64-bit lane
      s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2);
      wasm_v128_store(dp, s0);
    }
  }
}
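// The shuffle picks 32-bit lanes 0 and 2 of each operand, i.e. the low
// word of every 64-bit lane (little-endian lane order), so the store
// writes the four narrowed samples in source order. A scalar sketch:
//   dp[k] = (si32)(sp[k] + shift);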
//////////////////////////////////////////////////////////////////////////
// Reversible NLT type-3 mapping: negative samples x become -shift - x,
// non-negative samples pass through unchanged.
void wasm_rev_convert_nlt_type3(const line_buf *src_line,
                                const ui32 src_line_offset,
                                line_buf *dst_line,
                                const ui32 dst_line_offset,
                                si64 shift, ui32 width)
{
  if (src_line->flags & line_buf::LFT_32BIT)   // branch tests reconstructed;
  {                                            // the fragment elides them
    if (dst_line->flags & line_buf::LFT_32BIT)
    {
      // 32-bit source, 32-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si32 *dp = dst_line->i32 + dst_line_offset;
      v128_t sh = wasm_i32x4_splat((si32)(-shift));
      v128_t zero = wasm_i32x4_splat(0);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        v128_t s = wasm_v128_load(sp);
        v128_t c = wasm_i32x4_lt(s, zero);      // mask of negative lanes
        v128_t v_m_sh = wasm_i32x4_sub(sh, s);  // -shift - s
        v_m_sh = wasm_v128_and(c, v_m_sh);      // keep where s < 0
        s = wasm_v128_andnot(s, c);             // keep s where s >= 0
        s = wasm_v128_or(s, v_m_sh);
        wasm_v128_store(dp, s);
      }
    }
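    // Scalar equivalent of the mapping (sketch):
    //   dp[k] = (sp[k] < 0) ? (si32)(-shift) - sp[k] : sp[k];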
    else
    {
      // 32-bit source, 64-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si64 *dp = dst_line->i64 + dst_line_offset;
      v128_t sh = wasm_i64x2_splat(-shift);
      v128_t zero = wasm_i32x4_splat(0);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        v128_t s, u, c, v_m_sh;
        s = wasm_v128_load(sp);

        u = wasm_i64x2_extend_low_i32x4(s);
        c = wasm_i64x2_lt(u, zero);
        v_m_sh = wasm_i64x2_sub(sh, u);
        v_m_sh = wasm_v128_and(c, v_m_sh);
        u = wasm_v128_andnot(u, c);
        u = wasm_v128_or(u, v_m_sh);
        wasm_v128_store(dp, u);

        u = wasm_i64x2_extend_high_i32x4(s);
        c = wasm_i64x2_lt(u, zero);
        v_m_sh = wasm_i64x2_sub(sh, u);
        v_m_sh = wasm_v128_and(c, v_m_sh);
        u = wasm_v128_andnot(u, c);
        u = wasm_v128_or(u, v_m_sh);
        wasm_v128_store(dp + 2, u);
      }
    }
  }
  else
  {
    // 64-bit source, 32-bit destination
    const si64 *sp = src_line->i64 + src_line_offset;
    si32 *dp = dst_line->i32 + dst_line_offset;
    v128_t sh = wasm_i64x2_splat(-shift);
    v128_t zero = wasm_i32x4_splat(0);
    for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
    {
      v128_t s, t0, t1, p, n, m, tm;
      s = wasm_v128_load(sp);
      m = wasm_i64x2_lt(s, zero);
      tm = wasm_i64x2_sub(sh, s);
      n = wasm_v128_and(m, tm);
      p = wasm_v128_andnot(s, m);
      t0 = wasm_v128_or(n, p);

      s = wasm_v128_load(sp + 2);
      m = wasm_i64x2_lt(s, zero);
      tm = wasm_i64x2_sub(sh, s);
      n = wasm_v128_and(m, tm);
      p = wasm_v128_andnot(s, m);
      t1 = wasm_v128_or(n, p);

      // narrow back to four 32-bit samples in source order
      t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2);
      wasm_v128_store(dp, t0);
    }
  }
}
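// The two 64-bit paths above apply the same scalar mapping as the 32-bit
// path, only at 64-bit precision, widening on load or narrowing on store.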
//////////////////////////////////////////////////////////////////////////
// Integer-to-float conversion with the irreversible path's -0.5 shift.
void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp,
                                    float mul, ui32 width)
{
  v128_t shift = wasm_f32x4_splat(0.5f);
  v128_t m = wasm_f32x4_splat(mul);
  for (ui32 i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    v128_t t = wasm_v128_load(sp);
    v128_t s = wasm_f32x4_convert_i32x4(t);
    s = wasm_f32x4_mul(s, m);
    s = wasm_f32x4_sub(s, shift);
    wasm_v128_store(dp, s);
  }
}
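// Scalar sketch: dp[k] = (float)sp[k] * mul - 0.5f; with mul presumably
// 1 / 2^bit_depth, this maps unsigned samples into roughly [-0.5, 0.5).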
//////////////////////////////////////////////////////////////////////////
// Integer-to-float conversion without the level shift.
void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp,
                              float mul, ui32 width)
{
  v128_t m = wasm_f32x4_splat(mul);
  for (ui32 i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    v128_t t = wasm_v128_load(sp);
    v128_t s = wasm_f32x4_convert_i32x4(t);
    s = wasm_f32x4_mul(s, m);
    wasm_v128_store(dp, s);
  }
}
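// Scalar sketch: dp[k] = (float)sp[k] * mul; used where no level shift
// is needed.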
//////////////////////////////////////////////////////////////////////////
// Float-to-integer conversion that undoes the -0.5 level shift.
void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp,
                                    float mul, ui32 width)
{
  const v128_t zero = wasm_f32x4_splat(0.0f);
  const v128_t half = wasm_f32x4_splat(0.5f);
  v128_t m = wasm_f32x4_splat(mul);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    v128_t t = wasm_v128_load(sp);
    v128_t s = wasm_f32x4_add(t, half);
    s = wasm_f32x4_mul(s, m);
    s = wasm_f32x4_add(s, half);
    // (elided in this fragment: truncation of the rounded result to si32
    //  and the store, presumably via wasm_i32x4_trunc_sat_f32x4 or the
    //  ojph_convert_float_to_i32 helper above, which would use `zero`)
  }
}
//////////////////////////////////////////////////////////////////////////
// Float-to-integer conversion without the level shift.
void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp,
                              float mul, ui32 width)
{
  const v128_t zero = wasm_f32x4_splat(0.0f);
  const v128_t half = wasm_f32x4_splat(0.5f);
  v128_t m = wasm_f32x4_splat(mul);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    v128_t t = wasm_v128_load(sp);
    v128_t s = wasm_f32x4_mul(t, m);
    s = wasm_f32x4_add(s, half);
    // (elided in this fragment: truncation to si32 and the store, as in
    //  the shifted variant above)
  }
}
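// Scalar sketches of the two functions above (assuming they mirror the
// library's generic versions): the shifted form computes
//   dp[k] = round((sp[k] + 0.5f) * mul);
// and the plain form
//   dp[k] = round(sp[k] * mul);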
//////////////////////////////////////////////////////////////////////////
// Per-lane select: returns a where x >= y, else b (float comparison,
// arbitrary 32-bit payload).
static v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
{
  v128_t c = wasm_f32x4_ge(x, y);
  v128_t d = wasm_v128_and(c, a);
  v128_t e = wasm_v128_andnot(b, c);
  return wasm_v128_or(d, e);
}

//////////////////////////////////////////////////////////////////////////
// Per-lane select: returns a where x < y, else b.
static v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
{
  v128_t c = wasm_f32x4_lt(x, y);
  v128_t d = wasm_v128_and(c, a);
  v128_t e = wasm_v128_andnot(b, c);
  return wasm_v128_or(d, e);
}
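// In other words, per lane:
//   max_ge(a, b, x, y) == (x >= y) ? a : b
//   min_lt(a, b, x, y) == (x <  y) ? a : b
// These appear designed to let the conversion code below clamp an
// already-converted integer against its bit-depth limits based on a
// float comparison.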
//////////////////////////////////////////////////////////////////////////
template <bool NLT_TYPE3>
static void local_wasm_irv_convert_to_integer(const line_buf *src_line,
  line_buf *dst_line, ui32 dst_line_offset,
  ui32 bit_depth, bool is_signed, ui32 width)
{
  // (elided in this fragment: asserts on the line_buf types)
  assert(bit_depth <= 32);
  const float* sp = src_line->f32;
  si32* dp = dst_line->i32 + dst_line_offset;

  // Scaling by 2^bit_depth can exceed the 32-bit integer range, so the
  // result is clipped: floats outside [neg_limit, -neg_limit) map to the
  // extreme values representable at this bit depth.
  si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
  v128_t mul = wasm_f32x4_splat((float)(1ull << bit_depth));
  v128_t fl_up_lim = wasm_f32x4_splat(-(float)neg_limit);  // val < upper
  v128_t fl_low_lim = wasm_f32x4_splat((float)neg_limit);  // val >= lower
  v128_t s32_up_lim = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
  v128_t s32_low_lim = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
  if (is_signed && NLT_TYPE3)   // branch condition reconstructed; the
  {                             // fragment elides it
    const v128_t zero = wasm_f32x4_splat(0.0f);
    const v128_t half = wasm_f32x4_splat(0.5f);
    v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      v128_t t = wasm_v128_load(sp);
      t = wasm_f32x4_mul(t, mul);
      // The next three lines are reconstructed (elided in the fragment):
      // round to i32, then clip to the bit-depth limits declared above.
      v128_t u = ojph_convert_float_to_i32(t, zero, half);
      u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
      u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
      // NLT type 3: map negative samples u to bias - u; `zero` is an
      // all-zero-bits f32 vector, so the integer compare is valid
      v128_t c = wasm_i32x4_gt(zero, u);     // mask: u < 0
      v128_t neg = wasm_i32x4_sub(bias, u);
      neg = wasm_v128_and(c, neg);           // keep where negative
      u = wasm_v128_andnot(u, c);            // keep u where non-negative
      u = wasm_v128_or(neg, u);
      wasm_v128_store(dp, u);
    }
  }
  else
  {
    const v128_t zero = wasm_f32x4_splat(0.0f);
    const v128_t half = wasm_f32x4_splat(0.5f);
    v128_t ihalf = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      v128_t t = wasm_v128_load(sp);
      t = wasm_f32x4_mul(t, mul);
      // Reconstructed as above (elided in the fragment): round and clip.
      v128_t u = ojph_convert_float_to_i32(t, zero, half);
      u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
      u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
      u = wasm_i32x4_add(u, ihalf);   // add half the range (level shift)
      wasm_v128_store(dp, u);
    }
  }
}
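// Worked example: for bit_depth = 12, mul = 4096.0f and the integer limits
// are s32_up_lim = 2047 and s32_low_lim = -2048 (INT_MAX >> 20 and
// INT_MIN >> 20); a float sample of 0.25f scales to 1024 before any level
// shift or NLT mapping.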
//////////////////////////////////////////////////////////////////////////
void wasm_irv_convert_to_integer(const line_buf *src_line,
  line_buf *dst_line, ui32 dst_line_offset,
  ui32 bit_depth, bool is_signed, ui32 width)
{
  local_wasm_irv_convert_to_integer<false>(src_line, dst_line,
    dst_line_offset, bit_depth, is_signed, width);
}

//////////////////////////////////////////////////////////////////////////
void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
  line_buf *dst_line, ui32 dst_line_offset,
  ui32 bit_depth, bool is_signed, ui32 width)
{
  local_wasm_irv_convert_to_integer<true>(src_line, dst_line,
    dst_line_offset, bit_depth, is_signed, width);
}
//////////////////////////////////////////////////////////////////////////
template <bool NLT_TYPE3>
static void local_wasm_irv_convert_to_float(const line_buf *src_line,
  ui32 src_line_offset, line_buf *dst_line,
  ui32 bit_depth, bool is_signed, ui32 width)
{
  // (elided in this fragment: asserts on the line_buf types)
  assert(bit_depth <= 32);
  v128_t mul = wasm_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth)));

  const si32* sp = src_line->i32 + src_line_offset;
  float* dp = dst_line->f32;

  if (is_signed && NLT_TYPE3)   // branch condition reconstructed; the
  {                             // fragment elides it
    v128_t zero = wasm_i32x4_splat(0);
    v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      v128_t t = wasm_v128_load(sp);
      // NLT type 3: map negative samples t to bias - t
      v128_t c = wasm_i32x4_lt(t, zero);
      v128_t neg = wasm_i32x4_sub(bias, t);
      neg = wasm_v128_and(c, neg);
      c = wasm_v128_andnot(t, c);
      t = wasm_v128_or(neg, c);

      v128_t v = wasm_f32x4_convert_i32x4(t);
      v = wasm_f32x4_mul(v, mul);
      wasm_v128_store(dp, v);
    }
  }
  else
  {
    v128_t half = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      v128_t t = wasm_v128_load(sp);
      t = wasm_i32x4_sub(t, half);   // remove the level shift
      v128_t v = wasm_f32x4_convert_i32x4(t);
      v = wasm_f32x4_mul(v, mul);
      wasm_v128_store(dp, v);
    }
  }
}
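// Worked example: with bit_depth = 12, mul = 1/4096.0f; an unsigned sample
// of 3072 maps to (3072 - 2048) / 4096 = 0.25f, the inverse of the
// to-integer path above.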
//////////////////////////////////////////////////////////////////////////
void wasm_irv_convert_to_float(const line_buf *src_line,
  ui32 src_line_offset, line_buf *dst_line,
  ui32 bit_depth, bool is_signed, ui32 width)
{
  local_wasm_irv_convert_to_float<false>(src_line, src_line_offset,
    dst_line, bit_depth, is_signed, width);
}

//////////////////////////////////////////////////////////////////////////
void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line,
  ui32 src_line_offset, line_buf *dst_line,
  ui32 bit_depth, bool is_signed, ui32 width)
{
  local_wasm_irv_convert_to_float<true>(src_line, src_line_offset,
    dst_line, bit_depth, is_signed, width);
}
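// The reversible colour transform (RCT) of JPEG 2000 computes, per sample:
//   y  = (r + 2*g + b) >> 2;   cb = b - g;   cr = r - g;
// using only integer arithmetic, so it is exactly invertible.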
//////////////////////////////////////////////////////////////////////////
void wasm_rct_forward(const line_buf *r, const line_buf *g,
                      const line_buf *b, line_buf *y, line_buf *cb,
                      line_buf *cr, ui32 repeat)
{
  // (elided in this fragment: asserts, the rp/gp/bp and yp/cbp/crp
  //  pointer setup, and the flag test selecting between the two paths)

  // 32-bit path
  {
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      v128_t mr = wasm_v128_load(rp);
      v128_t mg = wasm_v128_load(gp);
      v128_t mb = wasm_v128_load(bp);
      v128_t t = wasm_i32x4_add(mr, mb);
      t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1));   // r + 2g + b
      wasm_v128_store(yp, wasm_i32x4_shr(t, 2));      // y = (...) >> 2
      t = wasm_i32x4_sub(mb, mg);
      wasm_v128_store(cbp, t);                        // cb = b - g
      t = wasm_i32x4_sub(mr, mg);
      wasm_v128_store(crp, t);                        // cr = r - g

      rp += 4; gp += 4; bp += 4;
      yp += 4; cbp += 4; crp += 4;
    }
  }
  // 64-bit destination path (pointer setup elided in this fragment)
  {
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      v128_t mr32 = wasm_v128_load(rp);
      v128_t mg32 = wasm_v128_load(gp);
      v128_t mb32 = wasm_v128_load(bp);
      v128_t mr, mg, mb, t;

      // lower two samples, widened to 64 bits
      mr = wasm_i64x2_extend_low_i32x4(mr32);
      mg = wasm_i64x2_extend_low_i32x4(mg32);
      mb = wasm_i64x2_extend_low_i32x4(mb32);
      t = wasm_i64x2_add(mr, mb);
      t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
      wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
      t = wasm_i64x2_sub(mb, mg);
      wasm_v128_store(cbp, t);
      t = wasm_i64x2_sub(mr, mg);
      wasm_v128_store(crp, t);
      yp += 2; cbp += 2; crp += 2;

      // upper two samples
      mr = wasm_i64x2_extend_high_i32x4(mr32);
      mg = wasm_i64x2_extend_high_i32x4(mg32);
      mb = wasm_i64x2_extend_high_i32x4(mb32);
      t = wasm_i64x2_add(mr, mb);
      t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
      wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
      t = wasm_i64x2_sub(mb, mg);
      wasm_v128_store(cbp, t);
      t = wasm_i64x2_sub(mr, mg);
      wasm_v128_store(crp, t);

      rp += 4; gp += 4; bp += 4;
      yp += 2; cbp += 2; crp += 2;
    }
  }
}
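// The inverse RCT recovers the original channels exactly:
//   g = y - ((cb + cr) >> 2);   b = cb + g;   r = cr + g;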
//////////////////////////////////////////////////////////////////////////
void wasm_rct_backward(const line_buf *y, const line_buf *cb,
                       const line_buf *cr, line_buf *r, line_buf *g,
                       line_buf *b, ui32 repeat)
{
  // (elided in this fragment: asserts, the yp/cbp/crp and rp/gp/bp
  //  pointer setup, and the flag test selecting between the two paths)

  // 32-bit path
  {
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      v128_t my = wasm_v128_load(yp);
      v128_t mcb = wasm_v128_load(cbp);
      v128_t mcr = wasm_v128_load(crp);

      v128_t t = wasm_i32x4_add(mcb, mcr);
      t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2));  // g = y - ((cb+cr)>>2)
      wasm_v128_store(gp, t);
      v128_t u = wasm_i32x4_add(mcb, t);             // b = cb + g
      wasm_v128_store(bp, u);
      u = wasm_i32x4_add(mcr, t);                    // r = cr + g
      wasm_v128_store(rp, u);

      yp += 4; cbp += 4; crp += 4;
      rp += 4; gp += 4; bp += 4;
    }
  }
  // 64-bit source path (pointer setup elided in this fragment)
  {
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1;
      my = wasm_v128_load(yp);
      mcb = wasm_v128_load(cbp);
      mcr = wasm_v128_load(crp);

      // lower two samples
      tg0 = wasm_i64x2_add(mcb, mcr);
      tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2));
      tb0 = wasm_i64x2_add(mcb, tg0);
      tr0 = wasm_i64x2_add(mcr, tg0);
      yp += 2; cbp += 2; crp += 2;

      // upper two samples
      my = wasm_v128_load(yp);
      mcb = wasm_v128_load(cbp);
      mcr = wasm_v128_load(crp);
      tg1 = wasm_i64x2_add(mcb, mcr);
      tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2));
      tb1 = wasm_i64x2_add(mcb, tg1);
      tr1 = wasm_i64x2_add(mcr, tg1);

      // narrow the 64-bit results back to four 32-bit samples
      tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2);
      tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2);
      tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2);

      wasm_v128_store(rp, tr0);
      wasm_v128_store(gp, tg0);
      wasm_v128_store(bp, tb0);

      yp += 2; cbp += 2; crp += 2;
      rp += 4; gp += 4; bp += 4;
    }
  }
}
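// The irreversible colour transform (ICT) is the standard RGB-to-YCbCr
// matrix of JPEG 2000:
//   y  = ALPHA_RF*r + ALPHA_GF*g + ALPHA_BF*b
//   cb = BETA_CbF * (b - y)
//   cr = BETA_CrF * (r - y)
// with the usual coefficient values 0.299, 0.587, 0.114 for the alphas.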
//////////////////////////////////////////////////////////////////////////
void wasm_ict_forward(const float *r, const float *g, const float *b,
                      float *y, float *cb, float *cr, ui32 repeat)
{
  // Coefficient broadcasts reconstructed from the ALPHA_*/BETA_*
  // constants (the fragment elides these lines)
  v128_t alpha_rf = wasm_f32x4_splat(ALPHA_RF);
  v128_t alpha_gf = wasm_f32x4_splat(ALPHA_GF);
  v128_t alpha_bf = wasm_f32x4_splat(ALPHA_BF);
  v128_t beta_cbf = wasm_f32x4_splat(BETA_CbF);
  v128_t beta_crf = wasm_f32x4_splat(BETA_CrF);
  for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
  {
    v128_t mr = wasm_v128_load(r);
    v128_t mb = wasm_v128_load(b);
    v128_t my = wasm_f32x4_mul(alpha_rf, mr);
    my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_gf, wasm_v128_load(g)));
    my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_bf, mb));
    wasm_v128_store(y, my);
    wasm_v128_store(cb, wasm_f32x4_mul(beta_cbf, wasm_f32x4_sub(mb, my)));
    wasm_v128_store(cr, wasm_f32x4_mul(beta_crf, wasm_f32x4_sub(mr, my)));

    r += 4; g += 4; b += 4;
    y += 4; cb += 4; cr += 4;
  }
}
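// The inverse ICT, with the standard coefficient values
// (1.402, 0.714136, 0.344136, 1.772):
//   r = y + GAMMA_CR2R * cr
//   g = y - GAMMA_CR2G * cr - GAMMA_CB2G * cb
//   b = y + GAMMA_CB2B * cb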
//////////////////////////////////////////////////////////////////////////
void wasm_ict_backward(const float *y, const float *cb, const float *cr,
                       float *r, float *g, float *b, ui32 repeat)
{
  // Coefficient broadcasts reconstructed from the GAMMA_* constants
  // (the fragment elides these lines)
  v128_t gamma_cr2r = wasm_f32x4_splat(GAMMA_CR2R);
  v128_t gamma_cr2g = wasm_f32x4_splat(GAMMA_CR2G);
  v128_t gamma_cb2g = wasm_f32x4_splat(GAMMA_CB2G);
  v128_t gamma_cb2b = wasm_f32x4_splat(GAMMA_CB2B);
  for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
  {
    v128_t my = wasm_v128_load(y);
    v128_t mcr = wasm_v128_load(cr);
    v128_t mcb = wasm_v128_load(cb);
    v128_t mg = wasm_f32x4_sub(my, wasm_f32x4_mul(gamma_cr2g, mcr));
    wasm_v128_store(g, wasm_f32x4_sub(mg, wasm_f32x4_mul(gamma_cb2g, mcb)));
    wasm_v128_store(r, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cr2r, mcr)));
    wasm_v128_store(b, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cb2b, mcb)));

    y += 4; cb += 4; cr += 4;
    r += 4; g += 4; b += 4;
  }
}
//////////////////////////////////////////////////////////////////////////
// Summary of the functions in this translation unit:
//
// static v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half)
// static v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
// static v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
// void wasm_rev_convert(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
// void wasm_rev_convert_nlt_type3(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
// void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
// void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
// void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
// void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
// static void local_wasm_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
// void wasm_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
// void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
// static void local_wasm_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
// void wasm_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
// void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
// void wasm_rct_forward(const line_buf *r, const line_buf *g, const line_buf *b, line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
// void wasm_rct_backward(const line_buf *y, const line_buf *cb, const line_buf *cr, line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
// void wasm_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
// void wasm_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
//
// static const float colour-transform constants (defined elsewhere in the
// library): ALPHA_RF, ALPHA_GF, ALPHA_BF, BETA_CbF, BETA_CrF,
// GAMMA_CR2R, GAMMA_CR2G, GAMMA_CB2G, GAMMA_CB2B