OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_colour_wasm.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2021, Aous Naman
6// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2021, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_colour_wasm.cpp
34// Author: Aous Naman
35// Date: 9 February 2021
36//***************************************************************************/
37
38#include <climits>
39#include <cmath>
40#include <wasm_simd128.h>
41
42#include "ojph_defs.h"
43#include "ojph_mem.h"
44#include "ojph_colour.h"
45#include "ojph_colour_local.h"
46
47namespace ojph {
48 namespace local {
49
51 static inline
52 v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half)
53 { // We implement ojph_round, which is
54 // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float
55 v128_t c = wasm_f32x4_ge(a, zero); // greater or equal to zero
56 v128_t p = wasm_f32x4_add(a, half); // for positive, add half
57 v128_t n = wasm_f32x4_sub(a, half); // for negative, subtract half
58 v128_t d = wasm_v128_and(c, p); // keep positive only
59 v128_t e = wasm_v128_andnot(n, c); // keep negative only
60 v128_t v = wasm_v128_or(d, e); // combine
61 return wasm_i32x4_trunc_sat_f32x4(v);// truncate (towards 0)
62 }
63
65 void wasm_rev_convert(const line_buf *src_line,
66 const ui32 src_line_offset,
67 line_buf *dst_line,
68 const ui32 dst_line_offset,
69 si64 shift, ui32 width)
70 {
71 if (src_line->flags & line_buf::LFT_32BIT)
72 {
73 if (dst_line->flags & line_buf::LFT_32BIT)
74 {
75 const si32 *sp = src_line->i32 + src_line_offset;
76 si32 *dp = dst_line->i32 + dst_line_offset;
77 v128_t sh = wasm_i32x4_splat((si32)shift);
78 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
79 {
80 v128_t s = wasm_v128_load(sp);
81 s = wasm_i32x4_add(s, sh);
82 wasm_v128_store(dp, s);
83 }
84 }
85 else
86 {
87 const si32 *sp = src_line->i32 + src_line_offset;
88 si64 *dp = dst_line->i64 + dst_line_offset;
89 v128_t sh = wasm_i64x2_splat(shift);
90 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
91 {
92 v128_t s, t;
93 s = wasm_v128_load(sp);
94
95 t = wasm_i64x2_extend_low_i32x4(s);
96 t = wasm_i64x2_add(t, sh);
97 wasm_v128_store(dp, t);
98
99 t = wasm_i64x2_extend_high_i32x4(s);
100 t = wasm_i64x2_add(t, sh);
101 wasm_v128_store(dp + 2, t);
102 }
103 }
104 }
105 else
106 {
107 assert(src_line->flags | line_buf::LFT_64BIT);
108 assert(dst_line->flags | line_buf::LFT_32BIT);
109 const si64 *sp = src_line->i64 + src_line_offset;
110 si32 *dp = dst_line->i32 + dst_line_offset;
111 v128_t sh = wasm_i64x2_splat(shift);
112 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
113 {
114 v128_t s0, s1;
115 s0 = wasm_v128_load(sp);
116 s0 = wasm_i64x2_add(s0, sh);
117 s1 = wasm_v128_load(sp + 2);
118 s1 = wasm_i64x2_add(s1, sh);
119 s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2);
120 wasm_v128_store(dp, s0);
121 }
122 }
123 }
124
127 const ui32 src_line_offset,
128 line_buf *dst_line,
129 const ui32 dst_line_offset,
130 si64 shift, ui32 width)
131 {
132 if (src_line->flags & line_buf::LFT_32BIT)
133 {
134 if (dst_line->flags & line_buf::LFT_32BIT)
135 {
136 const si32 *sp = src_line->i32 + src_line_offset;
137 si32 *dp = dst_line->i32 + dst_line_offset;
138 v128_t sh = wasm_i32x4_splat((si32)(-shift));
139 v128_t zero = wasm_i32x4_splat(0);
140 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
141 {
142 v128_t s = wasm_v128_load(sp);
143 v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value
144 v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value
145 v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
146 s = wasm_v128_andnot(s, c); // keep only +ve or 0
147 s = wasm_v128_or(s, v_m_sh); // combine
148 wasm_v128_store(dp, s);
149 }
150 }
151 else
152 {
153 const si32 *sp = src_line->i32 + src_line_offset;
154 si64 *dp = dst_line->i64 + dst_line_offset;
155 v128_t sh = wasm_i64x2_splat(-shift);
156 v128_t zero = wasm_i32x4_splat(0);
157 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
158 {
159 v128_t s, u, c, v_m_sh;
160 s = wasm_v128_load(sp);
161
162 u = wasm_i64x2_extend_low_i32x4(s);
163 c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value
164 v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value
165 v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
166 u = wasm_v128_andnot(u, c); // keep only +ve or 0
167 u = wasm_v128_or(u, v_m_sh); // combine
168
169 wasm_v128_store(dp, u);
170
171 u = wasm_i64x2_extend_high_i32x4(s);
172 c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value
173 v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value
174 v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
175 u = wasm_v128_andnot(u, c); // keep only +ve or 0
176 u = wasm_v128_or(u, v_m_sh); // combine
177
178 wasm_v128_store(dp + 2, u);
179 }
180 }
181 }
182 else
183 {
184 assert(src_line->flags | line_buf::LFT_64BIT);
185 assert(dst_line->flags | line_buf::LFT_32BIT);
186 const si64 *sp = src_line->i64 + src_line_offset;
187 si32 *dp = dst_line->i32 + dst_line_offset;
188 v128_t sh = wasm_i64x2_splat(-shift);
189 v128_t zero = wasm_i32x4_splat(0);
190 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
191 {
192 // s for source, t for target, p for positive, n for negative,
193 // m for mask, and tm for temp
194 v128_t s, t0, t1, p, n, m, tm;
195 s = wasm_v128_load(sp);
196 m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value
197 tm = wasm_i64x2_sub(sh, s); // - shift - value
198 n = wasm_v128_and(m, tm); // -ve
199 p = wasm_v128_andnot(s, m); // +ve
200 t0 = wasm_v128_or(n, p);
201
202 s = wasm_v128_load(sp + 2);
203 m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value
204 tm = wasm_i64x2_sub(sh, s); // - shift - value
205 n = wasm_v128_and(m, tm); // -ve
206 p = wasm_v128_andnot(s, m); // +ve
207 t1 = wasm_v128_or(n, p);
208
209 t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2);
210 wasm_v128_store(dp, t0);
211 }
212 }
213 }
214
216 void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
217 ui32 width)
218 {
219 v128_t shift = wasm_f32x4_splat(0.5f);
220 v128_t m = wasm_f32x4_splat(mul);
221 for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
222 {
223 v128_t t = wasm_v128_load(sp);
224 v128_t s = wasm_f32x4_convert_i32x4(t);
225 s = wasm_f32x4_mul(s, m);
226 s = wasm_f32x4_sub(s, shift);
227 wasm_v128_store(dp, s);
228 }
229 }
230
232 void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
233 ui32 width)
234 {
235 v128_t m = wasm_f32x4_splat(mul);
236 for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
237 {
238 v128_t t = wasm_v128_load(sp);
239 v128_t s = wasm_f32x4_convert_i32x4(t);
240 s = wasm_f32x4_mul(s, m);
241 wasm_v128_store(dp, s);
242 }
243 }
244
246 void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
247 ui32 width)
248 {
249 const v128_t zero = wasm_f32x4_splat(0.0f);
250 const v128_t half = wasm_f32x4_splat(0.5f);
251 v128_t m = wasm_f32x4_splat(mul);
252 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
253 {
254 v128_t t = wasm_v128_load(sp);
255 v128_t s = wasm_f32x4_add(t, half);
256 s = wasm_f32x4_mul(s, m);
257 s = wasm_f32x4_add(s, half); // + 0.5 and followed by floor next
258 wasm_v128_store(dp, ojph_convert_float_to_i32(s, zero, half));
259 }
260 }
261
263 void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
264 ui32 width)
265 {
266 const v128_t zero = wasm_f32x4_splat(0.0f);
267 const v128_t half = wasm_f32x4_splat(0.5f);
268 v128_t m = wasm_f32x4_splat(mul);
269 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
270 {
271 v128_t t = wasm_v128_load(sp);
272 v128_t s = wasm_f32x4_mul(t, m);
273 s = wasm_f32x4_add(s, half); // + 0.5 and followed by floor next
274 wasm_v128_store(dp, ojph_convert_float_to_i32(s, zero, half));
275 }
276 }
277
279 static inline
280 v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
281 {
282 v128_t c = wasm_f32x4_ge(x, y); // 0xFFFFFFFF for x >= y
283 v128_t d = wasm_v128_and(c, a); // keep only a, where x >= y
284 v128_t e = wasm_v128_andnot(b, c); // keep only b, where x < y
285 return wasm_v128_or(d, e); // combine
286 }
287
289 static inline
290 v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
291 {
292 v128_t c = wasm_f32x4_lt(x, y); // 0xFFFFFFFF for x < y
293 v128_t d = wasm_v128_and(c, a); // keep only a, where x < y
294 v128_t e = wasm_v128_andnot(b, c); // keep only b, where x >= y
295 return wasm_v128_or(d, e); // combine
296 }
297
299 template <bool NLT_TYPE3>
300 static inline
302 line_buf *dst_line, ui32 dst_line_offset,
303 ui32 bit_depth, bool is_signed, ui32 width)
304 {
305 assert((src_line->flags & line_buf::LFT_32BIT) &&
306 (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
307 (dst_line->flags & line_buf::LFT_32BIT) &&
308 (dst_line->flags & line_buf::LFT_INTEGER));
309
310 assert(bit_depth <= 32);
311 const float* sp = src_line->f32;
312 si32* dp = dst_line->i32 + dst_line_offset;
313 // There is the possibility that converting to integer will
314 // exceed the dynamic range of 32bit integer; therefore, care must be
315 // exercised.
316 // We look if the floating point number is outside the half-closed
317 // interval [-0.5f, 0.5f). If so, we limit the resulting integer
318 // to the maximum/minimum that number supports.
319 si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
320 v128_t mul = wasm_f32x4_splat((float)(1ull << bit_depth));
321 v128_t fl_up_lim = wasm_f32x4_splat(-(float)neg_limit); // val < upper
322 v128_t fl_low_lim = wasm_f32x4_splat((float)neg_limit); // val >= lower
323 v128_t s32_up_lim = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
324 v128_t s32_low_lim = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
325
326 if (is_signed)
327 {
328 const v128_t zero = wasm_f32x4_splat(0.0f);
329 const v128_t half = wasm_f32x4_splat(0.5f);
330 v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
331 for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
332 v128_t t = wasm_v128_load(sp);
333 t = wasm_f32x4_mul(t, mul);
334 v128_t u = ojph_convert_float_to_i32(t, zero, half);
335 u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
336 u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
337 if (NLT_TYPE3)
338 {
339 v128_t c = wasm_i32x4_gt(zero, u); // 0xFFFFFFFF for -ve value
340 v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value
341 neg = wasm_v128_and(c, neg); // keep only - bias - value
342 u = wasm_v128_andnot(u, c); // keep only +ve or 0
343 u = wasm_v128_or(neg, u); // combine
344 }
345 wasm_v128_store(dp, u);
346 }
347 }
348 else
349 {
350 const v128_t zero = wasm_f32x4_splat(0.0f);
351 const v128_t half = wasm_f32x4_splat(0.5f);
352 v128_t ihalf = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
353 for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
354 v128_t t = wasm_v128_load(sp);
355 t = wasm_f32x4_mul(t, mul);
356 v128_t u = ojph_convert_float_to_i32(t, zero, half);
357 u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
358 u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
359 u = wasm_i32x4_add(u, ihalf);
360 wasm_v128_store(dp, u);
361 }
362 }
363 }
364
367 line_buf *dst_line, ui32 dst_line_offset,
368 ui32 bit_depth, bool is_signed, ui32 width)
369 {
371 dst_line_offset, bit_depth, is_signed, width);
372 }
373
376 line_buf *dst_line, ui32 dst_line_offset,
377 ui32 bit_depth, bool is_signed, ui32 width)
378 {
379 local_wasm_irv_convert_to_integer<true>(src_line, dst_line,
380 dst_line_offset, bit_depth, is_signed, width);
381 }
382
384 template <bool NLT_TYPE3>
385 static inline
387 ui32 src_line_offset, line_buf *dst_line,
388 ui32 bit_depth, bool is_signed, ui32 width)
389 {
390 assert((src_line->flags & line_buf::LFT_32BIT) &&
391 (src_line->flags & line_buf::LFT_INTEGER) &&
392 (dst_line->flags & line_buf::LFT_32BIT) &&
393 (dst_line->flags & line_buf::LFT_INTEGER) == 0);
394
395 assert(bit_depth <= 32);
396 v128_t mul = wasm_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth)));
397
398 const si32* sp = src_line->i32 + src_line_offset;
399 float* dp = dst_line->f32;
400 if (is_signed)
401 {
402 v128_t zero = wasm_i32x4_splat(0);
403 v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
404 for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
405 v128_t t = wasm_v128_load(sp);
406 if (NLT_TYPE3)
407 {
408 v128_t c = wasm_i32x4_lt(t, zero); // 0xFFFFFFFF for -ve value
409 v128_t neg = wasm_i32x4_sub(bias, t); // - bias - value
410 neg = wasm_v128_and(c, neg); // keep only - bias - value
411 c = wasm_v128_andnot(t, c); // keep only +ve or 0
412 t = wasm_v128_or(neg, c); // combine
413 }
414 v128_t v = wasm_f32x4_convert_i32x4(t);
415 v = wasm_f32x4_mul(v, mul);
416 wasm_v128_store(dp, v);
417 }
418 }
419 else
420 {
421 v128_t half = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
422 for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
423 v128_t t = wasm_v128_load(sp);
424 t = wasm_i32x4_sub(t, half);
425 v128_t v = wasm_f32x4_convert_i32x4(t);
426 v = wasm_f32x4_mul(v, mul);
427 wasm_v128_store(dp, v);
428 }
429 }
430 }
431
434 ui32 src_line_offset, line_buf *dst_line,
435 ui32 bit_depth, bool is_signed, ui32 width)
436 {
437 local_wasm_irv_convert_to_float<false>(src_line, src_line_offset,
438 dst_line, bit_depth, is_signed, width);
439 }
440
443 ui32 src_line_offset, line_buf *dst_line,
444 ui32 bit_depth, bool is_signed, ui32 width)
445 {
446 local_wasm_irv_convert_to_float<true>(src_line, src_line_offset,
447 dst_line, bit_depth, is_signed, width);
448 }
449
452 const line_buf *g,
453 const line_buf *b,
454 line_buf *y, line_buf *cb, line_buf *cr,
455 ui32 repeat)
456 {
457 assert((y->flags & line_buf::LFT_INTEGER) &&
463
464 if (y->flags & line_buf::LFT_32BIT)
465 {
466 assert((y->flags & line_buf::LFT_32BIT) &&
467 (cb->flags & line_buf::LFT_32BIT) &&
468 (cr->flags & line_buf::LFT_32BIT) &&
469 (r->flags & line_buf::LFT_32BIT) &&
470 (g->flags & line_buf::LFT_32BIT) &&
472 const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
473 si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
474
475 for (int i = (repeat + 3) >> 2; i > 0; --i)
476 {
477 v128_t mr = wasm_v128_load(rp);
478 v128_t mg = wasm_v128_load(gp);
479 v128_t mb = wasm_v128_load(bp);
480 v128_t t = wasm_i32x4_add(mr, mb);
481 t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1));
482 wasm_v128_store(yp, wasm_i32x4_shr(t, 2));
483 t = wasm_i32x4_sub(mb, mg);
484 wasm_v128_store(cbp, t);
485 t = wasm_i32x4_sub(mr, mg);
486 wasm_v128_store(crp, t);
487
488 rp += 4; gp += 4; bp += 4;
489 yp += 4; cbp += 4; crp += 4;
490 }
491 }
492 else
493 {
494 assert((y->flags & line_buf::LFT_64BIT) &&
495 (cb->flags & line_buf::LFT_64BIT) &&
496 (cr->flags & line_buf::LFT_64BIT) &&
497 (r->flags & line_buf::LFT_32BIT) &&
498 (g->flags & line_buf::LFT_32BIT) &&
500 const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
501 si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
502 for (int i = (repeat + 3) >> 2; i > 0; --i)
503 {
504 v128_t mr32 = wasm_v128_load(rp);
505 v128_t mg32 = wasm_v128_load(gp);
506 v128_t mb32 = wasm_v128_load(bp);
507 v128_t mr, mg, mb, t;
508 mr = wasm_i64x2_extend_low_i32x4(mr32);
509 mg = wasm_i64x2_extend_low_i32x4(mg32);
510 mb = wasm_i64x2_extend_low_i32x4(mb32);
511
512 t = wasm_i64x2_add(mr, mb);
513 t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
514 wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
515 t = wasm_i64x2_sub(mb, mg);
516 wasm_v128_store(cbp, t);
517 t = wasm_i64x2_sub(mr, mg);
518 wasm_v128_store(crp, t);
519
520 yp += 2; cbp += 2; crp += 2;
521
522 mr = wasm_i64x2_extend_high_i32x4(mr32);
523 mg = wasm_i64x2_extend_high_i32x4(mg32);
524 mb = wasm_i64x2_extend_high_i32x4(mb32);
525
526 t = wasm_i64x2_add(mr, mb);
527 t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
528 wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
529 t = wasm_i64x2_sub(mb, mg);
530 wasm_v128_store(cbp, t);
531 t = wasm_i64x2_sub(mr, mg);
532 wasm_v128_store(crp, t);
533
534 rp += 4; gp += 4; bp += 4;
535 yp += 2; cbp += 2; crp += 2;
536 }
537 }
538 }
539
542 const line_buf *cb,
543 const line_buf *cr,
544 line_buf *r, line_buf *g, line_buf *b,
545 ui32 repeat)
546 {
547 assert((y->flags & line_buf::LFT_INTEGER) &&
553
554 if (y->flags & line_buf::LFT_32BIT)
555 {
556 assert((y->flags & line_buf::LFT_32BIT) &&
557 (cb->flags & line_buf::LFT_32BIT) &&
558 (cr->flags & line_buf::LFT_32BIT) &&
559 (r->flags & line_buf::LFT_32BIT) &&
560 (g->flags & line_buf::LFT_32BIT) &&
562 const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
563 si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
564 for (int i = (repeat + 3) >> 2; i > 0; --i)
565 {
566 v128_t my = wasm_v128_load(yp);
567 v128_t mcb = wasm_v128_load(cbp);
568 v128_t mcr = wasm_v128_load(crp);
569
570 v128_t t = wasm_i32x4_add(mcb, mcr);
571 t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2));
572 wasm_v128_store(gp, t);
573 v128_t u = wasm_i32x4_add(mcb, t);
574 wasm_v128_store(bp, u);
575 u = wasm_i32x4_add(mcr, t);
576 wasm_v128_store(rp, u);
577
578 yp += 4; cbp += 4; crp += 4;
579 rp += 4; gp += 4; bp += 4;
580 }
581 }
582 else
583 {
584 assert((y->flags & line_buf::LFT_64BIT) &&
585 (cb->flags & line_buf::LFT_64BIT) &&
586 (cr->flags & line_buf::LFT_64BIT) &&
587 (r->flags & line_buf::LFT_32BIT) &&
588 (g->flags & line_buf::LFT_32BIT) &&
590 const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
591 si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
592 for (int i = (repeat + 3) >> 2; i > 0; --i)
593 {
594 v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1;
595 my = wasm_v128_load(yp);
596 mcb = wasm_v128_load(cbp);
597 mcr = wasm_v128_load(crp);
598
599 tg0 = wasm_i64x2_add(mcb, mcr);
600 tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2));
601 tb0 = wasm_i64x2_add(mcb, tg0);
602 tr0 = wasm_i64x2_add(mcr, tg0);
603
604 yp += 2; cbp += 2; crp += 2;
605
606 my = wasm_v128_load(yp);
607 mcb = wasm_v128_load(cbp);
608 mcr = wasm_v128_load(crp);
609
610 tg1 = wasm_i64x2_add(mcb, mcr);
611 tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2));
612 tb1 = wasm_i64x2_add(mcb, tg1);
613 tr1 = wasm_i64x2_add(mcr, tg1);
614
615 tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2);
616 tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2);
617 tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2);
618
619 wasm_v128_store(rp, tr0);
620 wasm_v128_store(gp, tg0);
621 wasm_v128_store(bp, tb0);
622
623 yp += 2; cbp += 2; crp += 2;
624 rp += 4; gp += 4; bp += 4;
625 }
626 }
627 }
628
630 void wasm_ict_forward(const float *r, const float *g, const float *b,
631 float *y, float *cb, float *cr, ui32 repeat)
632 {
633 v128_t alpha_rf = wasm_f32x4_splat(CT_CNST::ALPHA_RF);
634 v128_t alpha_gf = wasm_f32x4_splat(CT_CNST::ALPHA_GF);
635 v128_t alpha_bf = wasm_f32x4_splat(CT_CNST::ALPHA_BF);
636 v128_t beta_cbf = wasm_f32x4_splat(CT_CNST::BETA_CbF);
637 v128_t beta_crf = wasm_f32x4_splat(CT_CNST::BETA_CrF);
638 for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
639 {
640 v128_t mr = wasm_v128_load(r);
641 v128_t mb = wasm_v128_load(b);
642 v128_t my = wasm_f32x4_mul(alpha_rf, mr);
643 my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_gf, wasm_v128_load(g)));
644 my = wasm_f32x4_add(my, wasm_f32x4_mul(alpha_bf, mb));
645 wasm_v128_store(y, my);
646 wasm_v128_store(cb, wasm_f32x4_mul(beta_cbf, wasm_f32x4_sub(mb, my)));
647 wasm_v128_store(cr, wasm_f32x4_mul(beta_crf, wasm_f32x4_sub(mr, my)));
648
649 r += 4; g += 4; b += 4;
650 y += 4; cb += 4; cr += 4;
651 }
652 }
653
655 void wasm_ict_backward(const float *y, const float *cb, const float *cr,
656 float *r, float *g, float *b, ui32 repeat)
657 {
658 v128_t gamma_cr2g = wasm_f32x4_splat(CT_CNST::GAMMA_CR2G);
659 v128_t gamma_cb2g = wasm_f32x4_splat(CT_CNST::GAMMA_CB2G);
660 v128_t gamma_cr2r = wasm_f32x4_splat(CT_CNST::GAMMA_CR2R);
661 v128_t gamma_cb2b = wasm_f32x4_splat(CT_CNST::GAMMA_CB2B);
662 for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
663 {
664 v128_t my = wasm_v128_load(y);
665 v128_t mcr = wasm_v128_load(cr);
666 v128_t mcb = wasm_v128_load(cb);
667 v128_t mg = wasm_f32x4_sub(my, wasm_f32x4_mul(gamma_cr2g, mcr));
668 wasm_v128_store(g, wasm_f32x4_sub(mg, wasm_f32x4_mul(gamma_cb2g, mcb)));
669 wasm_v128_store(r, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cr2r, mcr)));
670 wasm_v128_store(b, wasm_f32x4_add(my, wasm_f32x4_mul(gamma_cb2b, mcb)));
671
672 y += 4; cb += 4; cr += 4;
673 r += 4; g += 4; b += 4;
674 }
675 }
676
677 }
678}
float * f32
Definition ojph_mem.h:162
void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
void wasm_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
void wasm_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
void wasm_rev_convert_nlt_type3(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
static void local_wasm_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
void wasm_rev_convert(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
static v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
void wasm_rct_backward(const line_buf *y, const line_buf *cb, const line_buf *cr, line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
void wasm_rct_forward(const line_buf *r, const line_buf *g, const line_buf *b, line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
static v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
void wasm_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
static void local_wasm_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
static v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half)
void wasm_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
int64_t si64
Definition ojph_defs.h:57
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF