OpenJPH
Open-source implementation of JPEG2000 Part-15

ojph_transform_wasm.cpp
//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2021, Aous Naman
// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2021, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
// File: ojph_transform_wasm.cpp
// Author: Aous Naman
// Date: 09 February 2021
//***************************************************************************/

#include <cstdio>
#include <wasm_simd128.h>

#include "ojph_defs.h"
#include "ojph_arch.h"
#include "ojph_mem.h"
#include "ojph_params.h"
#include "../codestream/ojph_params_local.h"

#include "ojph_transform.h"
#include "ojph_transform_local.h"

namespace ojph {
  namespace local {

    static inline
    void wasm_deinterleave32(float* dpl, float* dph, float* sp, int width)
    {
      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
      {
        v128_t a = wasm_v128_load(sp);
        v128_t b = wasm_v128_load(sp + 4);
        v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2);
        v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3);
        // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
        // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
        wasm_v128_store(dpl, c);
        wasm_v128_store(dph, d);
      }
    }

    static inline
    void wasm_interleave32(float* dp, float* spl, float* sph, int width)
    {
      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
      {
        v128_t a = wasm_v128_load(spl);
        v128_t b = wasm_v128_load(sph);
        v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1);
        v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3);
        // v128_t c = _mm_unpacklo_ps(a, b);
        // v128_t d = _mm_unpackhi_ps(a, b);
        wasm_v128_store(dp, c);
        wasm_v128_store(dp + 4, d);
      }
    }
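
    // A scalar sketch (illustrative, not part of the original file) of what
    // the two shuffle-based helpers above compute; the 64-bit variants below
    // do the same with two lanes per vector:
    //
    //   for (int i = 0; 2 * i < width; ++i) {
    //     dpl[i] = sp[2 * i];      // even samples -> low-pass buffer
    //     dph[i] = sp[2 * i + 1];  // odd samples  -> high-pass buffer
    //   }
    //
    // wasm_interleave32 is the exact inverse: dp[2 * i] = spl[i] and
    // dp[2 * i + 1] = sph[i].  The shuffles move raw 32-bit lanes, so the
    // same helpers also serve the reversible (integer) paths further down.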

    static inline
    void wasm_deinterleave64(double* dpl, double* dph, double* sp, int width)
    {
      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
      {
        v128_t a = wasm_v128_load(sp);
        v128_t b = wasm_v128_load(sp + 2);
        v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0);
        v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1);
        wasm_v128_store(dpl, c);
        wasm_v128_store(dph, d);
      }
    }

    static inline
    void wasm_interleave64(double* dp, double* spl, double* sph, int width)
    {
      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
      {
        v128_t a = wasm_v128_load(spl);
        v128_t b = wasm_v128_load(sph);
        v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0);
        v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1);
        wasm_v128_store(dp, c);
        wasm_v128_store(dp + 2, d);
      }
    }

    static inline void wasm_multiply_const(float* p, float f, int width)
    {
      v128_t factor = wasm_f32x4_splat(f);
      for (; width > 0; width -= 4, p += 4)
      {
        v128_t s = wasm_v128_load(p);
        wasm_v128_store(p, wasm_f32x4_mul(factor, s));
      }
    }
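
    // Note: every loop in this file advances in whole SIMD strides (8 or 4
    // floats, 4 or 2 doubles) while the remaining count is positive, so the
    // final iteration may touch a few samples beyond "width".  This appears
    // to rely on line_buf allocations being padded to a multiple of the
    // vector width; that padding is an assumption of this note, to be
    // verified against the allocator, not something stated in this file.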

    void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis)
    {
      float a = s->irv.Aatk;
      if (synthesis)
        a = -a;

      v128_t factor = wasm_f32x4_splat(a);

      float* dst = aug->f32;
      const float* src1 = sig->f32, * src2 = other->f32;
      int i = (int)repeat;
      for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load(src1);
        v128_t s2 = wasm_v128_load(src2);
        v128_t d = wasm_v128_load(dst);
        d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)));
        wasm_v128_store(dst, d);
      }
    }
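
    // Scalar equivalent of the vertical lifting step above (illustrative):
    //
    //   float a = synthesis ? -s->irv.Aatk : s->irv.Aatk;
    //   for (ui32 n = 0; n < repeat; ++n)
    //     aug->f32[n] += a * (sig->f32[n] + other->f32[n]);
    //
    // Negating "a" is what turns the analysis update into its synthesis
    // inverse.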

    void wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
    {
      wasm_multiply_const(aug->f32, K, (int)repeat);
    }

    void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
                           const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even)
    {
      if (width > 1)
      {
        // split src into ldst and hdst
        {
          float* dpl = even ? ldst->f32 : hdst->f32;
          float* dph = even ? hdst->f32 : ldst->f32;
          float* sp = src->f32;
          int w = (int)width;
          wasm_deinterleave32(dpl, dph, sp, w);
        }

        // the actual horizontal transform
        float* hp = hdst->f32, * lp = ldst->f32;
        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = num_steps; j > 0; --j)
        {
          const lifting_step* s = atk->get_step(j - 1);
          const float a = s->irv.Aatk;

          // extension
          lp[-1] = lp[0];
          lp[l_width] = lp[l_width - 1];
          // lifting step
          const float* sp = lp;
          float* dp = hp;
          int i = (int)h_width;
          v128_t f = wasm_f32x4_splat(a);
          if (even)
          {
            for (; i > 0; i -= 4, sp += 4, dp += 4)
            {
              v128_t m = wasm_v128_load(sp);
              v128_t n = wasm_v128_load(sp + 1);
              v128_t p = wasm_v128_load(dp);
              p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
              wasm_v128_store(dp, p);
            }
          }
          else
          {
            for (; i > 0; i -= 4, sp += 4, dp += 4)
            {
              v128_t m = wasm_v128_load(sp);
              v128_t n = wasm_v128_load(sp - 1);
              v128_t p = wasm_v128_load(dp);
              p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
              wasm_v128_store(dp, p);
            }
          }

          // swap buffers
          float* t = lp; lp = hp; hp = t;
          even = !even;
          ui32 w = l_width; l_width = h_width; h_width = w;
        }

        { // multiply by K or 1/K
          float K = atk->get_K();
          float K_inv = 1.0f / K;
          wasm_multiply_const(lp, K_inv, (int)l_width);
          wasm_multiply_const(hp, K, (int)h_width);
        }
      }
      else {
        if (even)
          ldst->f32[0] = src->f32[0];
        else
          hdst->f32[0] = src->f32[0] * 2.0f;
      }
    }
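
    // In the analysis routine above, lifting steps run from last to first
    // (j = num_steps .. 1), and after each step the low/high roles and the
    // even/odd phase are swapped.  One step, in scalar form (illustrative):
    //
    //   // lp[-1] and lp[l_width] already filled by symmetric extension
    //   for (int n = 0; n < (int)h_width; ++n)
    //     hp[n] += a * (lp[n] + lp[even ? n + 1 : n - 1]);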

    void wasm_irv_horz_syn(const param_atk* atk, const line_buf* dst,
                           const line_buf* lsrc, const line_buf* hsrc,
                           ui32 width, bool even)
    {
      if (width > 1)
      {
        bool ev = even;
        float* oth = hsrc->f32, * aug = lsrc->f32;
        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass

        { // multiply by K or 1/K
          float K = atk->get_K();
          float K_inv = 1.0f / K;
          wasm_multiply_const(aug, K, (int)aug_width);
          wasm_multiply_const(oth, K_inv, (int)oth_width);
        }

        // the actual horizontal transform
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = 0; j < num_steps; ++j)
        {
          const lifting_step* s = atk->get_step(j);
          const float a = s->irv.Aatk;

          // extension
          oth[-1] = oth[0];
          oth[oth_width] = oth[oth_width - 1];
          // lifting step
          const float* sp = oth;
          float* dp = aug;
          int i = (int)aug_width;
          v128_t f = wasm_f32x4_splat(a);
          if (ev)
          {
            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
            {
              v128_t m = wasm_v128_load(sp);
              v128_t n = wasm_v128_load(sp - 1);
              v128_t p = wasm_v128_load(dp);
              p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
              wasm_v128_store(dp, p);
            }
          }
          else
          {
            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
            {
              v128_t m = wasm_v128_load(sp);
              v128_t n = wasm_v128_load(sp + 1);
              v128_t p = wasm_v128_load(dp);
              p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
              wasm_v128_store(dp, p);
            }
          }

          // swap buffers
          float* t = aug; aug = oth; oth = t;
          ev = !ev;
          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
        }

        // combine both lsrc and hsrc into dst
        {
          float* dp = dst->f32;
          float* spl = even ? lsrc->f32 : hsrc->f32;
          float* sph = even ? hsrc->f32 : lsrc->f32;
          int w = (int)width;
          wasm_interleave32(dp, spl, sph, w);
        }
      }
      else {
        if (even)
          dst->f32[0] = lsrc->f32[0];
        else
          dst->f32[0] = hsrc->f32[0] * 0.5f;
      }
    }
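
    // Synthesis mirrors the analysis above exactly: the K / 1/K scaling
    // comes first (with the factors swapped), the steps run in forward
    // order (j = 0 .. num_steps - 1), and each one subtracts what analysis
    // added, i.e. in scalar form (illustrative):
    //
    //   for (int n = 0; n < (int)aug_width; ++n)
    //     aug[n] -= a * (oth[n] + oth[ev ? n - 1 : n + 1]);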

    void wasm_rev_vert_step32(const lifting_step* s, const line_buf* sig,
                              const line_buf* other, const line_buf* aug,
                              ui32 repeat, bool synthesis)
    {
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      v128_t va = wasm_i32x4_splat(a);
      v128_t vb = wasm_i32x4_splat(b);

      si32* dst = aug->i32;
      const si32* src1 = sig->i32, * src2 = other->i32;
      // The general definition of the wavelet in Part 2 is slightly
      // different from that in Part 1, although they are mathematically
      // equivalent; here, we identify the simpler forms from Part 1 and
      // employ them.
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_add(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_add(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t w = wasm_i32x4_shr(t, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t w = wasm_i32x4_shr(t, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else if (a == -1)
      { // any case with a == -1, which is not 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_sub(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_sub(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else
      { // general case
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t u = wasm_i32x4_mul(va, t);
            v128_t v = wasm_i32x4_add(vb, u);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t u = wasm_i32x4_mul(va, t);
            v128_t v = wasm_i32x4_add(vb, u);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
    }
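
    // All four branches above specialize one scalar recipe (illustrative):
    //
    //   si32 t = (b + a * (src1[n] + src2[n])) >> e;
    //   dst[n] += synthesis ? -t : t;
    //
    // The a == 1 and a == -1 branches merely avoid the multiplication, and
    // the 5/3-predict branch additionally folds the rounding constant into
    // the sign handling; all are arithmetically equivalent to the general
    // case.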

    void wasm_rev_vert_step64(const lifting_step* s, const line_buf* sig,
                              const line_buf* other, const line_buf* aug,
                              ui32 repeat, bool synthesis)
    {
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui8 e = s->rev.Eatk;
      v128_t va = wasm_i64x2_splat(a);
      v128_t vb = wasm_i64x2_splat(b);

      si64* dst = aug->i64;
      const si64* src1 = sig->i64, * src2 = other->i64;
      // The general definition of the wavelet in Part 2 is slightly
      // different from that in Part 1, although they are mathematically
      // equivalent; here, we identify the simpler forms from Part 1 and
      // employ them.
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_add(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_add(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t w = wasm_i64x2_shr(t, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t w = wasm_i64x2_shr(t, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else if (a == -1)
      { // any case with a == -1, which is not 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_sub(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_sub(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else
      { // general case
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t u = wasm_i64x2_mul(va, t);
            v128_t v = wasm_i64x2_add(vb, u);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t u = wasm_i64x2_mul(va, t);
            v128_t v = wasm_i64x2_add(vb, u);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
    }
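
    // The 64-bit variant above is lane-for-lane the same as the 32-bit one,
    // handling two samples per vector instead of four; wasm_i64x2_shr is an
    // arithmetic (sign-preserving) shift, matching the scalar recipe.  The
    // wider path serves data whose intermediates could overflow 32 bits at
    // high bit depths.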

    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis)
    {
      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
          ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
      {
        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
               (other == NULL || other->flags & line_buf::LFT_32BIT) &&
               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
        wasm_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
      }
      else
      {
        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
               (other == NULL || other->flags & line_buf::LFT_64BIT) &&
               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
        wasm_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
      }
    }
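
    // The dispatcher above chooses the sample width from the line_buf
    // flags; the asserts document the expectation that every buffer passed
    // to one call shares that width.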

    static
    void wasm_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
                             const line_buf* hdst, const line_buf* src,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        // split src into ldst and hdst; the float deinterleaver moves raw
        // 32-bit lanes, so it is safe for this si32 data
        {
          float* dpl = even ? ldst->f32 : hdst->f32;
          float* dph = even ? hdst->f32 : ldst->f32;
          float* sp = src->f32;
          int w = (int)width;
          wasm_deinterleave32(dpl, dph, sp, w);
        }

        si32* hp = hdst->i32, * lp = ldst->i32;
        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = num_steps; j > 0; --j)
        {
          // lifting step coefficients
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          v128_t va = wasm_i32x4_splat(a);
          v128_t vb = wasm_i32x4_splat(b);

          // extension
          lp[-1] = lp[0];
          lp[l_width] = lp[l_width - 1];
          // lifting step
          const si32* sp = lp;
          si32* dp = hp;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)h_width;
            if (even)
            {
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t v = wasm_i32x4_add(vb, t);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            }
            else
            {
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t v = wasm_i32x4_add(vb, t);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t w = wasm_i32x4_shr(t, e);
                d = wasm_i32x4_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t w = wasm_i32x4_shr(t, e);
                d = wasm_i32x4_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1, which is not 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t v = wasm_i32x4_sub(vb, t);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t v = wasm_i32x4_sub(vb, t);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }
          else
          { // general case
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t u = wasm_i32x4_mul(va, t);
                v128_t v = wasm_i32x4_add(vb, u);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t u = wasm_i32x4_mul(va, t);
                v128_t v = wasm_i32x4_add(vb, u);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }

          // swap buffers
          si32* t = lp; lp = hp; hp = t;
          even = !even;
          ui32 w = l_width; l_width = h_width; h_width = w;
        }
      }
      else {
        if (even)
          ldst->i32[0] = src->i32[0];
        else
          hdst->i32[0] = src->i32[0] << 1;
      }
    }
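
    // For the default 5/3 kernel, the branches above reduce to the familiar
    // integer analysis pair (illustrative, x being the interleaved input):
    //
    //   d[n] = x[2n + 1] - ((x[2n] + x[2n + 2]) >> 1);   // predict
    //   s[n] = x[2n] + ((d[n - 1] + d[n] + 2) >> 2);     // update
    //
    // i.e. the (a, b, e) = (-1, 1, 1) and (1, 2, 2) cases applied after
    // deinterleaving.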

    static
    void wasm_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
                             const line_buf* hdst, const line_buf* src,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        // split src into ldst and hdst; the double deinterleaver moves raw
        // 64-bit lanes, so it is safe for this si64 data
        {
          double* dpl = (double*)(even ? ldst->p : hdst->p);
          double* dph = (double*)(even ? hdst->p : ldst->p);
          double* sp = (double*)src->p;
          int w = (int)width;
          wasm_deinterleave64(dpl, dph, sp, w);
        }

        si64* hp = hdst->i64, * lp = ldst->i64;
        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = num_steps; j > 0; --j)
        {
          // lifting step coefficients
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          v128_t va = wasm_i64x2_splat(a);
          v128_t vb = wasm_i64x2_splat(b);

          // extension
          lp[-1] = lp[0];
          lp[l_width] = lp[l_width - 1];
          // lifting step
          const si64* sp = lp;
          si64* dp = hp;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)h_width;
            if (even)
            {
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t v = wasm_i64x2_add(vb, t);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            }
            else
            {
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t v = wasm_i64x2_add(vb, t);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t w = wasm_i64x2_shr(t, e);
                d = wasm_i64x2_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t w = wasm_i64x2_shr(t, e);
                d = wasm_i64x2_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1, which is not 5/3 predict
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t v = wasm_i64x2_sub(vb, t);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t v = wasm_i64x2_sub(vb, t);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }
          else
          { // general case
            int i = (int)h_width;
            if (even)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t u = wasm_i64x2_mul(va, t);
                v128_t v = wasm_i64x2_add(vb, u);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t u = wasm_i64x2_mul(va, t);
                v128_t v = wasm_i64x2_add(vb, u);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }

          // swap buffers
          si64* t = lp; lp = hp; hp = t;
          even = !even;
          ui32 w = l_width; l_width = h_width; h_width = w;
        }
      }
      else {
        if (even)
          ldst->i64[0] = src->i64[0];
        else
          hdst->i64[0] = src->i64[0] << 1;
      }
    }

    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                           const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even)
    {
      if (src->flags & line_buf::LFT_32BIT)
      {
        assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
               (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
        wasm_rev_horz_ana32(atk, ldst, hdst, src, width, even);
      }
      else
      {
        assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
               (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) &&
               (src == NULL || src->flags & line_buf::LFT_64BIT));
        wasm_rev_horz_ana64(atk, ldst, hdst, src, width, even);
      }
    }

    void wasm_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
                             const line_buf* lsrc, const line_buf* hsrc,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        bool ev = even;
        si32* oth = hsrc->i32, * aug = lsrc->i32;
        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = 0; j < num_steps; ++j)
        {
          const lifting_step* s = atk->get_step(j);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          v128_t va = wasm_i32x4_splat(a);
          v128_t vb = wasm_i32x4_splat(b);

          // extension
          oth[-1] = oth[0];
          oth[oth_width] = oth[oth_width - 1];
          // lifting step
          const si32* sp = oth;
          si32* dp = aug;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)aug_width;
            if (ev)
            {
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t v = wasm_i32x4_add(vb, t);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            }
            else
            {
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t v = wasm_i32x4_add(vb, t);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t w = wasm_i32x4_shr(t, e);
                d = wasm_i32x4_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t w = wasm_i32x4_shr(t, e);
                d = wasm_i32x4_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1, which is not 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t v = wasm_i32x4_sub(vb, t);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t v = wasm_i32x4_sub(vb, t);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }
          else
          { // general case
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t u = wasm_i32x4_mul(va, t);
                v128_t v = wasm_i32x4_add(vb, u);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 4, sp += 4, dp += 4)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i32x4_add(s1, s2);
                v128_t u = wasm_i32x4_mul(va, t);
                v128_t v = wasm_i32x4_add(vb, u);
                v128_t w = wasm_i32x4_shr(v, e);
                d = wasm_i32x4_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }

          // swap buffers
          si32* t = aug; aug = oth; oth = t;
          ev = !ev;
          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
        }

        // combine both lsrc and hsrc into dst; the float interleaver moves
        // raw 32-bit lanes, so it is safe for this si32 data
        {
          float* dp = dst->f32;
          float* spl = even ? lsrc->f32 : hsrc->f32;
          float* sph = even ? hsrc->f32 : lsrc->f32;
          int w = (int)width;
          wasm_interleave32(dp, spl, sph, w);
        }
      }
      else {
        if (even)
          dst->i32[0] = lsrc->i32[0];
        else
          dst->i32[0] = hsrc->i32[0] >> 1;
      }
    }
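
    // Because analysis and synthesis use the same truncating shift, the
    // subtractions here undo the analysis additions exactly in integer
    // arithmetic, which is what makes this path lossless.  The
    // single-sample case mirrors that: << 1 during analysis, >> 1 here.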

    void wasm_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
                             const line_buf* lsrc, const line_buf* hsrc,
                             ui32 width, bool even)
    {
      if (width > 1)
      {
        bool ev = even;
        si64* oth = hsrc->i64, * aug = lsrc->i64;
        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
        ui32 num_steps = atk->get_num_steps();
        for (ui32 j = 0; j < num_steps; ++j)
        {
          const lifting_step* s = atk->get_step(j);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
          const ui8 e = s->rev.Eatk;
          v128_t va = wasm_i64x2_splat(a);
          v128_t vb = wasm_i64x2_splat(b);

          // extension
          oth[-1] = oth[0];
          oth[oth_width] = oth[oth_width - 1];
          // lifting step
          const si64* sp = oth;
          si64* dp = aug;
          if (a == 1)
          { // 5/3 update and any case with a == 1
            int i = (int)aug_width;
            if (ev)
            {
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t v = wasm_i64x2_add(vb, t);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            }
            else
            {
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t v = wasm_i64x2_add(vb, t);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            }
          }
          else if (a == -1 && b == 1 && e == 1)
          { // 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t w = wasm_i64x2_shr(t, e);
                d = wasm_i64x2_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t w = wasm_i64x2_shr(t, e);
                d = wasm_i64x2_add(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }
          else if (a == -1)
          { // any case with a == -1, which is not 5/3 predict
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t v = wasm_i64x2_sub(vb, t);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t v = wasm_i64x2_sub(vb, t);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }
          else
          { // general case
            int i = (int)aug_width;
            if (ev)
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t u = wasm_i64x2_mul(va, t);
                v128_t v = wasm_i64x2_add(vb, u);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
            else
              for (; i > 0; i -= 2, sp += 2, dp += 2)
              {
                v128_t s1 = wasm_v128_load((v128_t*)sp);
                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                v128_t d = wasm_v128_load((v128_t*)dp);
                v128_t t = wasm_i64x2_add(s1, s2);
                v128_t u = wasm_i64x2_mul(va, t);
                v128_t v = wasm_i64x2_add(vb, u);
                v128_t w = wasm_i64x2_shr(v, e);
                d = wasm_i64x2_sub(d, w);
                wasm_v128_store((v128_t*)dp, d);
              }
          }

          // swap buffers
          si64* t = aug; aug = oth; oth = t;
          ev = !ev;
          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
        }

        // combine both lsrc and hsrc into dst
        {
          double* dp = (double*)dst->p;
          double* spl = (double*)(even ? lsrc->p : hsrc->p);
          double* sph = (double*)(even ? hsrc->p : lsrc->p);
          int w = (int)width;
          wasm_interleave64(dp, spl, sph, w);
        }
      }
      else {
        if (even)
          dst->i64[0] = lsrc->i64[0];
        else
          dst->i64[0] = hsrc->i64[0] >> 1;
      }
    }

    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
                           const line_buf* lsrc, const line_buf* hsrc,
                           ui32 width, bool even)
    {
      if (dst->flags & line_buf::LFT_32BIT)
      {
        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
        wasm_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
      }
      else
      {
        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
        wasm_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
      }
    }

  } // !local
} // !ojph