OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_avx2.cpp
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_avx2.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include <climits>
39#include <cstdio>
40
41#include "ojph_defs.h"
42#include "ojph_arch.h"
43#include "ojph_mem.h"
44#include "ojph_params.h"
46
47#include "ojph_transform.h"
49
50#include <immintrin.h>
51
52namespace ojph {
53 namespace local {
54
56 // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
57 static inline
58 __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
59 {
60 // note that m must be obtained using
61 // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
62 __m256i x = _mm256_srli_epi64(a, amt);
63 x = _mm256_xor_si256(x, m);
64 __m256i result = _mm256_sub_epi64(x, m);
65 return result;
66 }
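    // Illustrative scalar sketch (hypothetical helper, for exposition only): the
    // same sign-extension trick in plain C++. A logical right shift clears the
    // upper bits; XOR-ing with m = 1 << (63 - amt) and then subtracting m
    // re-extends the sign bit, yielding the 64-bit arithmetic shift AVX2 lacks.
    static inline si64 illustrative_srai64(si64 a, int amt)
    {
      unsigned long long m = 1ULL << (63 - amt);           // shifted sign-bit position
      unsigned long long x = (unsigned long long)a >> amt; // logical shift
      return (si64)((x ^ m) - m);                          // sign-extend from bit 63 - amt
    }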
67
69 static inline
70 void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width)
71 {
72 for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
73 {
74 __m256 a = _mm256_load_ps(sp);
75 __m256 b = _mm256_load_ps(sp + 8);
76 __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
77 __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
78 __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
79 __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
80 _mm256_store_ps(dpl, e);
81 _mm256_store_ps(dph, f);
82 }
83 }
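    // Illustrative scalar sketch (hypothetical helper, for exposition only): what
    // the permutes and shuffles above compute. Even-indexed samples go to the
    // low-pass buffer and odd-indexed samples to the high-pass buffer; the SIMD
    // version relies on aligned, padded line buffers instead of a scalar tail.
    static inline
    void illustrative_deinterleave32(float* dpl, float* dph, const float* sp,
                                     int width)
    {
      for (int i = 0; 2 * i < width; ++i)
      {
        dpl[i] = sp[2 * i];           // even samples -> low-pass half
        if (2 * i + 1 < width)
          dph[i] = sp[2 * i + 1];     // odd samples  -> high-pass half
      }
    }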
84
86 static inline
87 void avx2_interleave32(float* dp, float* spl, float* sph, int width)
88 {
89 for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
90 {
91 __m256 a = _mm256_load_ps(spl);
92 __m256 b = _mm256_load_ps(sph);
93 __m256 c = _mm256_unpacklo_ps(a, b);
94 __m256 d = _mm256_unpackhi_ps(a, b);
95 __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
96 __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
97 _mm256_store_ps(dp, e);
98 _mm256_store_ps(dp + 8, f);
99 }
100 }
101
103 static inline
104 void avx2_deinterleave64(double* dpl, double* dph, double* sp, int width)
105 {
106 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
107 {
108 __m256d a = _mm256_load_pd(sp);
109 __m256d b = _mm256_load_pd(sp + 4);
110 __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
111 __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
112 __m256d e = _mm256_shuffle_pd(c, d, 0x0);
113 __m256d f = _mm256_shuffle_pd(c, d, 0xF);
114 _mm256_store_pd(dpl, e);
115 _mm256_store_pd(dph, f);
116 }
117 }
118
120 static inline
121 void avx2_interleave64(double* dp, double* spl, double* sph, int width)
122 {
123 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
124 {
125 __m256d a = _mm256_load_pd(spl);
126 __m256d b = _mm256_load_pd(sph);
127 __m256d c = _mm256_unpacklo_pd(a, b);
128 __m256d d = _mm256_unpackhi_pd(a, b);
129 __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
130 __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
131 _mm256_store_pd(dp, e);
132 _mm256_store_pd(dp + 4, f);
133 }
134 }
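    // Illustrative scalar sketch (hypothetical helper, for exposition only): the
    // inverse of the split above, and the scalar meaning of both interleave
    // routines; the double-precision pair follows the same even/odd pattern with
    // four 64-bit lanes per register.
    static inline
    void illustrative_interleave32(float* dp, const float* spl, const float* sph,
                                   int width)
    {
      for (int i = 0; 2 * i < width; ++i)
      {
        dp[2 * i] = spl[i];           // low-pass half  -> even samples
        if (2 * i + 1 < width)
          dp[2 * i + 1] = sph[i];     // high-pass half -> odd samples
      }
    }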
135
137 static
138 void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
139 const line_buf* other, const line_buf* aug,
140 ui32 repeat, bool synthesis)
141 {
142 const si32 a = s->rev.Aatk;
143 const si32 b = s->rev.Batk;
144 const ui8 e = s->rev.Eatk;
145 __m256i va = _mm256_set1_epi32(a);
146 __m256i vb = _mm256_set1_epi32(b);
147
148 si32* dst = aug->i32;
149 const si32* src1 = sig->i32, * src2 = other->i32;
150 // The general definition of the wavelet in Part 2 is slightly
151 // different from that in Part 1, although they are mathematically
152 // equivalent; here, we identify the simpler form from Part 1 and employ it
153 if (a == 1)
154 { // 5/3 update and any case with a == 1
155 int i = (int)repeat;
156 if (synthesis)
157 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
158 {
159 __m256i s1 = _mm256_load_si256((__m256i*)src1);
160 __m256i s2 = _mm256_load_si256((__m256i*)src2);
161 __m256i d = _mm256_load_si256((__m256i*)dst);
162 __m256i t = _mm256_add_epi32(s1, s2);
163 __m256i v = _mm256_add_epi32(vb, t);
164 __m256i w = _mm256_srai_epi32(v, e);
165 d = _mm256_sub_epi32(d, w);
166 _mm256_store_si256((__m256i*)dst, d);
167 }
168 else
169 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
170 {
171 __m256i s1 = _mm256_load_si256((__m256i*)src1);
172 __m256i s2 = _mm256_load_si256((__m256i*)src2);
173 __m256i d = _mm256_load_si256((__m256i*)dst);
174 __m256i t = _mm256_add_epi32(s1, s2);
175 __m256i v = _mm256_add_epi32(vb, t);
176 __m256i w = _mm256_srai_epi32(v, e);
177 d = _mm256_add_epi32(d, w);
178 _mm256_store_si256((__m256i*)dst, d);
179 }
180 }
181 else if (a == -1 && b == 1 && e == 1)
182 { // 5/3 predict
183 int i = (int)repeat;
184 if (synthesis)
185 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
186 {
187 __m256i s1 = _mm256_load_si256((__m256i*)src1);
188 __m256i s2 = _mm256_load_si256((__m256i*)src2);
189 __m256i d = _mm256_load_si256((__m256i*)dst);
190 __m256i t = _mm256_add_epi32(s1, s2);
191 __m256i w = _mm256_srai_epi32(t, e);
192 d = _mm256_add_epi32(d, w);
193 _mm256_store_si256((__m256i*)dst, d);
194 }
195 else
196 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
197 {
198 __m256i s1 = _mm256_load_si256((__m256i*)src1);
199 __m256i s2 = _mm256_load_si256((__m256i*)src2);
200 __m256i d = _mm256_load_si256((__m256i*)dst);
201 __m256i t = _mm256_add_epi32(s1, s2);
202 __m256i w = _mm256_srai_epi32(t, e);
203 d = _mm256_sub_epi32(d, w);
204 _mm256_store_si256((__m256i*)dst, d);
205 }
206 }
207 else if (a == -1)
208 { // any case with a == -1, which is not 5/3 predict
209 int i = (int)repeat;
210 if (synthesis)
211 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
212 {
213 __m256i s1 = _mm256_load_si256((__m256i*)src1);
214 __m256i s2 = _mm256_load_si256((__m256i*)src2);
215 __m256i d = _mm256_load_si256((__m256i*)dst);
216 __m256i t = _mm256_add_epi32(s1, s2);
217 __m256i v = _mm256_sub_epi32(vb, t);
218 __m256i w = _mm256_srai_epi32(v, e);
219 d = _mm256_sub_epi32(d, w);
220 _mm256_store_si256((__m256i*)dst, d);
221 }
222 else
223 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
224 {
225 __m256i s1 = _mm256_load_si256((__m256i*)src1);
226 __m256i s2 = _mm256_load_si256((__m256i*)src2);
227 __m256i d = _mm256_load_si256((__m256i*)dst);
228 __m256i t = _mm256_add_epi32(s1, s2);
229 __m256i v = _mm256_sub_epi32(vb, t);
230 __m256i w = _mm256_srai_epi32(v, e);
231 d = _mm256_add_epi32(d, w);
232 _mm256_store_si256((__m256i*)dst, d);
233 }
234 }
235 else { // general case
236 int i = (int)repeat;
237 if (synthesis)
238 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
239 {
240 __m256i s1 = _mm256_load_si256((__m256i*)src1);
241 __m256i s2 = _mm256_load_si256((__m256i*)src2);
242 __m256i d = _mm256_load_si256((__m256i*)dst);
243 __m256i t = _mm256_add_epi32(s1, s2);
244 __m256i u = _mm256_mullo_epi32(va, t);
245 __m256i v = _mm256_add_epi32(vb, u);
246 __m256i w = _mm256_srai_epi32(v, e);
247 d = _mm256_sub_epi32(d, w);
248 _mm256_store_si256((__m256i*)dst, d);
249 }
250 else
251 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
252 {
253 __m256i s1 = _mm256_load_si256((__m256i*)src1);
254 __m256i s2 = _mm256_load_si256((__m256i*)src2);
255 __m256i d = _mm256_load_si256((__m256i*)dst);
256 __m256i t = _mm256_add_epi32(s1, s2);
257 __m256i u = _mm256_mullo_epi32(va, t);
258 __m256i v = _mm256_add_epi32(vb, u);
259 __m256i w = _mm256_srai_epi32(v, e);
260 d = _mm256_add_epi32(d, w);
261 _mm256_store_si256((__m256i*)dst, d);
262 }
263 }
264 }
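    // Illustrative scalar sketch (hypothetical helper, for exposition only): the
    // operation every branch above vectorizes. One reversible lifting step updates
    // the "aug" line from its two vertical neighbours as
    //   analysis:   aug[i] += (b + a * (sig[i] + other[i])) >> e
    //   synthesis:  aug[i] -= (b + a * (sig[i] + other[i])) >> e
    // The a == 1 and a == -1 branches are only specializations that avoid the
    // multiplication; a = -1, b = 1, e = 1 reduces to the 5/3 predict step.
    static inline
    void illustrative_rev_vert_step32(si32 a, si32 b, ui8 e, const si32* sig,
                                      const si32* other, si32* aug, ui32 repeat,
                                      bool synthesis)
    {
      for (ui32 i = 0; i < repeat; ++i)
      {
        si32 w = (b + a * (sig[i] + other[i])) >> e;
        aug[i] = synthesis ? aug[i] - w : aug[i] + w;
      }
    }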
265
267 static
268 void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
269 const line_buf* other, const line_buf* aug,
270 ui32 repeat, bool synthesis)
271 {
272 const si32 a = s->rev.Aatk;
273 const si32 b = s->rev.Batk;
274 const ui8 e = s->rev.Eatk;
275 __m256i vb = _mm256_set1_epi64x(b);
276 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
277
278 si64* dst = aug->i64;
279 const si64* src1 = sig->i64, * src2 = other->i64;
280 // The general definition of the wavelet in Part 2 is slightly
281 // different from that in Part 1, although they are mathematically
282 // equivalent; here, we identify the simpler form from Part 1 and employ it
283 if (a == 1)
284 { // 5/3 update and any case with a == 1
285 int i = (int)repeat;
286 if (synthesis)
287 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
288 {
289 __m256i s1 = _mm256_load_si256((__m256i*)src1);
290 __m256i s2 = _mm256_load_si256((__m256i*)src2);
291 __m256i d = _mm256_load_si256((__m256i*)dst);
292 __m256i t = _mm256_add_epi64(s1, s2);
293 __m256i v = _mm256_add_epi64(vb, t);
294 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
295 d = _mm256_sub_epi64(d, w);
296 _mm256_store_si256((__m256i*)dst, d);
297 }
298 else
299 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
300 {
301 __m256i s1 = _mm256_load_si256((__m256i*)src1);
302 __m256i s2 = _mm256_load_si256((__m256i*)src2);
303 __m256i d = _mm256_load_si256((__m256i*)dst);
304 __m256i t = _mm256_add_epi64(s1, s2);
305 __m256i v = _mm256_add_epi64(vb, t);
306 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
307 d = _mm256_add_epi64(d, w);
308 _mm256_store_si256((__m256i*)dst, d);
309 }
310 }
311 else if (a == -1 && b == 1 && e == 1)
312 { // 5/3 predict
313 int i = (int)repeat;
314 if (synthesis)
315 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
316 {
317 __m256i s1 = _mm256_load_si256((__m256i*)src1);
318 __m256i s2 = _mm256_load_si256((__m256i*)src2);
319 __m256i d = _mm256_load_si256((__m256i*)dst);
320 __m256i t = _mm256_add_epi64(s1, s2);
321 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
322 d = _mm256_add_epi64(d, w);
323 _mm256_store_si256((__m256i*)dst, d);
324 }
325 else
326 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
327 {
328 __m256i s1 = _mm256_load_si256((__m256i*)src1);
329 __m256i s2 = _mm256_load_si256((__m256i*)src2);
330 __m256i d = _mm256_load_si256((__m256i*)dst);
331 __m256i t = _mm256_add_epi64(s1, s2);
332 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
333 d = _mm256_sub_epi64(d, w);
334 _mm256_store_si256((__m256i*)dst, d);
335 }
336 }
337 else if (a == -1)
338 { // any case with a == -1, which is not 5/3 predict
339 int i = (int)repeat;
340 if (synthesis)
341 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
342 {
343 __m256i s1 = _mm256_load_si256((__m256i*)src1);
344 __m256i s2 = _mm256_load_si256((__m256i*)src2);
345 __m256i d = _mm256_load_si256((__m256i*)dst);
346 __m256i t = _mm256_add_epi64(s1, s2);
347 __m256i v = _mm256_sub_epi64(vb, t);
348 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
349 d = _mm256_sub_epi64(d, w);
350 _mm256_store_si256((__m256i*)dst, d);
351 }
352 else
353 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
354 {
355 __m256i s1 = _mm256_load_si256((__m256i*)src1);
356 __m256i s2 = _mm256_load_si256((__m256i*)src2);
357 __m256i d = _mm256_load_si256((__m256i*)dst);
358 __m256i t = _mm256_add_epi64(s1, s2);
359 __m256i v = _mm256_sub_epi64(vb, t);
360 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
361 d = _mm256_add_epi64(d, w);
362 _mm256_store_si256((__m256i*)dst, d);
363 }
364 }
365 else { // general case
366 // 64-bit multiplication is not supported in AVX2; in particular,
367 // _mm256_mullo_epi64 requires AVX-512.
368 if (synthesis)
369 for (ui32 i = repeat; i > 0; --i)
370 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
371 else
372 for (ui32 i = repeat; i > 0; --i)
373 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
374 }
375 }
376
378 void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig,
379 const line_buf* other, const line_buf* aug,
380 ui32 repeat, bool synthesis)
381 {
382 if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
383 ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
384 ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
385 {
386 assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
387 (other == NULL || other->flags & line_buf::LFT_32BIT) &&
388 (aug == NULL || aug->flags & line_buf::LFT_32BIT));
389 avx2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
390 }
391 else
392 {
393 assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
394 (other == NULL || other->flags & line_buf::LFT_64BIT) &&
395 (aug == NULL || aug->flags & line_buf::LFT_64BIT));
396 avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
397 }
398 }
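    // Illustrative sketch (hypothetical helper, for exposition only): the dispatch
    // rule used above in one place. A call is routed to the 32-bit kernel when any
    // provided line buffer is flagged LFT_32BIT; otherwise all buffers are expected
    // to carry 64-bit samples and the 64-bit kernel is used.
    static inline
    bool illustrative_uses_32bit(const line_buf* sig, const line_buf* other,
                                 const line_buf* aug)
    {
      return ((sig   != NULL) && (sig->flags   & line_buf::LFT_32BIT)) ||
             ((aug   != NULL) && (aug->flags   & line_buf::LFT_32BIT)) ||
             ((other != NULL) && (other->flags & line_buf::LFT_32BIT));
    }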
399
401 static
402 void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
403 const line_buf* hdst, const line_buf* src,
404 ui32 width, bool even)
405 {
406 if (width > 1)
407 {
408 // split src into ldst and hdst
409 {
410 float* dpl = even ? ldst->f32 : hdst->f32;
411 float* dph = even ? hdst->f32 : ldst->f32;
412 float* sp = src->f32;
413 int w = (int)width;
414 avx2_deinterleave32(dpl, dph, sp, w);
415 }
416
417 si32* hp = hdst->i32, * lp = ldst->i32;
418 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
419 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
420 ui32 num_steps = atk->get_num_steps();
421 for (ui32 j = num_steps; j > 0; --j)
422 {
423 // get the current lifting step
424 const lifting_step* s = atk->get_step(j - 1);
425 const si32 a = s->rev.Aatk;
426 const si32 b = s->rev.Batk;
427 const ui8 e = s->rev.Eatk;
428 __m256i va = _mm256_set1_epi32(a);
429 __m256i vb = _mm256_set1_epi32(b);
430
431 // extension
432 lp[-1] = lp[0];
433 lp[l_width] = lp[l_width - 1];
434 // lifting step
435 const si32* sp = lp;
436 si32* dp = hp;
437 if (a == 1)
438 { // 5/3 update and any case with a == 1
439 int i = (int)h_width;
440 if (even)
441 {
442 for (; i > 0; i -= 8, sp += 8, dp += 8)
443 {
444 __m256i s1 = _mm256_load_si256((__m256i*)sp);
445 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
446 __m256i d = _mm256_load_si256((__m256i*)dp);
447 __m256i t = _mm256_add_epi32(s1, s2);
448 __m256i v = _mm256_add_epi32(vb, t);
449 __m256i w = _mm256_srai_epi32(v, e);
450 d = _mm256_add_epi32(d, w);
451 _mm256_store_si256((__m256i*)dp, d);
452 }
453 }
454 else
455 {
456 for (; i > 0; i -= 8, sp += 8, dp += 8)
457 {
458 __m256i s1 = _mm256_load_si256((__m256i*)sp);
459 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
460 __m256i d = _mm256_load_si256((__m256i*)dp);
461 __m256i t = _mm256_add_epi32(s1, s2);
462 __m256i v = _mm256_add_epi32(vb, t);
463 __m256i w = _mm256_srai_epi32(v, e);
464 d = _mm256_add_epi32(d, w);
465 _mm256_store_si256((__m256i*)dp, d);
466 }
467 }
468 }
469 else if (a == -1 && b == 1 && e == 1)
470 { // 5/3 predict
471 int i = (int)h_width;
472 if (even)
473 for (; i > 0; i -= 8, sp += 8, dp += 8)
474 {
475 __m256i s1 = _mm256_load_si256((__m256i*)sp);
476 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
477 __m256i d = _mm256_load_si256((__m256i*)dp);
478 __m256i t = _mm256_add_epi32(s1, s2);
479 __m256i w = _mm256_srai_epi32(t, e);
480 d = _mm256_sub_epi32(d, w);
481 _mm256_store_si256((__m256i*)dp, d);
482 }
483 else
484 for (; i > 0; i -= 8, sp += 8, dp += 8)
485 {
486 __m256i s1 = _mm256_load_si256((__m256i*)sp);
487 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
488 __m256i d = _mm256_load_si256((__m256i*)dp);
489 __m256i t = _mm256_add_epi32(s1, s2);
490 __m256i w = _mm256_srai_epi32(t, e);
491 d = _mm256_sub_epi32(d, w);
492 _mm256_store_si256((__m256i*)dp, d);
493 }
494 }
495 else if (a == -1)
496 { // any case with a == -1, which is not 5/3 predict
497 int i = (int)h_width;
498 if (even)
499 for (; i > 0; i -= 8, sp += 8, dp += 8)
500 {
501 __m256i s1 = _mm256_load_si256((__m256i*)sp);
502 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
503 __m256i d = _mm256_load_si256((__m256i*)dp);
504 __m256i t = _mm256_add_epi32(s1, s2);
505 __m256i v = _mm256_sub_epi32(vb, t);
506 __m256i w = _mm256_srai_epi32(v, e);
507 d = _mm256_add_epi32(d, w);
508 _mm256_store_si256((__m256i*)dp, d);
509 }
510 else
511 for (; i > 0; i -= 8, sp += 8, dp += 8)
512 {
513 __m256i s1 = _mm256_load_si256((__m256i*)sp);
514 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
515 __m256i d = _mm256_load_si256((__m256i*)dp);
516 __m256i t = _mm256_add_epi32(s1, s2);
517 __m256i v = _mm256_sub_epi32(vb, t);
518 __m256i w = _mm256_srai_epi32(v, e);
519 d = _mm256_add_epi32(d, w);
520 _mm256_store_si256((__m256i*)dp, d);
521 }
522 }
523 else {
524 // general case
525 int i = (int)h_width;
526 if (even)
527 for (; i > 0; i -= 8, sp += 8, dp += 8)
528 {
529 __m256i s1 = _mm256_load_si256((__m256i*)sp);
530 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
531 __m256i d = _mm256_load_si256((__m256i*)dp);
532 __m256i t = _mm256_add_epi32(s1, s2);
533 __m256i u = _mm256_mullo_epi32(va, t);
534 __m256i v = _mm256_add_epi32(vb, u);
535 __m256i w = _mm256_srai_epi32(v, e);
536 d = _mm256_add_epi32(d, w);
537 _mm256_store_si256((__m256i*)dp, d);
538 }
539 else
540 for (; i > 0; i -= 8, sp += 8, dp += 8)
541 {
542 __m256i s1 = _mm256_load_si256((__m256i*)sp);
543 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
544 __m256i d = _mm256_load_si256((__m256i*)dp);
545 __m256i t = _mm256_add_epi32(s1, s2);
546 __m256i u = _mm256_mullo_epi32(va, t);
547 __m256i v = _mm256_add_epi32(vb, u);
548 __m256i w = _mm256_srai_epi32(v, e);
549 d = _mm256_add_epi32(d, w);
550 _mm256_store_si256((__m256i*)dp, d);
551 }
552 }
553
554 // swap buffers
555 si32* t = lp; lp = hp; hp = t;
556 even = !even;
557 ui32 w = l_width; l_width = h_width; h_width = w;
558 }
559 }
560 else {
561 if (even)
562 ldst->i32[0] = src->i32[0];
563 else
564 hdst->i32[0] = src->i32[0] << 1;
565 }
566 }
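    // Illustrative scalar sketch (hypothetical helper, for exposition only): the
    // per-step pattern the analysis function above vectorizes. After the even/odd
    // split, each lifting step extends the current "low" buffer by one sample on
    // each side and then updates every "high" sample from its two horizontal
    // neighbours; the low/high roles and the parity swap after every step. The
    // indexing below assumes the same one-sample extension performed above.
    static inline
    void illustrative_horz_ana_step32(si32 a, si32 b, ui8 e, const si32* lp,
                                      si32* hp, ui32 h_width, bool even)
    {
      for (ui32 i = 0; i < h_width; ++i)
      {
        si32 sum = even ? (lp[i] + lp[i + 1])  // neighbours to the right
                        : (lp[i - 1] + lp[i]); // neighbours to the left
        hp[i] += (b + a * sum) >> e;
      }
    }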
567
569 static
570 void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
571 const line_buf* hdst, const line_buf* src,
572 ui32 width, bool even)
573 {
574 if (width > 1)
575 {
576 // split src into ldst and hdst
577 {
578 double* dpl = (double*)(even ? ldst->p : hdst->p);
579 double* dph = (double*)(even ? hdst->p : ldst->p);
580 double* sp = (double*)src->p;
581 int w = (int)width;
582 avx2_deinterleave64(dpl, dph, sp, w);
583 }
584
585 si64* hp = hdst->i64, * lp = ldst->i64;
586 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
587 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
588 ui32 num_steps = atk->get_num_steps();
589 for (ui32 j = num_steps; j > 0; --j)
590 {
591 // get the current lifting step
592 const lifting_step* s = atk->get_step(j - 1);
593 const si32 a = s->rev.Aatk;
594 const si32 b = s->rev.Batk;
595 const ui8 e = s->rev.Eatk;
596 __m256i vb = _mm256_set1_epi64x(b);
597 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
598
599 // extension
600 lp[-1] = lp[0];
601 lp[l_width] = lp[l_width - 1];
602 // lifting step
603 const si64* sp = lp;
604 si64* dp = hp;
605 if (a == 1)
606 { // 5/3 update and any case with a == 1
607 int i = (int)h_width;
608 if (even)
609 {
610 for (; i > 0; i -= 4, sp += 4, dp += 4)
611 {
612 __m256i s1 = _mm256_load_si256((__m256i*)sp);
613 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
614 __m256i d = _mm256_load_si256((__m256i*)dp);
615 __m256i t = _mm256_add_epi64(s1, s2);
616 __m256i v = _mm256_add_epi64(vb, t);
617 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
618 d = _mm256_add_epi64(d, w);
619 _mm256_store_si256((__m256i*)dp, d);
620 }
621 }
622 else
623 {
624 for (; i > 0; i -= 4, sp += 4, dp += 4)
625 {
626 __m256i s1 = _mm256_load_si256((__m256i*)sp);
627 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
628 __m256i d = _mm256_load_si256((__m256i*)dp);
629 __m256i t = _mm256_add_epi64(s1, s2);
630 __m256i v = _mm256_add_epi64(vb, t);
631 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
632 d = _mm256_add_epi64(d, w);
633 _mm256_store_si256((__m256i*)dp, d);
634 }
635 }
636 }
637 else if (a == -1 && b == 1 && e == 1)
638 { // 5/3 predict
639 int i = (int)h_width;
640 if (even)
641 for (; i > 0; i -= 4, sp += 4, dp += 4)
642 {
643 __m256i s1 = _mm256_load_si256((__m256i*)sp);
644 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
645 __m256i d = _mm256_load_si256((__m256i*)dp);
646 __m256i t = _mm256_add_epi64(s1, s2);
647 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
648 d = _mm256_sub_epi64(d, w);
649 _mm256_store_si256((__m256i*)dp, d);
650 }
651 else
652 for (; i > 0; i -= 4, sp += 4, dp += 4)
653 {
654 __m256i s1 = _mm256_load_si256((__m256i*)sp);
655 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
656 __m256i d = _mm256_load_si256((__m256i*)dp);
657 __m256i t = _mm256_add_epi64(s1, s2);
658 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
659 d = _mm256_sub_epi64(d, w);
660 _mm256_store_si256((__m256i*)dp, d);
661 }
662 }
663 else if (a == -1)
664 { // any case with a == -1, which is not 5/3 predict
665 int i = (int)h_width;
666 if (even)
667 for (; i > 0; i -= 4, sp += 4, dp += 4)
668 {
669 __m256i s1 = _mm256_load_si256((__m256i*)sp);
670 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
671 __m256i d = _mm256_load_si256((__m256i*)dp);
672 __m256i t = _mm256_add_epi64(s1, s2);
673 __m256i v = _mm256_sub_epi64(vb, t);
674 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
675 d = _mm256_add_epi64(d, w);
676 _mm256_store_si256((__m256i*)dp, d);
677 }
678 else
679 for (; i > 0; i -= 4, sp += 4, dp += 4)
680 {
681 __m256i s1 = _mm256_load_si256((__m256i*)sp);
682 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
683 __m256i d = _mm256_load_si256((__m256i*)dp);
684 __m256i t = _mm256_add_epi64(s1, s2);
685 __m256i v = _mm256_sub_epi64(vb, t);
686 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
687 d = _mm256_add_epi64(d, w);
688 _mm256_store_si256((__m256i*)dp, d);
689 }
690 }
691 else {
692 // general case
693 // 64-bit multiplication is not supported in AVX2; in particular,
694 // _mm256_mullo_epi64 requires AVX-512.
695 if (even)
696 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
697 *dp += (b + a * (sp[0] + sp[1])) >> e;
698 else
699 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
700 *dp += (b + a * (sp[-1] + sp[0])) >> e;
701 }
702
703 // swap buffers
704 si64* t = lp; lp = hp; hp = t;
705 even = !even;
706 ui32 w = l_width; l_width = h_width; h_width = w;
707 }
708 }
709 else {
710 if (even)
711 ldst->i64[0] = src->i64[0];
712 else
713 hdst->i64[0] = src->i64[0] << 1;
714 }
715 }
716
718 void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
719 const line_buf* hdst, const line_buf* src,
720 ui32 width, bool even)
721 {
722 if (src->flags & line_buf::LFT_32BIT)
723 {
724 assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
725 (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
726 avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
727 }
728 else
729 {
730 assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
731 (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) &&
732 (src == NULL || src->flags & line_buf::LFT_64BIT));
733 avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
734 }
735 }
736
738 static
739 void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
740 const line_buf* lsrc, const line_buf* hsrc,
741 ui32 width, bool even)
742 {
743 if (width > 1)
744 {
745 bool ev = even;
746 si32* oth = hsrc->i32, * aug = lsrc->i32;
747 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
748 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
749 ui32 num_steps = atk->get_num_steps();
750 for (ui32 j = 0; j < num_steps; ++j)
751 {
752 const lifting_step* s = atk->get_step(j);
753 const si32 a = s->rev.Aatk;
754 const si32 b = s->rev.Batk;
755 const ui8 e = s->rev.Eatk;
756 __m256i va = _mm256_set1_epi32(a);
757 __m256i vb = _mm256_set1_epi32(b);
758
759 // extension
760 oth[-1] = oth[0];
761 oth[oth_width] = oth[oth_width - 1];
762 // lifting step
763 const si32* sp = oth;
764 si32* dp = aug;
765 if (a == 1)
766 { // 5/3 update and any case with a == 1
767 int i = (int)aug_width;
768 if (ev)
769 {
770 for (; i > 0; i -= 8, sp += 8, dp += 8)
771 {
772 __m256i s1 = _mm256_load_si256((__m256i*)sp);
773 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
774 __m256i d = _mm256_load_si256((__m256i*)dp);
775 __m256i t = _mm256_add_epi32(s1, s2);
776 __m256i v = _mm256_add_epi32(vb, t);
777 __m256i w = _mm256_srai_epi32(v, e);
778 d = _mm256_sub_epi32(d, w);
779 _mm256_store_si256((__m256i*)dp, d);
780 }
781 }
782 else
783 {
784 for (; i > 0; i -= 8, sp += 8, dp += 8)
785 {
786 __m256i s1 = _mm256_load_si256((__m256i*)sp);
787 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
788 __m256i d = _mm256_load_si256((__m256i*)dp);
789 __m256i t = _mm256_add_epi32(s1, s2);
790 __m256i v = _mm256_add_epi32(vb, t);
791 __m256i w = _mm256_srai_epi32(v, e);
792 d = _mm256_sub_epi32(d, w);
793 _mm256_store_si256((__m256i*)dp, d);
794 }
795 }
796 }
797 else if (a == -1 && b == 1 && e == 1)
798 { // 5/3 predict
799 int i = (int)aug_width;
800 if (ev)
801 for (; i > 0; i -= 8, sp += 8, dp += 8)
802 {
803 __m256i s1 = _mm256_load_si256((__m256i*)sp);
804 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
805 __m256i d = _mm256_load_si256((__m256i*)dp);
806 __m256i t = _mm256_add_epi32(s1, s2);
807 __m256i w = _mm256_srai_epi32(t, e);
808 d = _mm256_add_epi32(d, w);
809 _mm256_store_si256((__m256i*)dp, d);
810 }
811 else
812 for (; i > 0; i -= 8, sp += 8, dp += 8)
813 {
814 __m256i s1 = _mm256_load_si256((__m256i*)sp);
815 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
816 __m256i d = _mm256_load_si256((__m256i*)dp);
817 __m256i t = _mm256_add_epi32(s1, s2);
818 __m256i w = _mm256_srai_epi32(t, e);
819 d = _mm256_add_epi32(d, w);
820 _mm256_store_si256((__m256i*)dp, d);
821 }
822 }
823 else if (a == -1)
824 { // any case with a == -1, which is not 5/3 predict
825 int i = (int)aug_width;
826 if (ev)
827 for (; i > 0; i -= 8, sp += 8, dp += 8)
828 {
829 __m256i s1 = _mm256_load_si256((__m256i*)sp);
830 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
831 __m256i d = _mm256_load_si256((__m256i*)dp);
832 __m256i t = _mm256_add_epi32(s1, s2);
833 __m256i v = _mm256_sub_epi32(vb, t);
834 __m256i w = _mm256_srai_epi32(v, e);
835 d = _mm256_sub_epi32(d, w);
836 _mm256_store_si256((__m256i*)dp, d);
837 }
838 else
839 for (; i > 0; i -= 8, sp += 8, dp += 8)
840 {
841 __m256i s1 = _mm256_load_si256((__m256i*)sp);
842 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
843 __m256i d = _mm256_load_si256((__m256i*)dp);
844 __m256i t = _mm256_add_epi32(s1, s2);
845 __m256i v = _mm256_sub_epi32(vb, t);
846 __m256i w = _mm256_srai_epi32(v, e);
847 d = _mm256_sub_epi32(d, w);
848 _mm256_store_si256((__m256i*)dp, d);
849 }
850 }
851 else {
852 // general case
853 int i = (int)aug_width;
854 if (ev)
855 for (; i > 0; i -= 8, sp += 8, dp += 8)
856 {
857 __m256i s1 = _mm256_load_si256((__m256i*)sp);
858 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
859 __m256i d = _mm256_load_si256((__m256i*)dp);
860 __m256i t = _mm256_add_epi32(s1, s2);
861 __m256i u = _mm256_mullo_epi32(va, t);
862 __m256i v = _mm256_add_epi32(vb, u);
863 __m256i w = _mm256_srai_epi32(v, e);
864 d = _mm256_sub_epi32(d, w);
865 _mm256_store_si256((__m256i*)dp, d);
866 }
867 else
868 for (; i > 0; i -= 8, sp += 8, dp += 8)
869 {
870 __m256i s1 = _mm256_load_si256((__m256i*)sp);
871 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
872 __m256i d = _mm256_load_si256((__m256i*)dp);
873 __m256i t = _mm256_add_epi32(s1, s2);
874 __m256i u = _mm256_mullo_epi32(va, t);
875 __m256i v = _mm256_add_epi32(vb, u);
876 __m256i w = _mm256_srai_epi32(v, e);
877 d = _mm256_sub_epi32(d, w);
878 _mm256_store_si256((__m256i*)dp, d);
879 }
880 }
881
882 // swap buffers
883 si32* t = aug; aug = oth; oth = t;
884 ev = !ev;
885 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
886 }
887
888 // combine both lsrc and hsrc into dst
889 {
890 float* dp = dst->f32;
891 float* spl = even ? lsrc->f32 : hsrc->f32;
892 float* sph = even ? hsrc->f32 : lsrc->f32;
893 int w = (int)width;
894 avx2_interleave32(dp, spl, sph, w);
895 }
896 }
897 else {
898 if (even)
899 dst->i32[0] = lsrc->i32[0];
900 else
901 dst->i32[0] = hsrc->i32[0] >> 1;
902 }
903 }
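    // Illustrative scalar sketch (hypothetical helper, for exposition only): one
    // synthesis lifting step as performed above. Synthesis visits the steps in the
    // opposite order to analysis and subtracts exactly what analysis added, which
    // is what keeps the reversible horizontal transform lossless before the two
    // halves are re-interleaved into dst.
    static inline
    void illustrative_horz_syn_step32(si32 a, si32 b, ui8 e, const si32* oth,
                                      si32* aug, ui32 aug_width, bool ev)
    {
      for (ui32 i = 0; i < aug_width; ++i)
      {
        si32 sum = ev ? (oth[i - 1] + oth[i])  // even phase: left neighbours
                      : (oth[i] + oth[i + 1]); // odd phase: right neighbours
        aug[i] -= (b + a * sum) >> e;
      }
    }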
904
906 static
907 void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
908 const line_buf* lsrc, const line_buf* hsrc,
909 ui32 width, bool even)
910 {
911 if (width > 1)
912 {
913 bool ev = even;
914 si64* oth = hsrc->i64, * aug = lsrc->i64;
915 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
916 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
917 ui32 num_steps = atk->get_num_steps();
918 for (ui32 j = 0; j < num_steps; ++j)
919 {
920 const lifting_step* s = atk->get_step(j);
921 const si32 a = s->rev.Aatk;
922 const si32 b = s->rev.Batk;
923 const ui8 e = s->rev.Eatk;
924 __m256i vb = _mm256_set1_epi64x(b);
925 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
926
927 // extension
928 oth[-1] = oth[0];
929 oth[oth_width] = oth[oth_width - 1];
930 // lifting step
931 const si64* sp = oth;
932 si64* dp = aug;
933 if (a == 1)
934 { // 5/3 update and any case with a == 1
935 int i = (int)aug_width;
936 if (ev)
937 {
938 for (; i > 0; i -= 4, sp += 4, dp += 4)
939 {
940 __m256i s1 = _mm256_load_si256((__m256i*)sp);
941 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
942 __m256i d = _mm256_load_si256((__m256i*)dp);
943 __m256i t = _mm256_add_epi64(s1, s2);
944 __m256i v = _mm256_add_epi64(vb, t);
945 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
946 d = _mm256_sub_epi64(d, w);
947 _mm256_store_si256((__m256i*)dp, d);
948 }
949 }
950 else
951 {
952 for (; i > 0; i -= 4, sp += 4, dp += 4)
953 {
954 __m256i s1 = _mm256_load_si256((__m256i*)sp);
955 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
956 __m256i d = _mm256_load_si256((__m256i*)dp);
957 __m256i t = _mm256_add_epi64(s1, s2);
958 __m256i v = _mm256_add_epi64(vb, t);
959 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
960 d = _mm256_sub_epi64(d, w);
961 _mm256_store_si256((__m256i*)dp, d);
962 }
963 }
964 }
965 else if (a == -1 && b == 1 && e == 1)
966 { // 5/3 predict
967 int i = (int)aug_width;
968 if (ev)
969 for (; i > 0; i -= 4, sp += 4, dp += 4)
970 {
971 __m256i s1 = _mm256_load_si256((__m256i*)sp);
972 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
973 __m256i d = _mm256_load_si256((__m256i*)dp);
974 __m256i t = _mm256_add_epi64(s1, s2);
975 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
976 d = _mm256_add_epi64(d, w);
977 _mm256_store_si256((__m256i*)dp, d);
978 }
979 else
980 for (; i > 0; i -= 4, sp += 4, dp += 4)
981 {
982 __m256i s1 = _mm256_load_si256((__m256i*)sp);
983 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
984 __m256i d = _mm256_load_si256((__m256i*)dp);
985 __m256i t = _mm256_add_epi64(s1, s2);
986 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
987 d = _mm256_add_epi64(d, w);
988 _mm256_store_si256((__m256i*)dp, d);
989 }
990 }
991 else if (a == -1)
992 { // any case with a == -1, which is not 5/3 predict
993 int i = (int)aug_width;
994 if (ev)
995 for (; i > 0; i -= 4, sp += 4, dp += 4)
996 {
997 __m256i s1 = _mm256_load_si256((__m256i*)sp);
998 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
999 __m256i d = _mm256_load_si256((__m256i*)dp);
1000 __m256i t = _mm256_add_epi64(s1, s2);
1001 __m256i v = _mm256_sub_epi64(vb, t);
1002 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
1003 d = _mm256_sub_epi64(d, w);
1004 _mm256_store_si256((__m256i*)dp, d);
1005 }
1006 else
1007 for (; i > 0; i -= 4, sp += 4, dp += 4)
1008 {
1009 __m256i s1 = _mm256_load_si256((__m256i*)sp);
1010 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
1011 __m256i d = _mm256_load_si256((__m256i*)dp);
1012 __m256i t = _mm256_add_epi64(s1, s2);
1013 __m256i v = _mm256_sub_epi64(vb, t);
1014 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
1015 d = _mm256_sub_epi64(d, w);
1016 _mm256_store_si256((__m256i*)dp, d);
1017 }
1018 }
1019 else {
1020 // general case
1021 // 64-bit multiplication is not supported in AVX2; in particular,
1022 // _mm256_mullo_epi64 requires AVX-512.
1023 if (ev)
1024 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
1025 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
1026 else
1027 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
1028 *dp -= (b + a * (sp[0] + sp[1])) >> e;
1029 }
1030
1031 // swap buffers
1032 si64* t = aug; aug = oth; oth = t;
1033 ev = !ev;
1034 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1035 }
1036
1037 // combine both lsrc and hsrc into dst
1038 {
1039 double* dp = (double*)dst->p;
1040 double* spl = (double*)(even ? lsrc->p : hsrc->p);
1041 double* sph = (double*)(even ? hsrc->p : lsrc->p);
1042 int w = (int)width;
1043 avx2_interleave64(dp, spl, sph, w);
1044 }
1045 }
1046 else {
1047 if (even)
1048 dst->i64[0] = lsrc->i64[0];
1049 else
1050 dst->i64[0] = hsrc->i64[0] >> 1;
1051 }
1052 }
1053
1055 void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
1056 const line_buf* lsrc, const line_buf* hsrc,
1057 ui32 width, bool even)
1058 {
1059 if (dst->flags & line_buf::LFT_32BIT)
1060 {
1061 assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
1062 (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
1063 avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
1064 }
1065 else
1066 {
1067 assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
1068 (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
1069 (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
1070 avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
1071 }
1072 }
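    // Illustrative sketch (hypothetical helper, for exposition only): why the
    // reversible path reconstructs bit-exactly. Every lifting step adds an integer
    // that depends only on samples the step leaves untouched, so the matching
    // synthesis step can recompute and subtract the identical integer.
    static inline
    void illustrative_lifting_roundtrip(si32 a, si32 b, ui8 e,
                                        si32 x, si32 n0, si32 n1)
    {
      si32 w = (b + a * (n0 + n1)) >> e; // computed from the untouched neighbours
      si32 y = x + w;                    // analysis update
      si32 z = y - w;                    // synthesis update
      assert(z == x);                    // perfect reconstruction
      (void)z;
    }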
1073
1074 } // !local
1075} // !ojph