OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_colour_avx2.cpp
//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
// File: ojph_colour_avx2.cpp
// Author: Aous Naman
// Date: 11 October 2019
//***************************************************************************/

#include <climits>
#include <cmath>

#include "ojph_defs.h"
#include "ojph_arch.h"
#include "ojph_mem.h"
#include "ojph_colour.h"

#include <immintrin.h>

namespace ojph {
  namespace local {

    //////////////////////////////////////////////////////////////////////////
    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
    static inline
    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
    {
      // note that m must be obtained using
      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
      __m256i x = _mm256_srli_epi64(a, amt);
      x = _mm256_xor_si256(x, m);
      __m256i result = _mm256_sub_epi64(x, m);
      return result;
    }
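    // For illustration (not from the original source): a scalar sketch of
    // the trick above. AVX2 has no 64-bit arithmetic right shift, so the
    // value is shifted logically and then sign-extended with an xor/sub
    // against the shifted sign-bit mask m:
    //   ui64 m = 1ULL << (63 - amt);    // where the sign bit lands
    //   ui64 x = (ui64)v >> amt;        // logical shift
    //   si64 r = (si64)((x ^ m) - m);   // equals v >> amt (arithmetic)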

    //////////////////////////////////////////////////////////////////////////
    void avx2_rev_convert(const line_buf *src_line,
                          const ui32 src_line_offset,
                          line_buf *dst_line,
                          const ui32 dst_line_offset,
                          si64 shift, ui32 width)
    {
      if (src_line->flags & line_buf::LFT_32BIT)
      {
        if (dst_line->flags & line_buf::LFT_32BIT)
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si32 *dp = dst_line->i32 + dst_line_offset;
          __m256i sh = _mm256_set1_epi32((si32)shift);
          for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
          {
            __m256i s = _mm256_loadu_si256((__m256i*)sp);
            s = _mm256_add_epi32(s, sh);
            _mm256_storeu_si256((__m256i*)dp, s);
          }
        }
        else
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si64 *dp = dst_line->i64 + dst_line_offset;
          __m256i sh = _mm256_set1_epi64x(shift);
          for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
          {
            __m256i s, t;
            s = _mm256_loadu_si256((__m256i*)sp);

            t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0));
            t = _mm256_add_epi64(t, sh);
            _mm256_storeu_si256((__m256i*)dp, t);

            t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1));
            t = _mm256_add_epi64(t, sh);
            _mm256_storeu_si256((__m256i*)dp + 1, t);
          }
        }
      }
      else
      {
        assert(src_line->flags & line_buf::LFT_64BIT);
        assert(dst_line->flags & line_buf::LFT_32BIT);
        const si64 *sp = src_line->i64 + src_line_offset;
        si32 *dp = dst_line->i32 + dst_line_offset;
        __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                             0, (si64)ULLONG_MAX);
        __m256i sh = _mm256_set1_epi64x(shift);
        for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
        {
          __m256i s, t;
          s = _mm256_loadu_si256((__m256i*)sp);
          s = _mm256_add_epi64(s, sh);

          t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
          t = _mm256_and_si256(low_bits, t);

          s = _mm256_loadu_si256((__m256i*)sp + 1);
          s = _mm256_add_epi64(s, sh);

          s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
          s = _mm256_andnot_si256(low_bits, s);

          t = _mm256_or_si256(s, t);
          t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
          _mm256_storeu_si256((__m256i*)dp, t);
        }
      }
    }
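    // For illustration (not from the original source): in the 64-bit branch
    // above, each iteration narrows eight si64 values to si32. The
    // shuffle/and pass keeps the low dwords of the first four values in the
    // lower half of each 128-bit lane, the shuffle/andnot pass places the
    // next four in the upper halves, the or merges them, and
    // _mm256_permute4x64_epi64 with _MM_SHUFFLE(3, 1, 2, 0) restores the
    // original element order across the two lanes.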

    //////////////////////////////////////////////////////////////////////////
    void avx2_rev_convert_nlt_type3(const line_buf *src_line,
                                    const ui32 src_line_offset,
                                    line_buf *dst_line,
                                    const ui32 dst_line_offset,
                                    si64 shift, ui32 width)
    {
      if (src_line->flags & line_buf::LFT_32BIT)
      {
        if (dst_line->flags & line_buf::LFT_32BIT)
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si32 *dp = dst_line->i32 + dst_line_offset;
          __m256i sh = _mm256_set1_epi32((si32)(-shift));
          __m256i zero = _mm256_setzero_si256();
          for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
          {
            __m256i s = _mm256_loadu_si256((__m256i*)sp);
            __m256i c = _mm256_cmpgt_epi32(zero, s);  // 0xFFFFFFFF for -ve val
            __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value
            v_m_sh = _mm256_and_si256(c, v_m_sh);     // keep only -shift-val
            s = _mm256_andnot_si256(c, s);            // keep only +ve or 0
            s = _mm256_or_si256(s, v_m_sh);           // combine
            _mm256_storeu_si256((__m256i*)dp, s);
          }
        }
        else
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si64 *dp = dst_line->i64 + dst_line_offset;
          __m256i sh = _mm256_set1_epi64x(-shift);
          __m256i zero = _mm256_setzero_si256();
          for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
          {
            __m256i s, t, u0, u1, c, v_m_sh;
            s = _mm256_loadu_si256((__m256i*)sp);

            t = _mm256_cmpgt_epi32(zero, s);      // find -ve 32bit -1
            u0 = _mm256_unpacklo_epi32(s, t);     // correct 64bit data
            c = _mm256_unpacklo_epi32(t, t);      // 64bit -1 for -ve value

            v_m_sh = _mm256_sub_epi64(sh, u0);    // - shift - value
            v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value
            u0 = _mm256_andnot_si256(c, u0);      // keep only +ve or 0
            u0 = _mm256_or_si256(u0, v_m_sh);     // combine

            u1 = _mm256_unpackhi_epi32(s, t);     // correct 64bit data
            c = _mm256_unpackhi_epi32(t, t);      // 64bit -1 for -ve value

            v_m_sh = _mm256_sub_epi64(sh, u1);    // - shift - value
            v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value
            u1 = _mm256_andnot_si256(c, u1);      // keep only +ve or 0
            u1 = _mm256_or_si256(u1, v_m_sh);     // combine

            t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0);
            _mm256_storeu_si256((__m256i*)dp, t);

            t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1);
            _mm256_storeu_si256((__m256i*)dp + 1, t);
          }
        }
      }
      else
      {
        assert(src_line->flags & line_buf::LFT_64BIT);
        assert(dst_line->flags & line_buf::LFT_32BIT);
        const si64 *sp = src_line->i64 + src_line_offset;
        si32 *dp = dst_line->i32 + dst_line_offset;
        __m256i sh = _mm256_set1_epi64x(-shift);
        __m256i zero = _mm256_setzero_si256();
        __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                              0, (si64)ULLONG_MAX);
        for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
        {
          // s for source, t for target, p for positive, n for negative,
          // m for mask, and tm for temp
          __m256i s, t, p, n, m, tm;
          s = _mm256_loadu_si256((__m256i*)sp);

          m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value
          tm = _mm256_sub_epi64(sh, s);    // - shift - value
          n = _mm256_and_si256(m, tm);     // -ve
          p = _mm256_andnot_si256(m, s);   // +ve
          tm = _mm256_or_si256(n, p);
          tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
          t = _mm256_and_si256(half_mask, tm);

          s = _mm256_loadu_si256((__m256i*)sp + 1);
          m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value
          tm = _mm256_sub_epi64(sh, s);    // - shift - value
          n = _mm256_and_si256(m, tm);     // -ve
          p = _mm256_andnot_si256(m, s);   // +ve
          tm = _mm256_or_si256(n, p);
          tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
          tm = _mm256_andnot_si256(half_mask, tm);

          t = _mm256_or_si256(t, tm);
          t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
          _mm256_storeu_si256((__m256i*)dp, t);
        }
      }
    }
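    // For illustration (not from the original source): the branchless logic
    // above implements the NLT type 3 mapping, which in scalar form is
    //   si64 map(si64 v, si64 shift) { return v < 0 ? -shift - v : v; }
    // applied identically in the 32-bit and 64-bit paths.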

    //////////////////////////////////////////////////////////////////////////
    static inline
    __m256i ojph_mm256_max_ge_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
    {
      // We must use one of _CMP_NLT_UQ, _CMP_GE_OQ, _CMP_GE_OS, or
      // _CMP_NLT_US; it is not clear which is preferable.
      __m256 ct = _mm256_cmp_ps(x, y, _CMP_NLT_UQ); // 0xFFFFFFFF for x >= y
      __m256i c = _mm256_castps_si256(ct);   // does not generate any code
      __m256i d = _mm256_and_si256(c, a);    // keep only a, where x >= y
      __m256i e = _mm256_andnot_si256(c, b); // keep only b, where x < y
      return _mm256_or_si256(d, e);          // combine
    }
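    // Note (not from the original source): the and/andnot/or triplet is a
    // branchless select; with AVX2 it could equally be written as
    // _mm256_blendv_epi8(b, a, c), which takes bytes from a wherever the
    // mask c has its high bits set.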

    //////////////////////////////////////////////////////////////////////////
    static inline
    __m256i ojph_mm256_min_lt_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
    {
      // We must use one of _CMP_LT_OQ, _CMP_NGE_UQ, _CMP_LT_OS, or
      // _CMP_NGE_US; it is not clear which is preferable.
      __m256 ct = _mm256_cmp_ps(x, y, _CMP_NGE_UQ); // 0xFFFFFFFF for x < y
      __m256i c = _mm256_castps_si256(ct);   // does not generate any code
      __m256i d = _mm256_and_si256(c, a);    // keep only a, where x < y
      __m256i e = _mm256_andnot_si256(c, b); // keep only b, where x >= y
      return _mm256_or_si256(d, e);          // combine
    }

    //////////////////////////////////////////////////////////////////////////
    template<bool NLT_TYPE3>
    static inline
    void local_avx2_irv_convert_to_integer(const line_buf *src_line,
                                           line_buf *dst_line,
                                           ui32 dst_line_offset,
                                           ui32 bit_depth, bool is_signed,
                                           ui32 width)
    {
      assert((src_line->flags & line_buf::LFT_32BIT) &&
             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
             (dst_line->flags & line_buf::LFT_32BIT) &&
             (dst_line->flags & line_buf::LFT_INTEGER));

      assert(bit_depth <= 32);
      const float* sp = src_line->f32;
      si32* dp = dst_line->i32 + dst_line_offset;
      // Converting to integer can exceed the dynamic range of a 32-bit
      // integer; therefore, care must be exercised.
      // We check whether the floating-point number lies outside the
      // half-closed interval [-0.5f, 0.5f). If so, we clamp the resulting
      // integer to the maximum/minimum that the bit depth supports.
      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
      __m256 mul = _mm256_set1_ps((float)(1ull << bit_depth));
      __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit); // val < upper
      __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit); // val >= lower
      __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
      __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));

      if (is_signed)
      {
        __m256i zero = _mm256_setzero_si256();
        __m256i bias =
          _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
          __m256 t = _mm256_loadu_ps(sp);
          t = _mm256_mul_ps(t, mul);
          __m256i u = _mm256_cvtps_epi32(t);
          u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
          u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
          if (NLT_TYPE3)
          {
            __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val
            __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value
            neg = _mm256_and_si256(c, neg);          // keep only - bias - val
            u = _mm256_andnot_si256(c, u);           // keep only +ve or 0
            u = _mm256_or_si256(neg, u);             // combine
          }
          _mm256_storeu_si256((__m256i*)dp, u);
        }
      }
      else
      {
        __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
          __m256 t = _mm256_loadu_ps(sp);
          t = _mm256_mul_ps(t, mul);
          __m256i u = _mm256_cvtps_epi32(t);
          u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
          u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
          u = _mm256_add_epi32(u, half);
          _mm256_storeu_si256((__m256i*)dp, u);
        }
      }
    }
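    // For illustration (not from the original source), with B = bit_depth,
    // the conversion above is roughly the scalar sequence
    //   v = round(f * 2^B);                     // f is in [-0.5f, 0.5f)
    //   v = clamp(v, INT_MIN >> (32 - B), INT_MAX >> (32 - B));
    //   if (is_signed) v = (NLT_TYPE3 && v < 0) ? -(2^(B-1) + 1) - v : v;
    //   else           v += 2^(B-1);            // move to unsigned range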

    //////////////////////////////////////////////////////////////////////////
    void avx2_irv_convert_to_integer(const line_buf *src_line,
                                     line_buf *dst_line, ui32 dst_line_offset,
                                     ui32 bit_depth, bool is_signed,
                                     ui32 width)
    {
      local_avx2_irv_convert_to_integer<false>(src_line, dst_line,
        dst_line_offset, bit_depth, is_signed, width);
    }

    //////////////////////////////////////////////////////////////////////////
    void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
                                     line_buf *dst_line, ui32 dst_line_offset,
                                     ui32 bit_depth, bool is_signed,
                                     ui32 width)
    {
      local_avx2_irv_convert_to_integer<true>(src_line, dst_line,
        dst_line_offset, bit_depth, is_signed, width);
    }

    //////////////////////////////////////////////////////////////////////////
    template<bool NLT_TYPE3>
    static inline
    void local_avx2_irv_convert_to_float(const line_buf *src_line,
                                         ui32 src_line_offset,
                                         line_buf *dst_line,
                                         ui32 bit_depth, bool is_signed,
                                         ui32 width)
    {
      assert((src_line->flags & line_buf::LFT_32BIT) &&
             (src_line->flags & line_buf::LFT_INTEGER) &&
             (dst_line->flags & line_buf::LFT_32BIT) &&
             (dst_line->flags & line_buf::LFT_INTEGER) == 0);

      assert(bit_depth <= 32);
      __m256 mul = _mm256_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));

      const si32* sp = src_line->i32 + src_line_offset;
      float* dp = dst_line->f32;
      if (is_signed)
      {
        __m256i zero = _mm256_setzero_si256();
        __m256i bias =
          _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
          __m256i t = _mm256_loadu_si256((__m256i*)sp);
          if (NLT_TYPE3)
          {
            __m256i c = _mm256_cmpgt_epi32(zero, t); // 0xFFFFFFFF for -ve val
            __m256i neg = _mm256_sub_epi32(bias, t); // - bias - value
            neg = _mm256_and_si256(c, neg);          // keep only - bias - val
            c = _mm256_andnot_si256(c, t);           // keep only +ve or 0
            t = _mm256_or_si256(neg, c);             // combine
          }
          __m256 v = _mm256_cvtepi32_ps(t);
          v = _mm256_mul_ps(v, mul);
          _mm256_storeu_ps(dp, v);
        }
      }
      else
      {
        __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
          __m256i t = _mm256_loadu_si256((__m256i*)sp);
          t = _mm256_sub_epi32(t, half);
          __m256 v = _mm256_cvtepi32_ps(t);
          v = _mm256_mul_ps(v, mul);
          _mm256_storeu_ps(dp, v);
        }
      }
    }
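    // For illustration (not from the original source), with B = bit_depth,
    // this is the inverse of local_avx2_irv_convert_to_integer:
    //   if (is_signed) v = (NLT_TYPE3 && v < 0) ? -(2^(B-1) + 1) - v : v;
    //   else           v -= 2^(B-1);
    //   f = (float)v / 2^B;                     // back into [-0.5f, 0.5f)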

    //////////////////////////////////////////////////////////////////////////
    void avx2_irv_convert_to_float(const line_buf *src_line,
                                   ui32 src_line_offset, line_buf *dst_line,
                                   ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_avx2_irv_convert_to_float<false>(src_line, src_line_offset,
        dst_line, bit_depth, is_signed, width);
    }

    //////////////////////////////////////////////////////////////////////////
    void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
                                   ui32 src_line_offset, line_buf *dst_line,
                                   ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_avx2_irv_convert_to_float<true>(src_line, src_line_offset,
        dst_line, bit_depth, is_signed, width);
    }


    //////////////////////////////////////////////////////////////////////////
    void avx2_rct_forward(const line_buf *r,
                          const line_buf *g,
                          const line_buf *b,
                          line_buf *y, line_buf *cb, line_buf *cr,
                          ui32 repeat)
    {
      assert((y->flags & line_buf::LFT_INTEGER) &&
             (cb->flags & line_buf::LFT_INTEGER) &&
             (cr->flags & line_buf::LFT_INTEGER) &&
             (r->flags & line_buf::LFT_INTEGER) &&
             (g->flags & line_buf::LFT_INTEGER) &&
             (b->flags & line_buf::LFT_INTEGER));

      if (y->flags & line_buf::LFT_32BIT)
      {
        assert((y->flags & line_buf::LFT_32BIT) &&
               (cb->flags & line_buf::LFT_32BIT) &&
               (cr->flags & line_buf::LFT_32BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
        for (int i = (repeat + 7) >> 3; i > 0; --i)
        {
          __m256i mr = _mm256_load_si256((__m256i*)rp);
          __m256i mg = _mm256_load_si256((__m256i*)gp);
          __m256i mb = _mm256_load_si256((__m256i*)bp);
          __m256i t = _mm256_add_epi32(mr, mb);
          t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1));
          _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2));
          t = _mm256_sub_epi32(mb, mg);
          _mm256_store_si256((__m256i*)cbp, t);
          t = _mm256_sub_epi32(mr, mg);
          _mm256_store_si256((__m256i*)crp, t);

          rp += 8; gp += 8; bp += 8;
          yp += 8; cbp += 8; crp += 8;
        }
      }
      else
      {
        assert((y->flags & line_buf::LFT_64BIT) &&
               (cb->flags & line_buf::LFT_64BIT) &&
               (cr->flags & line_buf::LFT_64BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2));
        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
        for (int i = (repeat + 7) >> 3; i > 0; --i)
        {
          __m256i mr32 = _mm256_load_si256((__m256i*)rp);
          __m256i mg32 = _mm256_load_si256((__m256i*)gp);
          __m256i mb32 = _mm256_load_si256((__m256i*)bp);
          __m256i mr, mg, mb, t;
          mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0));
          mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0));
          mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0));

          t = _mm256_add_epi64(mr, mb);
          t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1));
          _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
          t = _mm256_sub_epi64(mb, mg);
          _mm256_store_si256((__m256i*)cbp, t);
          t = _mm256_sub_epi64(mr, mg);
          _mm256_store_si256((__m256i*)crp, t);

          yp += 4; cbp += 4; crp += 4;

          mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1));
          mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1));
          mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1));

          t = _mm256_add_epi64(mr, mb);
          t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1));
          _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
          t = _mm256_sub_epi64(mb, mg);
          _mm256_store_si256((__m256i*)cbp, t);
          t = _mm256_sub_epi64(mr, mg);
          _mm256_store_si256((__m256i*)crp, t);

          rp += 8; gp += 8; bp += 8;
          yp += 4; cbp += 4; crp += 4;
        }
      }
    }
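    // For reference (comment not in the original source): the code above is
    // the reversible color transform (RCT) of JPEG 2000 Part 1,
    //   Y  = (R + 2 * G + B) >> 2,  Cb = B - G,  Cr = R - G,
    // computed with 64-bit intermediates when the output line buffers are
    // 64 bit.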

    //////////////////////////////////////////////////////////////////////////
    void avx2_rct_backward(const line_buf *y,
                           const line_buf *cb,
                           const line_buf *cr,
                           line_buf *r, line_buf *g, line_buf *b,
                           ui32 repeat)
    {
      assert((y->flags & line_buf::LFT_INTEGER) &&
             (cb->flags & line_buf::LFT_INTEGER) &&
             (cr->flags & line_buf::LFT_INTEGER) &&
             (r->flags & line_buf::LFT_INTEGER) &&
             (g->flags & line_buf::LFT_INTEGER) &&
             (b->flags & line_buf::LFT_INTEGER));

      if (y->flags & line_buf::LFT_32BIT)
      {
        assert((y->flags & line_buf::LFT_32BIT) &&
               (cb->flags & line_buf::LFT_32BIT) &&
               (cr->flags & line_buf::LFT_32BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        for (int i = (repeat + 7) >> 3; i > 0; --i)
        {
          __m256i my = _mm256_load_si256((__m256i*)yp);
          __m256i mcb = _mm256_load_si256((__m256i*)cbp);
          __m256i mcr = _mm256_load_si256((__m256i*)crp);

          __m256i t = _mm256_add_epi32(mcb, mcr);
          t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2));
          _mm256_store_si256((__m256i*)gp, t);
          __m256i u = _mm256_add_epi32(mcb, t);
          _mm256_store_si256((__m256i*)bp, u);
          u = _mm256_add_epi32(mcr, t);
          _mm256_store_si256((__m256i*)rp, u);

          yp += 8; cbp += 8; crp += 8;
          rp += 8; gp += 8; bp += 8;
        }
      }
      else
      {
        assert((y->flags & line_buf::LFT_64BIT) &&
               (cb->flags & line_buf::LFT_64BIT) &&
               (cr->flags & line_buf::LFT_64BIT) &&
               (r->flags & line_buf::LFT_32BIT) &&
               (g->flags & line_buf::LFT_32BIT) &&
               (b->flags & line_buf::LFT_32BIT));
        __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2));
        __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                             0, (si64)ULLONG_MAX);
        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        for (int i = (repeat + 7) >> 3; i > 0; --i)
        {
          __m256i my, mcb, mcr, tr, tg, tb;
          my = _mm256_load_si256((__m256i*)yp);
          mcb = _mm256_load_si256((__m256i*)cbp);
          mcr = _mm256_load_si256((__m256i*)crp);

          tg = _mm256_add_epi64(mcb, mcr);
          tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2));
          tb = _mm256_add_epi64(mcb, tg);
          tr = _mm256_add_epi64(mcr, tg);

          __m256i mr, mg, mb;
          mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
          mr = _mm256_and_si256(low_bits, mr);
          mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
          mg = _mm256_and_si256(low_bits, mg);
          mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
          mb = _mm256_and_si256(low_bits, mb);

          yp += 4; cbp += 4; crp += 4;

          my = _mm256_load_si256((__m256i*)yp);
          mcb = _mm256_load_si256((__m256i*)cbp);
          mcr = _mm256_load_si256((__m256i*)crp);

          tg = _mm256_add_epi64(mcb, mcr);
          tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2));
          tb = _mm256_add_epi64(mcb, tg);
          tr = _mm256_add_epi64(mcr, tg);

          tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
          tr = _mm256_andnot_si256(low_bits, tr);
          mr = _mm256_or_si256(mr, tr);
          mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0));

          tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
          tg = _mm256_andnot_si256(low_bits, tg);
          mg = _mm256_or_si256(mg, tg);
          mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0));

          tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
          tb = _mm256_andnot_si256(low_bits, tb);
          mb = _mm256_or_si256(mb, tb);
          mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0));

          _mm256_store_si256((__m256i*)rp, mr);
          _mm256_store_si256((__m256i*)gp, mg);
          _mm256_store_si256((__m256i*)bp, mb);

          yp += 4; cbp += 4; crp += 4;
          rp += 8; gp += 8; bp += 8;
        }
      }
    }
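    // For reference (comment not in the original source): the code above
    // inverts the RCT exactly,
    //   G = Y - ((Cb + Cr) >> 2),  B = Cb + G,  R = Cr + G,
    // narrowing the 64-bit intermediates back to 32-bit output samples.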

  } // namespace local
} // namespace ojph