OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_transform_avx.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_avx.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include <cstdio>
39#include <immintrin.h>
40
41#include "ojph_defs.h"
42#include "ojph_arch.h"
43#include "ojph_mem.h"
44#include "ojph_params.h"
46
47#include "ojph_transform.h"
49
50namespace ojph {
51 namespace local {
52
54 static inline void avx_multiply_const(float* p, float f, int width)
55 {
56 __m256 factor = _mm256_set1_ps(f);
57 for (; width > 0; width -= 8, p += 8)
58 {
59 __m256 s = _mm256_load_ps(p);
60 _mm256_store_ps(p, _mm256_mul_ps(factor, s));
61 }
62 }
63
65 static inline
66 void avx_deinterleave32(float* dpl, float* dph, float* sp, int width)
67 {
68 for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
69 {
70 __m256 a = _mm256_load_ps(sp);
71 __m256 b = _mm256_load_ps(sp + 8);
72 __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
73 __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
74 __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
75 __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
76 _mm256_store_ps(dpl, e);
77 _mm256_store_ps(dph, f);
78 }
79 }
80
82 static inline
83 void avx_interleave32(float* dp, float* spl, float* sph, int width)
84 {
85 for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
86 {
87 __m256 a = _mm256_load_ps(spl);
88 __m256 b = _mm256_load_ps(sph);
89 __m256 c = _mm256_unpacklo_ps(a, b);
90 __m256 d = _mm256_unpackhi_ps(a, b);
91 __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
92 __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
93 _mm256_store_ps(dp, e);
94 _mm256_store_ps(dp + 8, f);
95 }
96 }
97
99 void avx_irv_vert_step(const lifting_step* s, const line_buf* sig,
100 const line_buf* other, const line_buf* aug,
101 ui32 repeat, bool synthesis)
102 {
103 float a = s->irv.Aatk;
104 if (synthesis)
105 a = -a;
106
107 __m256 factor = _mm256_set1_ps(a);
108
109 float* dst = aug->f32;
110 const float* src1 = sig->f32, * src2 = other->f32;
111 int i = (int)repeat;
112 for ( ; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
113 {
114 __m256 s1 = _mm256_load_ps(src1);
115 __m256 s2 = _mm256_load_ps(src2);
116 __m256 d = _mm256_load_ps(dst);
117 d = _mm256_add_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
118 _mm256_store_ps(dst, d);
119 }
120 }
121
123 void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
124 {
125 avx_multiply_const(aug->f32, K, (int)repeat);
126 }
127
129 void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
130 const line_buf* hdst, const line_buf* src,
131 ui32 width, bool even)
132 {
133 if (width > 1)
134 {
135 // split src into ldst and hdst
136 {
137 float* dpl = even ? ldst->f32 : hdst->f32;
138 float* dph = even ? hdst->f32 : ldst->f32;
139 float* sp = src->f32;
140 int w = (int)width;
141 avx_deinterleave32(dpl, dph, sp, w);
142 }
143
144 // the actual horizontal transform
145 float* hp = hdst->f32, * lp = ldst->f32;
146 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
147 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
148 ui32 num_steps = atk->get_num_steps();
149 for (ui32 j = num_steps; j > 0; --j)
150 {
151 const lifting_step* s = atk->get_step(j - 1);
152 const float a = s->irv.Aatk;
153
154 // extension
155 lp[-1] = lp[0];
156 lp[l_width] = lp[l_width - 1];
157 // lifting step
158 const float* sp = lp;
159 float* dp = hp;
160 int i = (int)h_width;
161 __m256 f = _mm256_set1_ps(a);
162 if (even)
163 {
164 for (; i > 0; i -= 8, sp += 8, dp += 8)
165 {
166 __m256 m = _mm256_load_ps(sp);
167 __m256 n = _mm256_loadu_ps(sp + 1);
168 __m256 p = _mm256_load_ps(dp);
169 p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
170 _mm256_store_ps(dp, p);
171 }
172 }
173 else
174 {
175 for (; i > 0; i -= 8, sp += 8, dp += 8)
176 {
177 __m256 m = _mm256_load_ps(sp);
178 __m256 n = _mm256_loadu_ps(sp - 1);
179 __m256 p = _mm256_load_ps(dp);
180 p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
181 _mm256_store_ps(dp, p);
182 }
183 }
184
185 // swap buffers
186 float* t = lp; lp = hp; hp = t;
187 even = !even;
188 ui32 w = l_width; l_width = h_width; h_width = w;
189 }
190
191 { // multiply by K or 1/K
192 float K = atk->get_K();
193 float K_inv = 1.0f / K;
194 avx_multiply_const(lp, K_inv, (int)l_width);
195 avx_multiply_const(hp, K, (int)h_width);
196 }
197 }
198 else {
199 if (even)
200 ldst->f32[0] = src->f32[0];
201 else
202 hdst->f32[0] = src->f32[0] * 2.0f;
203 }
204 }
205
207 void avx_irv_horz_syn(const param_atk* atk, const line_buf* dst,
208 const line_buf* lsrc, const line_buf* hsrc,
209 ui32 width, bool even)
210 {
211 if (width > 1)
212 {
213 bool ev = even;
214 float* oth = hsrc->f32, * aug = lsrc->f32;
215 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
216 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
217
218 { // multiply by K or 1/K
219 float K = atk->get_K();
220 float K_inv = 1.0f / K;
221 avx_multiply_const(aug, K, (int)aug_width);
222 avx_multiply_const(oth, K_inv, (int)oth_width);
223 }
224
225 // the actual horizontal transform
226 ui32 num_steps = atk->get_num_steps();
227 for (ui32 j = 0; j < num_steps; ++j)
228 {
229 const lifting_step* s = atk->get_step(j);
230 const float a = s->irv.Aatk;
231
232 // extension
233 oth[-1] = oth[0];
234 oth[oth_width] = oth[oth_width - 1];
235 // lifting step
236 const float* sp = oth;
237 float* dp = aug;
238 int i = (int)aug_width;
239 __m256 f = _mm256_set1_ps(a);
240 if (ev)
241 {
242 for (; i > 0; i -= 8, sp += 8, dp += 8)
243 {
244 __m256 m = _mm256_load_ps(sp);
245 __m256 n = _mm256_loadu_ps(sp - 1);
246 __m256 p = _mm256_load_ps(dp);
247 p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
248 _mm256_store_ps(dp, p);
249 }
250 }
251 else
252 {
253 for (; i > 0; i -= 8, sp += 8, dp += 8)
254 {
255 __m256 m = _mm256_load_ps(sp);
256 __m256 n = _mm256_loadu_ps(sp + 1);
257 __m256 p = _mm256_load_ps(dp);
258 p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
259 _mm256_store_ps(dp, p);
260 }
261 }
262
263 // swap buffers
264 float* t = aug; aug = oth; oth = t;
265 ev = !ev;
266 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
267 }
268
269 // combine both lsrc and hsrc into dst
270 {
271 float* dp = dst->f32;
272 float* spl = even ? lsrc->f32 : hsrc->f32;
273 float* sph = even ? hsrc->f32 : lsrc->f32;
274 int w = (int)width;
275 avx_interleave32(dp, spl, sph, w);
276 }
277 }
278 else {
279 if (even)
280 dst->f32[0] = lsrc->f32[0];
281 else
282 dst->f32[0] = hsrc->f32[0] * 0.5f;
283 }
284 }
285
286 } // !local
287} // !ojph
float * f32
Definition ojph_mem.h:162
static void avx_multiply_const(float *p, float f, int width)
static void avx_deinterleave32(float *dpl, float *dph, float *sp, int width)
void avx_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
static void avx_interleave32(float *dp, float *spl, float *sph, int width)
void avx_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
uint32_t ui32
Definition ojph_defs.h:54
const lifting_step * get_step(ui32 s) const