#ifndef REG_SAD_POW2_WIDTHS_SSE41_H_
#define REG_SAD_POW2_WIDTHS_SSE41_H_
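/* The kernels below rely on Kvazaar's INLINE macro and on SSE4.1 intrinsics;
 * missing-intel-intrinsics.h supplies a few intrinsics (such as
 * _mm_bsrli_si128) that some older toolchains lack. Adjust the include paths
 * to the surrounding source tree if needed. */
#include "kvazaar.h"
#include "missing-intel-intrinsics.h"
#include <immintrin.h>

/*
 * SSE4.1 sum-of-absolute-differences (SAD) kernels for fixed block widths
 * (4, 8, 12, 16, 24) and arbitrary widths, plus variants that extrapolate
 * the reference block over vertical (ver_sad_*) or horizontal (hor_sad_*)
 * picture borders. All of them share the same reduction: _mm_sad_epu8 yields
 * two 64-bit partial sums per register, which are accumulated in sse_inc and
 * finally folded by swapping the 64-bit halves with _mm_shuffle_epi32 and
 * adding, leaving the total in the low 32 bits.
 *
 * Usage sketch: reg_sad_w16(cur, ref, 16, cur_stride, ref_stride) returns
 * the SAD of a 16x16 block whose rows are cur_stride and ref_stride bytes
 * apart in the two buffers.
 */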
static INLINE uint32_t reg_sad_w0(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  // A zero-width block contains no pixels, so its SAD is zero.
  return 0;
}

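/* reg_sad_w4: blocks 4 pixels wide. Four rows are packed into one XMM
 * register with _mm_insert_epi32, so a single _mm_sad_epu8 covers four rows;
 * leftover rows (height not a multiple of 4) are handled one at a time, with
 * the unused lanes zero in both operands so they contribute nothing. */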
static INLINE uint32_t reg_sad_w4(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(data1 + y * stride1));
    __m128i b = _mm_cvtsi32_si128(*(uint32_t *)(data2 + y * stride2));

    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 1) * stride1), 1);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 1) * stride2), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 2) * stride1), 2);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 2) * stride2), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 3) * stride1), 3);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 3) * stride2), 3);

    __m128i curr_sads = _mm_sad_epu8(a, b);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1));
      __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2));

      __m128i curr_sads = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

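/* reg_sad_w8: blocks 8 pixels wide. Two rows are packed into each XMM
 * register through _mm_loadl_pd/_mm_loadh_pd (the double type is only used
 * for its 64-bit halves; no FP arithmetic is performed), so each
 * _mm_sad_epu8 covers two rows. */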
static INLINE uint32_t reg_sad_w8(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d b_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();
    __m128d d_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(data1 + (y + 0) * stride1));
    b_d = _mm_loadl_pd(b_d, (const double *)(data2 + (y + 0) * stride2));
    a_d = _mm_loadh_pd(a_d, (const double *)(data1 + (y + 1) * stride1));
    b_d = _mm_loadh_pd(b_d, (const double *)(data2 + (y + 1) * stride2));

    c_d = _mm_loadl_pd(c_d, (const double *)(data1 + (y + 2) * stride1));
    d_d = _mm_loadl_pd(d_d, (const double *)(data2 + (y + 2) * stride2));
    c_d = _mm_loadh_pd(c_d, (const double *)(data1 + (y + 3) * stride1));
    d_d = _mm_loadh_pd(d_d, (const double *)(data2 + (y + 3) * stride2));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i b = _mm_castpd_si128(b_d);
    __m128i c = _mm_castpd_si128(c_d);
    __m128i d = _mm_castpd_si128(d_d);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(data1 + y * stride1));
      __m128i b = _mm_loadl_epi64((__m128i *)(data2 + y * stride2));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

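/* reg_sad_w12: blocks 12 pixels wide. Each row is loaded as a full 16 bytes;
 * _mm_blend_epi16 with mask 0x3f copies bytes 12..15 of the data1 row into
 * the data2 row, so those lanes compare equal and only the first 12 pixels
 * contribute to the SAD. */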
static INLINE uint32_t reg_sad_w12(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  for (y = 0; y < height; y++) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2));

    __m128i b_masked  = _mm_blend_epi16(a, b, 0x3f);
    __m128i curr_sads = _mm_sad_epu8   (a, b_masked);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

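/* reg_sad_w16: blocks 16 pixels wide, one XMM register per row. Four rows
 * are processed per iteration to keep several independent PSADBW results in
 * flight before they are accumulated. */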
static INLINE uint32_t reg_sad_w16(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
    __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
    __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));
    __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1));
    __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2));
    __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1));
    __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2));

    __m128i curr_sads_ab = _mm_sad_epu8(a, b);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d);
    __m128i curr_sads_ef = _mm_sad_epu8(e, f);
    __m128i curr_sads_gh = _mm_sad_epu8(g, h);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));

      __m128i curr_sads = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

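/* reg_sad_w24: blocks 24 pixels wide. Each row is split into a 16-byte part
 * and an 8-byte tail; the tails of two consecutive rows are packed into one
 * register, and a possible odd final row is handled separately. */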
static INLINE uint32_t reg_sad_w24(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_doublelines = height & ~1;
  const int32_t height_parity      = height &  1;

  for (y = 0; y < height_doublelines; y += 2) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
    __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
    __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));

    __m128d e_d = _mm_setzero_pd();
    __m128d f_d = _mm_setzero_pd();

    e_d = _mm_loadl_pd(e_d, (const double *)(data1 + (y + 0) * stride1 + 16));
    f_d = _mm_loadl_pd(f_d, (const double *)(data2 + (y + 0) * stride2 + 16));
    e_d = _mm_loadh_pd(e_d, (const double *)(data1 + (y + 1) * stride1 + 16));
    f_d = _mm_loadh_pd(f_d, (const double *)(data2 + (y + 1) * stride2 + 16));

    __m128i e = _mm_castpd_si128(e_d);
    __m128i f = _mm_castpd_si128(f_d);

    __m128i curr_sads_1 = _mm_sad_epu8(a, b);
    __m128i curr_sads_2 = _mm_sad_epu8(c, d);
    __m128i curr_sads_3 = _mm_sad_epu8(e, f);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
  }
  if (height_parity) {
    __m128i a = _mm_loadu_si128 ((const __m128i *)(data1 + y * stride1));
    __m128i b = _mm_loadu_si128 ((const __m128i *)(data2 + y * stride2));
    __m128i c = _mm_loadl_epi64 ((const __m128i *)(data1 + y * stride1 + 16));
    __m128i d = _mm_loadl_epi64 ((const __m128i *)(data2 + y * stride2 + 16));

    __m128i curr_sads_1 = _mm_sad_epu8(a, b);
    __m128i curr_sads_2 = _mm_sad_epu8(c, d);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

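/* reg_sad_arbitrary: any block width. Full 16-pixel columns are processed
 * first; the residual column (width % 16 pixels) uses rdmask, which is set
 * only in the lanes below the residual width, to blend the unused lanes of
 * the data2 row from the data1 row so they add nothing to the SAD. */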
static INLINE uint32_t reg_sad_arbitrary(const uint8_t * const data1, const uint8_t * const data2,
                                         const int32_t width, const int32_t height,
                                         const uint32_t stride1, const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t x, y;

  const int32_t width_xmms            = width & ~15;
  const int32_t width_residual_pixels = width &  15;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rds    = _mm_set1_epi8 (width_residual_pixels);
  const __m128i ns     = _mm_setr_epi8 (0, 1, 2,  3,  4,  5,  6,  7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  for (x = 0; x < width_xmms; x += 16) {
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x));
      __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x));
      __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x));
      __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      __m128i curr_sads_cd = _mm_sad_epu8(c, d);
      __m128i curr_sads_ef = _mm_sad_epu8(e, f);
      __m128i curr_sads_gh = _mm_sad_epu8(g, h);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x));
        __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x));

        __m128i curr_sads = _mm_sad_epu8(a, b);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }

  if (width_residual_pixels) {
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x));
      __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x));
      __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x));
      __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x));

      __m128i b_masked = _mm_blendv_epi8(a, b, rdmask);
      __m128i d_masked = _mm_blendv_epi8(c, d, rdmask);
      __m128i f_masked = _mm_blendv_epi8(e, f, rdmask);
      __m128i h_masked = _mm_blendv_epi8(g, h, rdmask);

      __m128i curr_sads_ab = _mm_sad_epu8 (a, b_masked);
      __m128i curr_sads_cd = _mm_sad_epu8 (c, d_masked);
      __m128i curr_sads_ef = _mm_sad_epu8 (e, f_masked);
      __m128i curr_sads_gh = _mm_sad_epu8 (g, h_masked);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x));
        __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x));

        __m128i b_masked  = _mm_blendv_epi8(a, b, rdmask);
        __m128i curr_sads = _mm_sad_epu8   (a, b_masked);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

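/* ver_sad_*: SAD between height rows of pic_data and a single row of
 * ref_data, i.e. the reference row is repeated vertically, as is the case
 * when the reference block is extrapolated from one border row. */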
static uint32_t ver_sad_w4(const uint8_t *pic_data, const uint8_t *ref_data,
                           int32_t height, uint32_t stride)
{
  __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(pic_data + y * stride));

    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * stride), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3);

    __m128i curr_sads = _mm_sad_epu8(a, ref_row);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    // Keep only the low dword of the reference: the leftover rows are loaded
    // zero-extended, so nonzero bytes in the upper reference lanes would add
    // spurious differences to the SAD.
    ref_row = _mm_cvtsi32_si128(*(const uint32_t *)ref_data);

    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));

      __m128i curr_sads = _mm_sad_epu8(a, ref_row);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static uint32_t ver_sad_w8(const uint8_t *pic_data, const uint8_t *ref_data,
                           int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride));
    a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride));

    c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride));
    c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i c = _mm_castpd_si128(c_d);

    __m128i curr_sads_ab = _mm_sad_epu8(a, ref_row);
    __m128i curr_sads_cd = _mm_sad_epu8(c, ref_row);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    __m128i b = _mm_move_epi64(ref_row);

    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * stride));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static uint32_t ver_sad_w12(const uint8_t *pic_data, const uint8_t *ref_data,
                            int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  for (y = 0; y < height; y++) {
    __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride));

    __m128i a_masked  = _mm_blend_epi16(ref_row, a, 0x3f);
    __m128i curr_sads = _mm_sad_epu8   (ref_row, a_masked);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

static uint32_t ver_sad_w16(const uint8_t *pic_data, const uint8_t *ref_data,
                            int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
    __m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride));
    __m128i pic_row_3 = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * stride));
    __m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride));

    __m128i curr_sads_1 = _mm_sad_epu8 (pic_row_1, ref_row);
    __m128i curr_sads_2 = _mm_sad_epu8 (pic_row_2, ref_row);
    __m128i curr_sads_3 = _mm_sad_epu8 (pic_row_3, ref_row);
    __m128i curr_sads_4 = _mm_sad_epu8 (pic_row_4, ref_row);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_4);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i pic_row   = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
      __m128i curr_sads = _mm_sad_epu8   (pic_row, ref_row);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static uint32_t ver_sad_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
                                  int32_t width, int32_t height, uint32_t stride)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t x, y;

  const int32_t width_xmms            = width & ~15;
  const int32_t width_residual_pixels = width &  15;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rds    = _mm_set1_epi8 (width_residual_pixels);
  const __m128i ns     = _mm_setr_epi8 (0, 1, 2,  3,  4,  5,  6,  7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  for (x = 0; x < width_xmms; x += 16) {
    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));

      __m128i curr_sads_ab = _mm_sad_epu8(ref_row, a);
      __m128i curr_sads_cd = _mm_sad_epu8(ref_row, c);
      __m128i curr_sads_ef = _mm_sad_epu8(ref_row, e);
      __m128i curr_sads_gh = _mm_sad_epu8(ref_row, g);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));

        __m128i curr_sads = _mm_sad_epu8(a, ref_row);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }

  if (width_residual_pixels) {
    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));

      __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
      __m128i c_masked = _mm_blendv_epi8(ref_row, c, rdmask);
      __m128i e_masked = _mm_blendv_epi8(ref_row, e, rdmask);
      __m128i g_masked = _mm_blendv_epi8(ref_row, g, rdmask);

      __m128i curr_sads_ab = _mm_sad_epu8 (ref_row, a_masked);
      __m128i curr_sads_cd = _mm_sad_epu8 (ref_row, c_masked);
      __m128i curr_sads_ef = _mm_sad_epu8 (ref_row, e_masked);
      __m128i curr_sads_gh = _mm_sad_epu8 (ref_row, g_masked);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));

        __m128i a_masked  = _mm_blendv_epi8(ref_row, a, rdmask);
        __m128i curr_sads = _mm_sad_epu8   (ref_row, a_masked);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

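/* hor_sad_*: SAD against a reference block that hangs over the left or right
 * picture border by `left` or `right` pixels. A byte-shuffle mask (epol_mask)
 * clamps each lane's source index to the valid part of the row, so pixels
 * outside the border are replaced with the nearest border pixel. In the w4
 * variant below, dwbaseids gives each of the four packed rows its own dword
 * base so the clamping never crosses row boundaries. */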
static uint32_t hor_sad_sse41_w4(const uint8_t *pic_data, const uint8_t *ref_data,
                                 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                 uint32_t left, uint32_t right)
{
  const int32_t right_border_idx = 3 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                   8, 9, 10, 11, 12, 13, 14, 15);

  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  const __m128i dwbaseids = _mm_setr_epi8(0, 0, 0, 0,  4,  4,  4,  4,
                                          8, 8, 8, 8, 12, 12, 12, 12);

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  right_border_idxs         = _mm_add_epi8 (right_border_idxs, dwbaseids);

  __m128i mask_right        = _mm_min_epi8 (ns,         right_border_idxs);
  __m128i mask1             = _mm_sub_epi8 (mask_right, left_128);

  const __m128i epol_mask = _mm_max_epi8(mask1, dwbaseids);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride));
    __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff));

    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * pic_stride),           1);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 1) * ref_stride + leftoff), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * pic_stride),           2);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 2) * ref_stride + leftoff), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * pic_stride),           3);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 3) * ref_stride + leftoff), 3);

    __m128i b_epol    = _mm_shuffle_epi8(b, epol_mask);
    __m128i curr_sads = _mm_sad_epu8    (a, b_epol);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride));
      __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff));

      __m128i b_epol    = _mm_shuffle_epi8(b, epol_mask);
      __m128i curr_sads = _mm_sad_epu8    (a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

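/* hor_sad_sse41_w8: same border extrapolation as the w4 variant, but with
 * two 8-byte rows per register; qwbaseids gives each register half its own
 * base index so the clamped shuffle stays inside the correct row. */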
static uint32_t hor_sad_sse41_w8(const uint8_t *pic_data, const uint8_t *ref_data,
                                 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                 uint32_t left, uint32_t right)
{
  const int32_t right_border_idx = 7 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                   8, 9, 10, 11, 12, 13, 14, 15);

  const __m128i qwbaseids = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                          8, 8, 8, 8, 8, 8, 8, 8);

  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  right_border_idxs         = _mm_add_epi8 (right_border_idxs, qwbaseids);

  __m128i mask_right        = _mm_min_epi8 (ns,         right_border_idxs);
  __m128i mask1             = _mm_sub_epi8 (mask_right, left_128);

  const __m128i epol_mask = _mm_max_epi8(mask1, qwbaseids);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d b_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();
    __m128d d_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * pic_stride));
    b_d = _mm_loadl_pd(b_d, (const double *)(ref_data + (y + 0) * ref_stride + leftoff));
    a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * pic_stride));
    b_d = _mm_loadh_pd(b_d, (const double *)(ref_data + (y + 1) * ref_stride + leftoff));

    c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * pic_stride));
    d_d = _mm_loadl_pd(d_d, (const double *)(ref_data + (y + 2) * ref_stride + leftoff));
    c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * pic_stride));
    d_d = _mm_loadh_pd(d_d, (const double *)(ref_data + (y + 3) * ref_stride + leftoff));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i b = _mm_castpd_si128(b_d);
    __m128i c = _mm_castpd_si128(c_d);
    __m128i d = _mm_castpd_si128(d_d);

    __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);
    __m128i d_epol = _mm_shuffle_epi8(d, epol_mask);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * pic_stride));
      __m128i b = _mm_loadl_epi64((__m128i *)(ref_data + y * ref_stride + leftoff));

      __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);

      __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

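/* hor_sad_sse41_w16: one full register per row, so the extrapolation mask is
 * clamped against plain byte indexes 0..15 and no per-row base ids are
 * needed. */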
static uint32_t hor_sad_sse41_w16(const uint8_t *pic_data, const uint8_t *ref_data,
                                  int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                  const uint32_t left, const uint32_t right)
{
  const int32_t right_border_idx = 15 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns   = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                     8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i zero = _mm_setzero_si128();

  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  __m128i mask_right        = _mm_min_epi8 (ns,         right_border_idxs);
  __m128i mask1             = _mm_sub_epi8 (mask_right, left_128);

  const __m128i epol_mask = _mm_max_epi8(mask1, zero);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride));
    __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff));
    __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride));
    __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + leftoff));
    __m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * pic_stride));
    __m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 2) * ref_stride + leftoff));
    __m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * pic_stride));
    __m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 3) * ref_stride + leftoff));

    __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);
    __m128i d_epol = _mm_shuffle_epi8(d, epol_mask);
    __m128i f_epol = _mm_shuffle_epi8(f, epol_mask);
    __m128i h_epol = _mm_shuffle_epi8(h, epol_mask);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol);
    __m128i curr_sads_ef = _mm_sad_epu8(e, f_epol);
    __m128i curr_sads_gh = _mm_sad_epu8(g, h_epol);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride));
      __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff));

      __m128i b_epol    = _mm_shuffle_epi8(b, epol_mask);
      __m128i curr_sads = _mm_sad_epu8(a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

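/* hor_sad_sse41_arbitrary: border extrapolation for any block width. Rows
 * are processed in 16-byte vectors: "inside" vectors are read from the valid
 * part of the reference row and aligned with shufmask1 (bytes shifted out of
 * the previous vector are carried over in the old_* registers), while
 * "outside" vectors, which lie wholly in the extrapolated region, use the
 * replicated border pixel. Lanes beyond the block width are neutralized with
 * unrd_mask before the SAD is taken. */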
static INLINE uint32_t hor_sad_sse41_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
                                               int32_t width, int32_t height, uint32_t pic_stride,
                                               uint32_t ref_stride, uint32_t left, uint32_t right)
{
  __m128i sse_inc = _mm_setzero_si128();

  const size_t vec_width      = 16;
  const size_t vecwid_bitmask = 15;
  const size_t vec_width_log2 = 4;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rights     = _mm_set1_epi8((uint8_t)right);
  const __m128i blk_widths = _mm_set1_epi8((uint8_t)width);
  const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width);
  const __m128i nslo       = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

  uint32_t outside_vecs, inside_vecs, left_offset, is_left_bm;
  int32_t  outside_width, inside_width, border_off, invec_lstart,
           invec_lend, invec_linc;

  // When clipping the left border, the extrapolated (outside) vectors sit at
  // the start of the row and the inside vectors are walked forward; when
  // clipping the right border, the outside vectors sit at the end and the
  // inside vectors are walked backward.
  if (left) {
    outside_vecs  = left >> vec_width_log2;
    inside_vecs   = (( width + vecwid_bitmask) >> vec_width_log2) - outside_vecs;
    outside_width = outside_vecs * vec_width;
    inside_width  = inside_vecs  * vec_width;
    left_offset   = left;
    border_off    = left;
    invec_lstart  = 0;
    invec_lend    = inside_vecs;
    invec_linc    = 1;
    is_left_bm    = -1;
  } else {
    inside_vecs   = ((width - right) + vecwid_bitmask) >> vec_width_log2;
    outside_vecs  = (( width         + vecwid_bitmask) >> vec_width_log2) - inside_vecs;
    outside_width = outside_vecs * vec_width;
    inside_width  = inside_vecs  * vec_width;
    left_offset   = right - width;
    border_off    = width - 1 - right;
    invec_lstart  = inside_vecs - 1;
    invec_lend    = -1;
    invec_linc    = -1;
    is_left_bm    = 0;
  }
  left_offset &= vecwid_bitmask;

  const __m128i left_offsets = _mm_set1_epi8 ((uint8_t)left_offset);
  const __m128i is_left      = _mm_cmpeq_epi8(rights, _mm_setzero_si128());
  const __m128i vw_for_left  = _mm_and_si128 (is_left, vec_widths);

  const __m128i offs_neg     = _mm_xor_si128 (left_offsets, is_left);
  const __m128i offs_for_sm1 = _mm_sub_epi8  (offs_neg,     is_left);

  const __m128i ns_for_sm1   = _mm_or_si128  (vw_for_left,  nslo);
  const __m128i shufmask1    = _mm_add_epi8  (ns_for_sm1,   offs_for_sm1);

  const __m128i mo2bmask_l   = _mm_cmpgt_epi8(left_offsets, nslo);
  const __m128i mo2bimask_l  = _mm_cmpeq_epi8(mo2bmask_l,   _mm_setzero_si128());
  const __m128i mo2bimask_r  = _mm_cmpgt_epi8(vec_widths,   shufmask1);
  const __m128i move_old_to_b_imask = _mm_blendv_epi8(mo2bimask_r, mo2bimask_l, is_left);

  const int32_t outvec_offset = (~is_left_bm) & inside_width;
  int32_t x, y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i borderpx_vec_b = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]);
    __m128i borderpx_vec_d = _mm_set1_epi8(ref_data[(int32_t)((y + 1) * ref_stride + border_off)]);
    __m128i borderpx_vec_f = _mm_set1_epi8(ref_data[(int32_t)((y + 2) * ref_stride + border_off)]);
    __m128i borderpx_vec_h = _mm_set1_epi8(ref_data[(int32_t)((y + 3) * ref_stride + border_off)]);

    for (x = 0; x < outside_vecs; x++) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset));
      __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + outvec_offset));
      __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + outvec_offset));
      __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + outvec_offset));

      __m128i startoffs = _mm_set1_epi8 ((x + inside_vecs) << vec_width_log2);
      __m128i ns        = _mm_add_epi8  (startoffs, nslo);

      __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns);
      unrd_imask         = _mm_or_si128   (unrd_imask, is_left);
      __m128i unrd_mask  = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128());

      __m128i b_unread = _mm_blendv_epi8(borderpx_vec_b, a, unrd_mask);
      __m128i d_unread = _mm_blendv_epi8(borderpx_vec_d, c, unrd_mask);
      __m128i f_unread = _mm_blendv_epi8(borderpx_vec_f, e, unrd_mask);
      __m128i h_unread = _mm_blendv_epi8(borderpx_vec_h, g, unrd_mask);

      __m128i sad_ab = _mm_sad_epu8 (a, b_unread);
      __m128i sad_cd = _mm_sad_epu8 (c, d_unread);
      __m128i sad_ef = _mm_sad_epu8 (e, f_unread);
      __m128i sad_gh = _mm_sad_epu8 (g, h_unread);

      sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      sse_inc = _mm_add_epi64(sse_inc, sad_cd);
      sse_inc = _mm_add_epi64(sse_inc, sad_ef);
      sse_inc = _mm_add_epi64(sse_inc, sad_gh);
    }
    int32_t a_off = outside_width & is_left_bm;
    int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm;

    __m128i old_b = borderpx_vec_b;
    __m128i old_d = borderpx_vec_d;
    __m128i old_f = borderpx_vec_f;
    __m128i old_h = borderpx_vec_h;

    for (x = invec_lstart; x != invec_lend; x += invec_linc) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
      __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + a_off));
      __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + a_off));
      __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + a_off));
      __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i d = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 1) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i f = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 2) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i h = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 3) * ref_stride + a_off - leftoff_with_sign_neg));

      __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1);
      __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1);
      __m128i f_shifted = _mm_shuffle_epi8(f, shufmask1);
      __m128i h_shifted = _mm_shuffle_epi8(h, shufmask1);

      __m128i b_with_old = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask);
      __m128i d_with_old = _mm_blendv_epi8 (old_d, d_shifted, move_old_to_b_imask);
      __m128i f_with_old = _mm_blendv_epi8 (old_f, f_shifted, move_old_to_b_imask);
      __m128i h_with_old = _mm_blendv_epi8 (old_h, h_shifted, move_old_to_b_imask);

      uint8_t startoff   = (x << vec_width_log2) + a_off;
      __m128i startoffs  = _mm_set1_epi8 (startoff);
      __m128i curr_ns    = _mm_add_epi8  (startoffs, nslo);
      __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, curr_ns);
      __m128i unrd_mask  = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128());

      __m128i b_unread = _mm_blendv_epi8 (b_with_old, a, unrd_mask);
      __m128i d_unread = _mm_blendv_epi8 (d_with_old, c, unrd_mask);
      __m128i f_unread = _mm_blendv_epi8 (f_with_old, e, unrd_mask);
      __m128i h_unread = _mm_blendv_epi8 (h_with_old, g, unrd_mask);

      // Carry the shifted reference bytes over to the next inside vector.
      old_b = b_shifted;
      old_d = d_shifted;
      old_f = f_shifted;
      old_h = h_shifted;

      __m128i sad_ab = _mm_sad_epu8(a, b_unread);
      __m128i sad_cd = _mm_sad_epu8(c, d_unread);
      __m128i sad_ef = _mm_sad_epu8(e, f_unread);
      __m128i sad_gh = _mm_sad_epu8(g, h_unread);

      sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      sse_inc = _mm_add_epi64(sse_inc, sad_cd);
      sse_inc = _mm_add_epi64(sse_inc, sad_ef);
      sse_inc = _mm_add_epi64(sse_inc, sad_gh);
    }
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i borderpx_vec = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]);
      for (x = 0; x < outside_vecs; x++) {
        __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset));

        __m128i startoffs = _mm_set1_epi8 ((x + inside_vecs) << vec_width_log2);
        __m128i ns        = _mm_add_epi8  (startoffs, nslo);

        __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns);
        unrd_imask         = _mm_or_si128   (unrd_imask, is_left);
        __m128i unrd_mask  = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128());
        __m128i b_unread   = _mm_blendv_epi8(borderpx_vec, a, unrd_mask);

        __m128i sad_ab = _mm_sad_epu8 (a, b_unread);
        sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      }
      int32_t a_off = outside_width & is_left_bm;
      int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm;

      __m128i old_b = borderpx_vec;
      for (x = invec_lstart; x != invec_lend; x += invec_linc) {
        __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
        __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg));

        __m128i b_shifted  = _mm_shuffle_epi8(b, shufmask1);
        __m128i b_with_old = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask);

        uint8_t startoff   = (x << vec_width_log2) + a_off;
        __m128i startoffs  = _mm_set1_epi8 (startoff);
        __m128i curr_ns    = _mm_add_epi8  (startoffs, nslo);
        __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, curr_ns);
        __m128i unrd_mask  = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128());
        __m128i b_unread   = _mm_blendv_epi8 (b_with_old, a, unrd_mask);

        old_b = b_shifted;

        __m128i sad_ab = _mm_sad_epu8(a, b_unread);
        sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64   (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

#endif // REG_SAD_POW2_WIDTHS_SSE41_H_