// Bitwise-OR reduction of the 32 bytes starting at `address`, treated as
// eight 32-bit lanes; the combined value ends up in `t`.
// NOTE(review): the enclosing signature and whatever consumes `t` are not
// visible here.
// Two unaligned 128-bit loads: bytes 0..15 and bytes 16..31.
48 __m128i x0 = _mm_loadu_si128((__m128i*)address);
49 __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
50 x0 = _mm_or_si128(x0, x1);
// 0xEE selects lanes [2,3,2,3]: fold the upper two lanes onto the lower two.
51 x1 = _mm_shuffle_epi32(x0, 0xEE);
52 x0 = _mm_or_si128(x0, x1);
// 0x55 broadcasts lane 1: fold it onto lane 0.
53 x1 = _mm_shuffle_epi32(x0, 0x55);
54 x0 = _mm_or_si128(x0, x1);
// Lane 0 now holds the OR of all eight input lanes.
55 ui32 t = (
ui32)_mm_extract_epi32(x0, 0);
// Bitwise-OR reduction of the 32 bytes starting at `address`, treated as
// four 64-bit lanes; the combined value ends up in `t`.
// NOTE(review): the enclosing signature and whatever consumes `t` are not
// visible here.
// Two unaligned 128-bit loads: bytes 0..15 and bytes 16..31.
62 __m128i x0 = _mm_loadu_si128((__m128i*)address);
63 __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
64 x0 = _mm_or_si128(x0, x1);
// 0xEE selects 32-bit lanes [2,3,2,3], i.e. copies the upper 64 bits onto
// the lower 64 bits so the final OR folds both halves together.
65 x1 = _mm_shuffle_epi32(x0, 0xEE);
66 x0 = _mm_or_si128(x0, x1);
// The low 64-bit lane now holds the OR of all four input lanes.
67 ui64 t = (
ui64)_mm_extract_epi64(x0, 0);
73 float delta_inv,
ui32 count,
ui32* max_val)
// Converts 32-bit two's-complement integers read from `sp` into a
// sign-magnitude layout written to `dp`: bit 31 keeps the sign and the
// absolute value is shifted left by (31 - K_max). The shifted magnitudes are
// OR-ed lane by lane into the 32 bytes at `max_val`, which are loaded
// before the loop and stored back after it.
// NOTE(review): `sp`, `dp` and `K_max` are declared outside the lines shown
// (presumably parameters); `delta_inv` is not referenced in the visible lines.
// NOTE(review): assumes K_max <= 31, otherwise the unsigned subtraction
// wraps — confirm with callers.
78 ui32 shift = 31 - K_max;
// INT_MIN has only bit 31 set: mask that isolates the sign bit.
79 __m256i m0 = _mm256_set1_epi32(INT_MIN);
80 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
81 __m256i *p = (__m256i*)sp;
// Each iteration handles 8 lanes regardless of how close i is to count, so
// a count that is not a multiple of 8 touches lanes past `count` —
// presumably the buffers are padded; confirm with callers.
82 for (
ui32 i = 0; i < count; i += 8, p += 1, dp += 8)
84 __m256i v = _mm256_loadu_si256(p);
85 __m256i sign = _mm256_and_si256(v, m0);
86 __m256i val = _mm256_abs_epi32(v);
87 val = _mm256_slli_epi32(val, (
int)shift);
// Accumulate before the sign bit is merged, so tmax only sees magnitudes.
88 tmax = _mm256_or_si256(tmax, val);
89 val = _mm256_or_si256(val, sign);
90 _mm256_storeu_si256((__m256i*)dp, val);
92 _mm256_storeu_si256((__m256i*)max_val, tmax);
97 float delta_inv,
ui32 count,
ui32* max_val)
// Scales floats read from `sp` by `delta_inv`, converts them to 32-bit
// integers, and writes them to `dp` in sign-magnitude layout (bit 31 is the
// sign, remaining bits the absolute value). The magnitudes are OR-ed lane
// by lane into the 32 bytes at `max_val`, which are loaded before the loop
// and stored back after it.
// NOTE(review): `sp` and `dp` are declared outside the lines shown
// (presumably parameters).
102 __m256 d = _mm256_set1_ps(delta_inv);
// INT_MIN has only bit 31 set: mask that isolates the sign bit.
103 __m256i m0 = _mm256_set1_epi32(INT_MIN);
104 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
105 float *p = (
float*)sp;
// Each iteration handles 8 lanes regardless of how close i is to count, so
// a count that is not a multiple of 8 touches lanes past `count` —
// presumably the buffers are padded; confirm with callers.
107 for (
ui32 i = 0; i < count; i += 8, p += 8, dp += 8)
109 __m256 vf = _mm256_loadu_ps(p);
110 vf = _mm256_mul_ps(vf, d);
// Float -> int32 conversion rounds per the current MXCSR rounding mode
// (round-to-nearest-even unless it has been changed).
111 __m256i val = _mm256_cvtps_epi32(vf);
112 __m256i sign = _mm256_and_si256(val, m0);
113 val = _mm256_abs_epi32(val);
// Accumulate before the sign bit is merged, so tmax only sees magnitudes.
114 tmax = _mm256_or_si256(tmax, val);
115 val = _mm256_or_si256(val, sign);
116 _mm256_storeu_si256((__m256i*)dp, val);
118 _mm256_storeu_si256((__m256i*)max_val, tmax);
123 float delta,
ui32 count)
// Converts 32-bit sign-magnitude values read from `sp` (bit 31 is the sign,
// magnitude in the remaining bits) back to two's-complement integers: the
// magnitude is shifted right by (31 - K_max) and negated where the sign bit
// was set. Results are stored through `p`.
// NOTE(review): `sp`, `p` and `K_max` are declared outside the lines shown;
// `delta` is not referenced in the visible lines.
// NOTE(review): assumes K_max <= 31, otherwise the unsigned subtraction
// wraps — confirm with callers.
126 ui32 shift = 31 - K_max;
// INT_MAX has every bit except bit 31 set: mask that keeps the magnitude.
127 __m256i m1 = _mm256_set1_epi32(INT_MAX);
// Each iteration handles 8 lanes regardless of how close i is to count, so
// a count that is not a multiple of 8 touches lanes past `count` —
// presumably the buffers are padded; confirm with callers.
129 for (
ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
// Aligned load: `sp` must be 32-byte aligned on every iteration.
131 __m256i v = _mm256_load_si256((__m256i*)sp);
132 __m256i val = _mm256_and_si256(v, m1);
133 val = _mm256_srli_epi32(val, (
int)shift);
// Negates val in lanes where v is negative (sign bit set); lanes where v is
// zero already have val == 0.
134 val = _mm256_sign_epi32(val, v);
135 _mm256_storeu_si256((__m256i*)p, val);
141 float delta,
ui32 count)
// Converts 32-bit sign-magnitude values read from `sp` (bit 31 is the sign,
// magnitude in the remaining bits) to floats: the magnitude is converted to
// float, multiplied by `delta`, and the original sign bit is copied into
// the float's sign bit. Results are written to `dp` as floats.
// NOTE(review): `sp` and `dp` are declared outside the lines shown
// (presumably parameters).
// INT_MAX has every bit except bit 31 set: mask that keeps the magnitude.
144 __m256i m1 = _mm256_set1_epi32(INT_MAX);
145 __m256 d = _mm256_set1_ps(delta);
146 float *p = (
float*)dp;
// Each iteration handles 8 lanes regardless of how close i is to count, so
// a count that is not a multiple of 8 touches lanes past `count` —
// presumably the buffers are padded; confirm with callers.
147 for (
ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
// Aligned load: `sp` must be 32-byte aligned on every iteration.
149 __m256i v = _mm256_load_si256((__m256i*)sp);
150 __m256i vali = _mm256_and_si256(v, m1);
151 __m256 valf = _mm256_cvtepi32_ps(vali);
152 valf = _mm256_mul_ps(valf, d);
// (~m1) & v leaves only bit 31 of v, which is also the float sign bit.
153 __m256i sign = _mm256_andnot_si256(m1, v);
154 valf = _mm256_or_ps(valf, _mm256_castsi256_ps(sign));
155 _mm256_storeu_ps(p, valf);
161 float delta_inv,
ui32 count,
ui64* max_val)
// Converts 64-bit two's-complement integers read from `sp` into a
// sign-magnitude layout written to `dp`: bit 63 keeps the sign and the
// absolute value is shifted left by (63 - K_max). The shifted magnitudes are
// OR-ed lane by lane into the 32 bytes at `max_val`, which are loaded
// before the loop and stored back after it.
// NOTE(review): `sp`, `dp` and `K_max` are declared outside the lines shown
// (presumably parameters); `delta_inv` is not referenced in the visible lines.
// NOTE(review): assumes K_max <= 63, otherwise the unsigned subtraction
// wraps — confirm with callers.
166 ui32 shift = 63 - K_max;
// LLONG_MIN has only bit 63 set: mask that isolates the sign bit.
167 __m256i m0 = _mm256_set1_epi64x(LLONG_MIN);
168 __m256i zero = _mm256_setzero_si256();
169 __m256i one = _mm256_set1_epi64x(1);
170 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
171 __m256i *p = (__m256i*)sp;
// Each iteration handles 4 lanes regardless of how close i is to count, so
// a count that is not a multiple of 4 touches lanes past `count` —
// presumably the buffers are padded; confirm with callers.
172 for (
ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
174 __m256i v = _mm256_loadu_si256(p);
// All-ones in lanes where v < 0, zero elsewhere.
175 __m256i sign = _mm256_cmpgt_epi64(zero, v);
// 64-bit absolute value built by hand: in negative lanes, invert the bits
// (xor with all-ones) and add 1; other lanes pass through unchanged.
176 __m256i val = _mm256_xor_si256(v, sign);
177 __m256i ones = _mm256_and_si256(sign, one);
178 val = _mm256_add_epi64(val, ones);
// Reduce the all-ones mask to just bit 63.
179 sign = _mm256_and_si256(sign, m0);
180 val = _mm256_slli_epi64(val, (
int)shift);
// Accumulate before the sign bit is merged, so tmax only sees magnitudes.
181 tmax = _mm256_or_si256(tmax, val);
182 val = _mm256_or_si256(val, sign);
183 _mm256_storeu_si256((__m256i*)dp, val);
185 _mm256_storeu_si256((__m256i*)max_val, tmax);
190 float delta,
ui32 count)
// Converts 64-bit sign-magnitude values read from `sp` (bit 63 is the sign,
// magnitude in the remaining bits) back to two's-complement integers: the
// magnitude is shifted right by (63 - K_max) and negated where the sign bit
// was set. Results are stored through `p`.
// NOTE(review): `sp`, `p` and `K_max` are declared outside the lines shown;
// `delta` is not referenced in the visible lines.
// NOTE(review): assumes K_max <= 63, otherwise the unsigned subtraction
// wraps — confirm with callers.
194 ui32 shift = 63 - K_max;
// LLONG_MAX has every bit except bit 63 set: mask that keeps the magnitude.
195 __m256i m1 = _mm256_set1_epi64x(LLONG_MAX);
196 __m256i zero = _mm256_setzero_si256();
197 __m256i one = _mm256_set1_epi64x(1);
// Each iteration handles 4 lanes regardless of how close i is to count, so
// a count that is not a multiple of 4 touches lanes past `count` —
// presumably the buffers are padded; confirm with callers.
199 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
// Aligned load: `sp` must be 32-byte aligned on every iteration.
201 __m256i v = _mm256_load_si256((__m256i*)sp);
202 __m256i val = _mm256_and_si256(v, m1);
203 val = _mm256_srli_epi64(val, (
int)shift);
// All-ones in lanes where v has bit 63 set (v < 0 as a signed value).
204 __m256i sign = _mm256_cmpgt_epi64(zero, v);
// Two's-complement negation in those lanes: invert the bits, then add 1.
205 val = _mm256_xor_si256(val, sign);
206 __m256i ones = _mm256_and_si256(sign, one);
207 val = _mm256_add_epi64(val, ones);
208 _mm256_storeu_si256((__m256i*)p, val);