26#if HWY_COMPILER_GCC_ACTUAL
33#if HWY_TARGET == HWY_SSSE3
46#include <sanitizer/msan_interface.h>
69template <
typename T,
size_t N = 16 /
sizeof(T)>
75 static constexpr size_t kPrivateN =
N;
80 return *
this = (*
this * other);
83 return *
this = (*
this / other);
86 return *
this = (*
this + other);
89 return *
this = (*
this - other);
92 return *
this = (*
this & other);
95 return *
this = (*
this | other);
98 return *
this = (*
this ^ other);
105using Vec64 = Vec128<T, 8 /
sizeof(T)>;
108using Vec32 = Vec128<T, 4 /
sizeof(T)>;
110#if HWY_TARGET <= HWY_AVX3
115template <
size_t size>
136template <
typename T,
size_t N = 16 /
sizeof(T)>
150template <
typename T,
size_t N = 16 /
sizeof(T)>
158using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
161using TFromV =
typename V::PrivateT;
171template <
typename T,
size_t N>
173 return Vec128<uint8_t,
N *
sizeof(T)>{BitCastToInteger(
v.raw)};
190template <
typename T,
size_t N>
198template <
typename T,
size_t N,
typename FromT>
200 Vec128<FromT,
N *
sizeof(T) /
sizeof(FromT)>
v) {
207template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
209 return Vec128<T, N>{_mm_setzero_si128()};
211template <
size_t N, HWY_IF_LE128(
float, N)>
213 return Vec128<float, N>{_mm_setzero_ps()};
215template <
size_t N, HWY_IF_LE128(
double, N)>
226template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
227HWY_API Vec128<uint8_t, N>
Set(Simd<uint8_t, N, 0> ,
const uint8_t t) {
228 return Vec128<uint8_t, N>{_mm_set1_epi8(
static_cast<char>(t))};
230template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
231HWY_API Vec128<uint16_t, N>
Set(Simd<uint16_t, N, 0> ,
233 return Vec128<uint16_t, N>{_mm_set1_epi16(
static_cast<short>(t))};
235template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
236HWY_API Vec128<uint32_t, N>
Set(Simd<uint32_t, N, 0> ,
238 return Vec128<uint32_t, N>{_mm_set1_epi32(
static_cast<int>(t))};
240template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
241HWY_API Vec128<uint64_t, N>
Set(Simd<uint64_t, N, 0> ,
243 return Vec128<uint64_t, N>{
244 _mm_set1_epi64x(
static_cast<long long>(t))};
246template <
size_t N, HWY_IF_LE128(
int8_t, N)>
247HWY_API Vec128<int8_t, N>
Set(Simd<int8_t, N, 0> ,
const int8_t t) {
248 return Vec128<int8_t, N>{_mm_set1_epi8(
static_cast<char>(t))};
250template <
size_t N, HWY_IF_LE128(
int16_t, N)>
251HWY_API Vec128<int16_t, N>
Set(Simd<int16_t, N, 0> ,
const int16_t t) {
252 return Vec128<int16_t, N>{_mm_set1_epi16(
static_cast<short>(t))};
254template <
size_t N, HWY_IF_LE128(
int32_t, N)>
255HWY_API Vec128<int32_t, N>
Set(Simd<int32_t, N, 0> ,
const int32_t t) {
256 return Vec128<int32_t, N>{_mm_set1_epi32(t)};
258template <
size_t N, HWY_IF_LE128(
int64_t, N)>
259HWY_API Vec128<int64_t, N>
Set(Simd<int64_t, N, 0> ,
const int64_t t) {
260 return Vec128<int64_t, N>{
261 _mm_set1_epi64x(
static_cast<long long>(t))};
263template <
size_t N, HWY_IF_LE128(
float, N)>
264HWY_API Vec128<float, N>
Set(Simd<float, N, 0> ,
const float t) {
265 return Vec128<float, N>{_mm_set1_ps(t)};
267template <
size_t N, HWY_IF_LE128(
double, N)>
276template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
280 return Vec128<T, N>{_mm_undefined_si128()};
282template <
size_t N, HWY_IF_LE128(
float, N)>
286template <
size_t N, HWY_IF_LE128(
double, N)>
296template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
298 return static_cast<T
>(_mm_cvtsi128_si32(
v.raw) & 0xFF);
300template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
302 return static_cast<T
>(_mm_cvtsi128_si32(
v.raw) & 0xFFFF);
304template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
306 return static_cast<T
>(_mm_cvtsi128_si32(
v.raw));
310 return _mm_cvtss_f32(
v.raw);
315 alignas(16) uint64_t lanes[2];
319 return static_cast<uint64_t
>(_mm_cvtsi128_si64(
v.raw));
325 alignas(16) int64_t lanes[2];
329 return _mm_cvtsi128_si64(
v.raw);
334 return _mm_cvtsd_f64(
v.raw);
341template <
typename T,
size_t N>
342HWY_API Vec128<T, N>
And(Vec128<T, N>
a, Vec128<T, N> b) {
343 return Vec128<T, N>{_mm_and_si128(
a.raw, b.raw)};
359template <
typename T,
size_t N>
360HWY_API Vec128<T, N>
AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
361 return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
376template <
typename T,
size_t N>
377HWY_API Vec128<T, N>
Or(Vec128<T, N>
a, Vec128<T, N> b) {
378 return Vec128<T, N>{_mm_or_si128(
a.raw, b.raw)};
394template <
typename T,
size_t N>
395HWY_API Vec128<T, N>
Xor(Vec128<T, N>
a, Vec128<T, N> b) {
396 return Vec128<T, N>{_mm_xor_si128(
a.raw, b.raw)};
411template <
typename T,
size_t N>
415 using VU =
VFromD<
decltype(du)>;
416#if HWY_TARGET <= HWY_AVX3
418 return BitCast(
d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
425template <
typename T,
size_t N>
426HWY_API Vec128<T, N>
Xor3(Vec128<T, N>
x1, Vec128<T, N> x2, Vec128<T, N> x3) {
427#if HWY_TARGET <= HWY_AVX3
430 using VU =
VFromD<
decltype(du)>;
431 const __m128i ret = _mm_ternarylogic_epi64(
440template <
typename T,
size_t N>
441HWY_API Vec128<T, N>
Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
442#if HWY_TARGET <= HWY_AVX3
445 using VU =
VFromD<
decltype(du)>;
446 const __m128i ret = _mm_ternarylogic_epi64(
450 return Or(o1,
Or(o2, o3));
455template <
typename T,
size_t N>
456HWY_API Vec128<T, N>
OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
457#if HWY_TARGET <= HWY_AVX3
460 using VU =
VFromD<
decltype(du)>;
461 const __m128i ret = _mm_ternarylogic_epi64(
465 return Or(o,
And(a1, a2));
470template <
typename T,
size_t N>
473#if HWY_TARGET <= HWY_AVX3
476 using VU =
VFromD<
decltype(du)>;
478 d, VU{_mm_ternarylogic_epi64(
BitCast(du, mask).raw,
BitCast(du, yes).raw,
487template <
typename T,
size_t N>
492template <
typename T,
size_t N>
497template <
typename T,
size_t N>
505#if HWY_TARGET == HWY_AVX3_DL
507#ifdef HWY_NATIVE_POPCNT
508#undef HWY_NATIVE_POPCNT
510#define HWY_NATIVE_POPCNT
515template <
typename T,
size_t N>
520template <
typename T,
size_t N>
525template <
typename T,
size_t N>
530template <
typename T,
size_t N>
538template <
typename T,
size_t N>
552template <
typename T,
size_t N>
557template <
typename T,
size_t N>
564template <
typename T,
size_t N>
573HWY_API Vec128<int8_t, N>
Abs(
const Vec128<int8_t, N>
v) {
577 return Vec128<int8_t, N>{_mm_max_epi8(
v.raw, (zero -
v).raw)};
579 return Vec128<int8_t, N>{_mm_abs_epi8(
v.raw)};
583HWY_API Vec128<int16_t, N>
Abs(
const Vec128<int16_t, N>
v) {
584 return Vec128<int16_t, N>{_mm_abs_epi16(
v.raw)};
587HWY_API Vec128<int32_t, N>
Abs(
const Vec128<int32_t, N>
v) {
588 return Vec128<int32_t, N>{_mm_abs_epi32(
v.raw)};
592HWY_API Vec128<float, N>
Abs(
const Vec128<float, N>
v) {
593 const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
604template <
typename T,
size_t N>
606 const Vec128<T, N> sign) {
607 static_assert(
IsFloat<T>(),
"Only makes sense for floating-point");
609 const DFromV<
decltype(magn)>
d;
612#if HWY_TARGET <= HWY_AVX3
624 const __m128i out = _mm_ternarylogic_epi32(
632template <
typename T,
size_t N>
634 const Vec128<T, N> sign) {
635#if HWY_TARGET <= HWY_AVX3
651 __msan_unpoison(unaligned, count *
sizeof(T));
660#if HWY_TARGET <= HWY_AVX3
669template <
typename T,
size_t N>
675template <
typename T,
size_t N>
681template <
typename T,
size_t N>
687template <
typename T,
size_t N>
696template <
typename T,
size_t N>
717template <
typename T,
size_t N>
722template <
typename T,
size_t N>
727template <
typename T,
size_t N>
732template <
typename T,
size_t N>
740template <
typename T,
size_t N>
759template <
typename T,
size_t N>
765template <
typename T,
size_t N>
770template <
typename T,
size_t N>
775template <
typename T,
size_t N>
783template <
typename T,
size_t N>
803#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
804#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
805 HWY_COMPILER_CLANG >= 800
806#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
808#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
814template <
typename T,
size_t N>
817#if HWY_COMPILER_HAS_MASK_INTRINSICS
823template <
typename T,
size_t N>
826#if HWY_COMPILER_HAS_MASK_INTRINSICS
832template <
typename T,
size_t N>
835#if HWY_COMPILER_HAS_MASK_INTRINSICS
841template <
typename T,
size_t N>
844#if HWY_COMPILER_HAS_MASK_INTRINSICS
851template <
typename T,
size_t N>
854#if HWY_COMPILER_HAS_MASK_INTRINSICS
860template <
typename T,
size_t N>
863#if HWY_COMPILER_HAS_MASK_INTRINSICS
869template <
typename T,
size_t N>
872#if HWY_COMPILER_HAS_MASK_INTRINSICS
878template <
typename T,
size_t N>
881#if HWY_COMPILER_HAS_MASK_INTRINSICS
888template <
typename T,
size_t N>
891#if HWY_COMPILER_HAS_MASK_INTRINSICS
897template <
typename T,
size_t N>
900#if HWY_COMPILER_HAS_MASK_INTRINSICS
906template <
typename T,
size_t N>
909#if HWY_COMPILER_HAS_MASK_INTRINSICS
915template <
typename T,
size_t N>
918#if HWY_COMPILER_HAS_MASK_INTRINSICS
925template <
typename T,
size_t N>
928#if HWY_COMPILER_HAS_MASK_INTRINSICS
934template <
typename T,
size_t N>
937#if HWY_COMPILER_HAS_MASK_INTRINSICS
943template <
typename T,
size_t N>
946#if HWY_COMPILER_HAS_MASK_INTRINSICS
952template <
typename T,
size_t N>
955#if HWY_COMPILER_HAS_MASK_INTRINSICS
962template <
typename T,
size_t N>
966#if HWY_COMPILER_HAS_MASK_INTRINSICS
972template <
typename T,
size_t N>
976#if HWY_COMPILER_HAS_MASK_INTRINSICS
982template <
typename T,
size_t N>
986#if HWY_COMPILER_HAS_MASK_INTRINSICS
992template <
typename T,
size_t N>
996#if HWY_COMPILER_HAS_MASK_INTRINSICS
1005template <
typename T,
size_t N>
1006HWY_API Mask128<T, N>
And(
const Mask128<T, N>
a, Mask128<T, N> b) {
1010template <
typename T,
size_t N>
1011HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N>
a, Mask128<T, N> b) {
1015template <
typename T,
size_t N>
1016HWY_API Mask128<T, N>
Or(
const Mask128<T, N>
a, Mask128<T, N> b) {
1020template <
typename T,
size_t N>
1021HWY_API Mask128<T, N>
Xor(
const Mask128<T, N>
a, Mask128<T, N> b) {
1025template <
typename T,
size_t N>
1026HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
1032template <
typename T,
size_t N>
1042template <
typename T,
size_t N>
1044 return Mask128<T, N>{
v.raw};
1047template <
typename T,
size_t N>
1049 return Vec128<T, N>{
v.raw};
1052template <
typename T,
size_t N>
1054 const Mask128<T, N>
v) {
1055 return Vec128<T, N>{
v.raw};
1058#if HWY_TARGET == HWY_SSSE3
1061template <
typename T,
size_t N>
1071template <
typename T,
size_t N>
1074 return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
1078 const Vec128<float, N> yes,
1079 const Vec128<float, N> no) {
1080 return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
1084 const Vec128<double, N> yes,
1085 const Vec128<double, N> no) {
1086 return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
1092template <
typename T,
size_t N>
1098template <
typename T,
size_t N>
1105template <
typename T,
size_t N>
1106HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
1110template <
typename T,
size_t N>
1111HWY_API Mask128<T, N>
And(
const Mask128<T, N>
a, Mask128<T, N> b) {
1112 const Simd<T, N, 0>
d;
1116template <
typename T,
size_t N>
1117HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N>
a, Mask128<T, N> b) {
1118 const Simd<T, N, 0>
d;
1122template <
typename T,
size_t N>
1123HWY_API Mask128<T, N>
Or(
const Mask128<T, N>
a, Mask128<T, N> b) {
1124 const Simd<T, N, 0>
d;
1128template <
typename T,
size_t N>
1129HWY_API Mask128<T, N>
Xor(
const Mask128<T, N>
a, Mask128<T, N> b) {
1130 const Simd<T, N, 0>
d;
1134template <
typename T,
size_t N>
1136 const Simd<T, N, 0>
d;
1144template <
int kBits,
size_t N>
1146 return Vec128<uint16_t, N>{_mm_slli_epi16(
v.raw, kBits)};
1149template <
int kBits,
size_t N>
1151 return Vec128<uint32_t, N>{_mm_slli_epi32(
v.raw, kBits)};
1154template <
int kBits,
size_t N>
1156 return Vec128<uint64_t, N>{_mm_slli_epi64(
v.raw, kBits)};
1159template <
int kBits,
size_t N>
1161 return Vec128<int16_t, N>{_mm_slli_epi16(
v.raw, kBits)};
1163template <
int kBits,
size_t N>
1165 return Vec128<int32_t, N>{_mm_slli_epi32(
v.raw, kBits)};
1167template <
int kBits,
size_t N>
1169 return Vec128<int64_t, N>{_mm_slli_epi64(
v.raw, kBits)};
1172template <
int kBits,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1179 : (shifted &
Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1184template <
int kBits,
size_t N>
1186 return Vec128<uint16_t, N>{_mm_srli_epi16(
v.raw, kBits)};
1188template <
int kBits,
size_t N>
1190 return Vec128<uint32_t, N>{_mm_srli_epi32(
v.raw, kBits)};
1192template <
int kBits,
size_t N>
1194 return Vec128<uint64_t, N>{_mm_srli_epi64(
v.raw, kBits)};
1197template <
int kBits,
size_t N>
1201 const Vec128<uint8_t, N> shifted{
1203 return shifted &
Set(d8, 0xFF >> kBits);
1206template <
int kBits,
size_t N>
1208 return Vec128<int16_t, N>{_mm_srai_epi16(
v.raw, kBits)};
1210template <
int kBits,
size_t N>
1212 return Vec128<int32_t, N>{_mm_srai_epi32(
v.raw, kBits)};
1215template <
int kBits,
size_t N>
1220 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
1221 return (shifted ^ shifted_sign) - shifted_sign;
1229template <
typename T,
size_t N,
typename TI,
size_t NI>
1231 const Vec128<TI, NI> from) {
1232 return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
1237template <
class V,
class VI>
1250template <
typename T,
size_t N>
1252 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
1253 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
1254 return Vec128<T, N>{_mm_shuffle_epi32(
v.raw, 0xB1)};
1258 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
1267template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1271 alignas(16)
const T kShuffle[8] = {1, 0, 7, 6};
1274template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1278 alignas(16)
const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c};
1281template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1285 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
1290template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1294 alignas(16)
const T kShuffle[8] = {0, 3, 6, 5};
1297template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1301 alignas(16)
const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a};
1304template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1308 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
1313template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1317 alignas(16)
const T kShuffle[8] = {2, 1, 4, 7};
1320template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1324 alignas(16)
const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e};
1327template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1331 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
1392#if HWY_TARGET <= HWY_AVX3
1396template <
typename TFrom,
size_t NFrom,
typename TTo,
size_t NTo>
1399 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1405template <
typename T,
size_t N>
1410template <
typename T,
size_t N>
1415template <
typename T,
size_t N>
1420template <
typename T,
size_t N>
1428template <
typename T,
size_t N>
1429HWY_API Mask128<T, N>
TestBit(
const Vec128<T, N>
v,
const Vec128<T, N> bit) {
1436template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1441template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1443 return Mask128<T, N>{_mm_cmpeq_epi16_mask(
a.raw, b.raw)};
1446template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1448 return Mask128<T, N>{_mm_cmpeq_epi32_mask(
a.raw, b.raw)};
1451template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1453 return Mask128<T, N>{_mm_cmpeq_epi64_mask(
a.raw, b.raw)};
1458 return Mask128<float, N>{_mm_cmp_ps_mask(
a.raw, b.raw, _CMP_EQ_OQ)};
1469template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1474template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1476 return Mask128<T, N>{_mm_cmpneq_epi16_mask(
a.raw, b.raw)};
1479template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1481 return Mask128<T, N>{_mm_cmpneq_epi32_mask(
a.raw, b.raw)};
1484template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1486 return Mask128<T, N>{_mm_cmpneq_epi64_mask(
a.raw, b.raw)};
1491 return Mask128<float, N>{_mm_cmp_ps_mask(
a.raw, b.raw, _CMP_NEQ_OQ)};
1505 return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(
a.raw, b.raw)};
1509 Vec128<int16_t, N> b) {
1510 return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(
a.raw, b.raw)};
1514 Vec128<int32_t, N> b) {
1515 return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(
a.raw, b.raw)};
1519 Vec128<int64_t, N> b) {
1520 return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(
a.raw, b.raw)};
1525 Vec128<uint8_t, N> b) {
1526 return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(
a.raw, b.raw)};
1530 Vec128<uint16_t, N> b) {
1531 return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(
a.raw, b.raw)};
1535 Vec128<uint32_t, N> b) {
1536 return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(
a.raw, b.raw)};
1540 Vec128<uint64_t, N> b) {
1541 return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(
a.raw, b.raw)};
1546 return Mask128<float, N>{_mm_cmp_ps_mask(
a.raw, b.raw, _CMP_GT_OQ)};
1557 return Mask128<float, N>{_mm_cmp_ps_mask(
a.raw, b.raw, _CMP_GE_OQ)};
1569template <
typename T,
size_t N>
1574template <
typename T,
size_t N>
1579template <
typename T,
size_t N>
1584template <
typename T,
size_t N>
1592template <
typename T,
size_t N>
1608template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1613template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1615 return Vec128<T, N>{_mm_movm_epi16(
v.raw)};
1618template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1620 return Vec128<T, N>{_mm_movm_epi32(
v.raw)};
1623template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1625 return Vec128<T, N>{_mm_movm_epi64(
v.raw)};
1638template <
typename T,
size_t N>
1640 const Mask128<T, N>
v) {
1648template <
typename TFrom,
typename TTo,
size_t N>
1650 Mask128<TFrom, N> m) {
1651 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1652 const Simd<TFrom, N, 0>
d;
1656template <
typename T,
size_t N>
1659 return (
v & bit) == bit;
1667 const Vec128<uint8_t, N> b) {
1668 return Mask128<uint8_t, N>{_mm_cmpeq_epi8(
a.raw, b.raw)};
1672 const Vec128<uint16_t, N> b) {
1673 return Mask128<uint16_t, N>{_mm_cmpeq_epi16(
a.raw, b.raw)};
1677 const Vec128<uint32_t, N> b) {
1678 return Mask128<uint32_t, N>{_mm_cmpeq_epi32(
a.raw, b.raw)};
1682 const Vec128<uint64_t, N> b) {
1683#if HWY_TARGET == HWY_SSSE3
1684 const Simd<uint32_t, N * 2, 0> d32;
1685 const Simd<uint64_t, N, 0> d64;
1690 return Mask128<uint64_t, N>{_mm_cmpeq_epi64(
a.raw, b.raw)};
1697 const Vec128<int8_t, N> b) {
1698 return Mask128<int8_t, N>{_mm_cmpeq_epi8(
a.raw, b.raw)};
1702 Vec128<int16_t, N> b) {
1703 return Mask128<int16_t, N>{_mm_cmpeq_epi16(
a.raw, b.raw)};
1707 const Vec128<int32_t, N> b) {
1708 return Mask128<int32_t, N>{_mm_cmpeq_epi32(
a.raw, b.raw)};
1712 const Vec128<int64_t, N> b) {
1722 const Vec128<float, N> b) {
1723 return Mask128<float, N>{_mm_cmpeq_ps(
a.raw, b.raw)};
1727 const Vec128<double, N> b) {
1728 return Mask128<double, N>{_mm_cmpeq_pd(
a.raw, b.raw)};
1738 Vec128<uint8_t, N> b) {
1743 Vec128<uint16_t, N> b) {
1748 Vec128<uint32_t, N> b) {
1753 Vec128<uint64_t, N> b) {
1758 Vec128<int8_t, N> b) {
1763 Vec128<int16_t, N> b) {
1768 Vec128<int32_t, N> b) {
1773 Vec128<int64_t, N> b) {
1779 const Vec128<float, N> b) {
1780 return Mask128<float, N>{_mm_cmpneq_ps(
a.raw, b.raw)};
1784 const Vec128<double, N> b) {
1785 return Mask128<double, N>{_mm_cmpneq_pd(
a.raw, b.raw)};
1794 Vec128<int8_t, N> b) {
1795 return Mask128<int8_t, N>{_mm_cmpgt_epi8(
a.raw, b.raw)};
1799 Vec128<int16_t, N> b) {
1800 return Mask128<int16_t, N>{_mm_cmpgt_epi16(
a.raw, b.raw)};
1804 Vec128<int32_t, N> b) {
1805 return Mask128<int32_t, N>{_mm_cmpgt_epi32(
a.raw, b.raw)};
1810 const Vec128<int64_t, N>
a,
1811 const Vec128<int64_t, N> b) {
1812#if HWY_TARGET == HWY_SSSE3
1814 const Simd<int64_t, N, 0>
d;
1816 const Vec128<int64_t, N> m_eq32{Eq(
BitCast(d32,
a),
BitCast(d32, b)).raw};
1820 const __m128i upper =
OrAnd(m_gt32, m_eq32,
Sub(b,
a)).raw;
1822 return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
1824 return Mask128<int64_t, N>{_mm_cmpgt_epi64(
a.raw, b.raw)};
1828template <
typename T,
size_t N>
1841 Vec128<float, N> b) {
1842 return Mask128<float, N>{_mm_cmpgt_ps(
a.raw, b.raw)};
1846 Vec128<double, N> b) {
1847 return Mask128<double, N>{_mm_cmpgt_pd(
a.raw, b.raw)};
1852template <
typename T,
size_t N>
1861 const Vec128<float, N> b) {
1862 return Mask128<float, N>{_mm_cmpge_ps(
a.raw, b.raw)};
1866 const Vec128<double, N> b) {
1867 return Mask128<double, N>{_mm_cmpge_pd(
a.raw, b.raw)};
1874template <
typename T,
size_t N>
1879template <
typename T,
size_t N>
1886template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1888#if HWY_TARGET <= HWY_AVX3
1890 const uint64_t all = (1ull <<
N) - 1;
1892 const uint64_t
bits = (num > 255) ? all : _bzhi_u64(all, num);
1911#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1912#if defined(__clang_analyzer__) || \
1913 (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1914#define HWY_SAFE_PARTIAL_LOAD_STORE 1
1916#define HWY_SAFE_PARTIAL_LOAD_STORE 0
1922template <
typename T>
1924 return Vec128<T>{_mm_load_si128(
reinterpret_cast<const __m128i*
>(aligned))};
1935template <
typename T>
1937 return Vec128<T>{_mm_loadu_si128(
reinterpret_cast<const __m128i*
>(p))};
1941 return Vec128<float>{_mm_loadu_ps(p)};
1948template <
typename T>
1950#if HWY_SAFE_PARTIAL_LOAD_STORE
1951 __m128i
v = _mm_setzero_si128();
1955 return Vec64<T>{_mm_loadl_epi64(
reinterpret_cast<const __m128i*
>(p))};
1961#if HWY_SAFE_PARTIAL_LOAD_STORE
1962 __m128
v = _mm_setzero_ps();
1966 const __m128 hi = _mm_setzero_ps();
1967 return Vec128<float, 2>{_mm_loadl_pi(hi,
reinterpret_cast<const __m64*
>(p))};
1973#if HWY_SAFE_PARTIAL_LOAD_STORE
1974 __m128d
v = _mm_setzero_pd();
1984#if HWY_SAFE_PARTIAL_LOAD_STORE
1985 __m128
v = _mm_setzero_ps();
1994template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
1996 constexpr size_t kSize =
sizeof(T) *
N;
1997#if HWY_SAFE_PARTIAL_LOAD_STORE
1998 __m128
v = _mm_setzero_ps();
2000 return Vec128<T, N>{
v};
2004 return Vec128<T, N>{_mm_cvtsi32_si128(
bits)};
2009template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2015template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
2021template <
typename T,
size_t N,
typename T2, HWY_IF_LE128(T, N)>
2024 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
2028 return Load(
d, lanes);
2033#if HWY_TARGET <= HWY_AVX3
2035template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2041template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2044 return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
2047template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2050 return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
2053template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2056 return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
2073#elif HWY_TARGET == HWY_AVX2
2075template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2078 auto p_p =
reinterpret_cast<const int*
>(p);
2079 return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
2082template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2085 auto p_p =
reinterpret_cast<const long long*
>(p);
2086 return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
2092 const Vec128<int32_t, N> mi =
2094 return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
2100 const Vec128<int64_t, N> mi =
2102 return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
2106template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 6)>
2115template <
typename T,
size_t N>
2125template <
typename T>
2127 _mm_store_si128(
reinterpret_cast<__m128i*
>(aligned),
v.raw);
2131 _mm_store_ps(aligned,
v.raw);
2135 _mm_store_pd(aligned,
v.raw);
2138template <
typename T>
2140 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(p),
v.raw);
2144 _mm_storeu_ps(p,
v.raw);
2148 _mm_storeu_pd(p,
v.raw);
2151template <
typename T>
2153#if HWY_SAFE_PARTIAL_LOAD_STORE
2156 _mm_storel_epi64(
reinterpret_cast<__m128i*
>(p),
v.raw);
2161#if HWY_SAFE_PARTIAL_LOAD_STORE
2164 _mm_storel_pi(
reinterpret_cast<__m64*
>(p),
v.raw);
2169#if HWY_SAFE_PARTIAL_LOAD_STORE
2172 _mm_storel_pd(p,
v.raw);
2177template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
2183#if HWY_SAFE_PARTIAL_LOAD_STORE
2186 _mm_store_ss(p,
v.raw);
2191template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2204template <
typename T,
size_t N>
2208 using TI =
TFromD<
decltype(di)>;
2209 alignas(16) TI
buf[
N];
2210 alignas(16) TI mask[
N];
2213 for (
size_t i = 0; i <
N; ++i) {
2221#if HWY_TARGET <= HWY_AVX3
2223template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2226 _mm_mask_storeu_epi8(p, m.
raw,
v.raw);
2228template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2231 _mm_mask_storeu_epi16(p, m.raw,
v.raw);
2234template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2237 auto pi =
reinterpret_cast<int*
>(p);
2238 _mm_mask_storeu_epi32(pi, m.raw,
v.raw);
2241template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2244 auto pi =
reinterpret_cast<long long*
>(p);
2245 _mm_mask_storeu_epi64(pi, m.raw,
v.raw);
2251 _mm_mask_storeu_ps(p, m.
raw,
v.raw);
2257 _mm_mask_storeu_pd(p, m.
raw,
v.raw);
2260#elif HWY_TARGET == HWY_AVX2
2262template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 6)>
2268template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2274 const Mask128<T> mf{m.raw};
2275 m = Mask128<T, N>{
And(mf,
FirstN(df,
N)).raw};
2278 auto pi =
reinterpret_cast<int*
>(p);
2279 _mm_maskstore_epi32(pi, m.raw,
v.raw);
2282template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2288 const Mask128<T> mf{m.raw};
2289 m = Mask128<T, N>{
And(mf,
FirstN(df,
N)).raw};
2292 auto pi =
reinterpret_cast<long long*
>(p);
2293 _mm_maskstore_epi64(pi, m.raw,
v.raw);
2303 const Mask128<T> mf{m.raw};
2304 m = Mask128<T, N>{
And(mf,
FirstN(df,
N)).raw};
2307 const Vec128<MakeSigned<T>,
N> mi =
2309 _mm_maskstore_ps(p, mi.raw,
v.raw);
2319 const Mask128<T> mf{m.raw};
2320 m = Mask128<T, N>{
And(mf,
FirstN(df,
N)).raw};
2323 const Vec128<MakeSigned<T>,
N> mi =
2325 _mm_maskstore_pd(p, mi.raw,
v.raw);
2330template <
typename T,
size_t N>
2346 const Vec128<uint8_t, N> b) {
2347 return Vec128<uint8_t, N>{_mm_add_epi8(
a.raw, b.raw)};
2351 const Vec128<uint16_t, N> b) {
2352 return Vec128<uint16_t, N>{_mm_add_epi16(
a.raw, b.raw)};
2356 const Vec128<uint32_t, N> b) {
2357 return Vec128<uint32_t, N>{_mm_add_epi32(
a.raw, b.raw)};
2361 const Vec128<uint64_t, N> b) {
2362 return Vec128<uint64_t, N>{_mm_add_epi64(
a.raw, b.raw)};
2368 const Vec128<int8_t, N> b) {
2369 return Vec128<int8_t, N>{_mm_add_epi8(
a.raw, b.raw)};
2373 const Vec128<int16_t, N> b) {
2374 return Vec128<int16_t, N>{_mm_add_epi16(
a.raw, b.raw)};
2378 const Vec128<int32_t, N> b) {
2379 return Vec128<int32_t, N>{_mm_add_epi32(
a.raw, b.raw)};
2383 const Vec128<int64_t, N> b) {
2384 return Vec128<int64_t, N>{_mm_add_epi64(
a.raw, b.raw)};
2390 const Vec128<float, N> b) {
2391 return Vec128<float, N>{_mm_add_ps(
a.raw, b.raw)};
2404 const Vec128<uint8_t, N> b) {
2405 return Vec128<uint8_t, N>{_mm_sub_epi8(
a.raw, b.raw)};
2409 Vec128<uint16_t, N> b) {
2410 return Vec128<uint16_t, N>{_mm_sub_epi16(
a.raw, b.raw)};
2414 const Vec128<uint32_t, N> b) {
2415 return Vec128<uint32_t, N>{_mm_sub_epi32(
a.raw, b.raw)};
2419 const Vec128<uint64_t, N> b) {
2420 return Vec128<uint64_t, N>{_mm_sub_epi64(
a.raw, b.raw)};
2426 const Vec128<int8_t, N> b) {
2427 return Vec128<int8_t, N>{_mm_sub_epi8(
a.raw, b.raw)};
2431 const Vec128<int16_t, N> b) {
2432 return Vec128<int16_t, N>{_mm_sub_epi16(
a.raw, b.raw)};
2436 const Vec128<int32_t, N> b) {
2437 return Vec128<int32_t, N>{_mm_sub_epi32(
a.raw, b.raw)};
2441 const Vec128<int64_t, N> b) {
2442 return Vec128<int64_t, N>{_mm_sub_epi64(
a.raw, b.raw)};
2448 const Vec128<float, N> b) {
2449 return Vec128<float, N>{_mm_sub_ps(
a.raw, b.raw)};
2460 return Vec128<uint64_t,
N / 8>{_mm_sad_epu8(
v.raw, _mm_setzero_si128())};
2470 const Vec128<uint8_t, N> b) {
2471 return Vec128<uint8_t, N>{_mm_adds_epu8(
a.raw, b.raw)};
2475 const Vec128<uint16_t, N> b) {
2476 return Vec128<uint16_t, N>{_mm_adds_epu16(
a.raw, b.raw)};
2482 const Vec128<int8_t, N> b) {
2483 return Vec128<int8_t, N>{_mm_adds_epi8(
a.raw, b.raw)};
2487 const Vec128<int16_t, N> b) {
2488 return Vec128<int16_t, N>{_mm_adds_epi16(
a.raw, b.raw)};
2498 const Vec128<uint8_t, N> b) {
2499 return Vec128<uint8_t, N>{_mm_subs_epu8(
a.raw, b.raw)};
2503 const Vec128<uint16_t, N> b) {
2504 return Vec128<uint16_t, N>{_mm_subs_epu16(
a.raw, b.raw)};
2510 const Vec128<int8_t, N> b) {
2511 return Vec128<int8_t, N>{_mm_subs_epi8(
a.raw, b.raw)};
2515 const Vec128<int16_t, N> b) {
2516 return Vec128<int16_t, N>{_mm_subs_epi16(
a.raw, b.raw)};
2526 const Vec128<uint8_t, N> b) {
2527 return Vec128<uint8_t, N>{_mm_avg_epu8(
a.raw, b.raw)};
2531 const Vec128<uint16_t, N> b) {
2532 return Vec128<uint16_t, N>{_mm_avg_epu16(
a.raw, b.raw)};
2539 const Vec128<uint16_t, N> b) {
2540 return Vec128<uint16_t, N>{_mm_mullo_epi16(
a.raw, b.raw)};
2544 const Vec128<int16_t, N> b) {
2545 return Vec128<int16_t, N>{_mm_mullo_epi16(
a.raw, b.raw)};
2551 const Vec128<uint16_t, N> b) {
2552 return Vec128<uint16_t, N>{_mm_mulhi_epu16(
a.raw, b.raw)};
2556 const Vec128<int16_t, N> b) {
2557 return Vec128<int16_t, N>{_mm_mulhi_epi16(
a.raw, b.raw)};
2562 const Vec128<int16_t, N> b) {
2563 return Vec128<int16_t, N>{_mm_mulhrs_epi16(
a.raw, b.raw)};
2570 const Vec128<uint32_t, N> b) {
2571 return Vec128<uint64_t, (
N + 1) / 2>{_mm_mul_epu32(
a.raw, b.raw)};
2574#if HWY_TARGET == HWY_SSSE3
2576template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2579 return Set(
Simd<int64_t, (
N + 1) / 2, 0>(),
2583 const Vec128<int32_t> b) {
2584 alignas(16) int32_t a_lanes[4];
2585 alignas(16) int32_t b_lanes[4];
2588 Store(b, di32, b_lanes);
2589 alignas(16) int64_t mul[2];
2590 mul[0] =
static_cast<int64_t
>(a_lanes[0]) * b_lanes[0];
2591 mul[1] =
static_cast<int64_t
>(a_lanes[2]) * b_lanes[2];
2599 const Vec128<int32_t, N> b) {
2600 return Vec128<int64_t, (
N + 1) / 2>{_mm_mul_epi32(
a.raw, b.raw)};
2607 const Vec128<uint32_t, N> b) {
2608#if HWY_TARGET == HWY_SSSE3
2612 const __m128i a_x3x1 = _mm_shuffle_epi32(
a.raw, _MM_SHUFFLE(3, 3, 1, 1));
2613 const auto mullo_x2x0 =
MulEven(
a, b);
2614 const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
2615 const auto mullo_x3x1 =
2616 MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
2619 const __m128i mul_20 =
2620 _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
2621 const __m128i mul_31 =
2622 _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
2623 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
2625 return Vec128<uint32_t, N>{_mm_mullo_epi32(
a.raw, b.raw)};
2631 const Vec128<int32_t, N> b) {
2640template <
int kBits,
size_t N>
2642 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
2643#if HWY_TARGET <= HWY_AVX3
2644 return Vec128<uint32_t, N>{_mm_ror_epi32(
v.raw, kBits)};
2646 if (kBits == 0)
return v;
2651template <
int kBits,
size_t N>
2653 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
2654#if HWY_TARGET <= HWY_AVX3
2655 return Vec128<uint64_t, N>{_mm_ror_epi64(
v.raw, kBits)};
2657 if (kBits == 0)
return v;
2683#if HWY_TARGET <= HWY_AVX3
2686#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2694 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2699HWY_API Vec128<int64_t, N>
Abs(
const Vec128<int64_t, N>
v) {
2700#if HWY_TARGET <= HWY_AVX3
2701 return Vec128<int64_t, N>{_mm_abs_epi64(
v.raw)};
2708template <
int kBits,
size_t N>
2710#if HWY_TARGET <= HWY_AVX3
2711 return Vec128<int64_t, N>{_mm_srai_epi64(
v.raw, kBits)};
2717 return right | sign;
2722template <
typename T,
size_t N>
2724 static_assert(
IsFloat<T>(),
"Only works for float");
2726#if HWY_TARGET == HWY_SSSE3
2744template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2747 static_assert(
IsSigned<T>(),
"Only works for signed/float");
2756template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2759 static_assert(
IsSigned<T>(),
"Only works for signed/float");
2773 return Vec128<uint16_t, N>{_mm_sll_epi16(
v.raw, _mm_cvtsi32_si128(
bits))};
2778 return Vec128<uint32_t, N>{_mm_sll_epi32(
v.raw, _mm_cvtsi32_si128(
bits))};
2783 return Vec128<uint64_t, N>{_mm_sll_epi64(
v.raw, _mm_cvtsi32_si128(
bits))};
2789 return Vec128<int16_t, N>{_mm_sll_epi16(
v.raw, _mm_cvtsi32_si128(
bits))};
2795 return Vec128<int32_t, N>{_mm_sll_epi32(
v.raw, _mm_cvtsi32_si128(
bits))};
2801 return Vec128<int64_t, N>{_mm_sll_epi64(
v.raw, _mm_cvtsi32_si128(
bits))};
2804template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2808 const Vec128<T, N> shifted{
2810 return shifted &
Set(d8,
static_cast<T
>((0xFF <<
bits) & 0xFF));
2818 return Vec128<uint16_t, N>{_mm_srl_epi16(
v.raw, _mm_cvtsi32_si128(
bits))};
2823 return Vec128<uint32_t, N>{_mm_srl_epi32(
v.raw, _mm_cvtsi32_si128(
bits))};
2828 return Vec128<uint64_t, N>{_mm_srl_epi64(
v.raw, _mm_cvtsi32_si128(
bits))};
2836 const Vec128<uint8_t, N> shifted{
2838 return shifted &
Set(d8,
static_cast<uint8_t
>(0xFF >>
bits));
2844 return Vec128<int16_t, N>{_mm_sra_epi16(
v.raw, _mm_cvtsi32_si128(
bits))};
2850 return Vec128<int32_t, N>{_mm_sra_epi32(
v.raw, _mm_cvtsi32_si128(
bits))};
2855#if HWY_TARGET <= HWY_AVX3
2856 return Vec128<int64_t, N>{_mm_sra_epi64(
v.raw, _mm_cvtsi32_si128(
bits))};
2862 return right | sign;
2871 const auto shifted_sign =
2873 return (shifted ^ shifted_sign) - shifted_sign;
2880 return Vec128<float, N>{_mm_mul_ps(
a.raw, b.raw)};
2897 const Vec128<float, N> b) {
2898 return Vec128<float, N>{_mm_div_ps(
a.raw, b.raw)};
2916 return Vec128<float, N>{_mm_rcp_ps(
v.raw)};
2925 const Vec128<float, N> b) {
2934 const Vec128<float, N>
x,
2935 const Vec128<float, N> add) {
2936#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2937 return mul *
x + add;
2939 return Vec128<float, N>{_mm_fmadd_ps(mul.raw,
x.raw, add.raw)};
2946#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2947 return mul *
x + add;
2956 const Vec128<float, N>
x,
2957 const Vec128<float, N> add) {
2958#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2959 return add - mul *
x;
2961 return Vec128<float, N>{_mm_fnmadd_ps(mul.raw,
x.raw, add.raw)};
2968#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2969 return add - mul *
x;
2978 const Vec128<float, N>
x,
2979 const Vec128<float, N> sub) {
2980#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2981 return mul *
x - sub;
2983 return Vec128<float, N>{_mm_fmsub_ps(mul.raw,
x.raw, sub.raw)};
2990#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2991 return mul *
x - sub;
3000 const Vec128<float, N>
x,
3001 const Vec128<float, N> sub) {
3002#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3003 return Neg(mul) *
x - sub;
3005 return Vec128<float, N>{_mm_fnmsub_ps(mul.raw,
x.raw, sub.raw)};
3012#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3013 return Neg(mul) *
x - sub;
3023HWY_API Vec128<float, N>
Sqrt(
const Vec128<float, N>
v) {
3024 return Vec128<float, N>{_mm_sqrt_ps(
v.raw)};
3040 return Vec128<float, N>{_mm_rsqrt_ps(
v.raw)};
3050template <
typename T,
size_t N>
3056 const auto msb =
Set(du,
static_cast<T
>(T(1) << (
sizeof(T) * 8 - 1)));
3065HWY_API Vec128<uint8_t, N>
Min(
const Vec128<uint8_t, N>
a,
3066 const Vec128<uint8_t, N> b) {
3067 return Vec128<uint8_t, N>{_mm_min_epu8(
a.raw, b.raw)};
3070HWY_API Vec128<uint16_t, N>
Min(
const Vec128<uint16_t, N>
a,
3071 const Vec128<uint16_t, N> b) {
3072#if HWY_TARGET == HWY_SSSE3
3075 return Vec128<uint16_t, N>{_mm_min_epu16(
a.raw, b.raw)};
3079HWY_API Vec128<uint32_t, N>
Min(
const Vec128<uint32_t, N>
a,
3080 const Vec128<uint32_t, N> b) {
3081#if HWY_TARGET == HWY_SSSE3
3084 return Vec128<uint32_t, N>{_mm_min_epu32(
a.raw, b.raw)};
3088HWY_API Vec128<uint64_t, N>
Min(
const Vec128<uint64_t, N>
a,
3089 const Vec128<uint64_t, N> b) {
3090#if HWY_TARGET <= HWY_AVX3
3091 return Vec128<uint64_t, N>{_mm_min_epu64(
a.raw, b.raw)};
3099HWY_API Vec128<int8_t, N>
Min(
const Vec128<int8_t, N>
a,
3100 const Vec128<int8_t, N> b) {
3101#if HWY_TARGET == HWY_SSSE3
3104 return Vec128<int8_t, N>{_mm_min_epi8(
a.raw, b.raw)};
3108HWY_API Vec128<int16_t, N>
Min(
const Vec128<int16_t, N>
a,
3109 const Vec128<int16_t, N> b) {
3110 return Vec128<int16_t, N>{_mm_min_epi16(
a.raw, b.raw)};
3113HWY_API Vec128<int32_t, N>
Min(
const Vec128<int32_t, N>
a,
3114 const Vec128<int32_t, N> b) {
3115#if HWY_TARGET == HWY_SSSE3
3118 return Vec128<int32_t, N>{_mm_min_epi32(
a.raw, b.raw)};
3122HWY_API Vec128<int64_t, N>
Min(
const Vec128<int64_t, N>
a,
3123 const Vec128<int64_t, N> b) {
3124#if HWY_TARGET <= HWY_AVX3
3125 return Vec128<int64_t, N>{_mm_min_epi64(
a.raw, b.raw)};
3133HWY_API Vec128<float, N>
Min(
const Vec128<float, N>
a,
3134 const Vec128<float, N> b) {
3135 return Vec128<float, N>{_mm_min_ps(
a.raw, b.raw)};
3146template <
typename T,
size_t N>
3152 const auto msb =
Set(du,
static_cast<T
>(T(1) << (
sizeof(T) * 8 - 1)));
3161HWY_API Vec128<uint8_t, N>
Max(
const Vec128<uint8_t, N>
a,
3162 const Vec128<uint8_t, N> b) {
3163 return Vec128<uint8_t, N>{_mm_max_epu8(
a.raw, b.raw)};
3166HWY_API Vec128<uint16_t, N>
Max(
const Vec128<uint16_t, N>
a,
3167 const Vec128<uint16_t, N> b) {
3168#if HWY_TARGET == HWY_SSSE3
3171 return Vec128<uint16_t, N>{_mm_max_epu16(
a.raw, b.raw)};
3175HWY_API Vec128<uint32_t, N>
Max(
const Vec128<uint32_t, N>
a,
3176 const Vec128<uint32_t, N> b) {
3177#if HWY_TARGET == HWY_SSSE3
3180 return Vec128<uint32_t, N>{_mm_max_epu32(
a.raw, b.raw)};
3184HWY_API Vec128<uint64_t, N>
Max(
const Vec128<uint64_t, N>
a,
3185 const Vec128<uint64_t, N> b) {
3186#if HWY_TARGET <= HWY_AVX3
3187 return Vec128<uint64_t, N>{_mm_max_epu64(
a.raw, b.raw)};
3195HWY_API Vec128<int8_t, N>
Max(
const Vec128<int8_t, N>
a,
3196 const Vec128<int8_t, N> b) {
3197#if HWY_TARGET == HWY_SSSE3
3200 return Vec128<int8_t, N>{_mm_max_epi8(
a.raw, b.raw)};
3204HWY_API Vec128<int16_t, N>
Max(
const Vec128<int16_t, N>
a,
3205 const Vec128<int16_t, N> b) {
3206 return Vec128<int16_t, N>{_mm_max_epi16(
a.raw, b.raw)};
3209HWY_API Vec128<int32_t, N>
Max(
const Vec128<int32_t, N>
a,
3210 const Vec128<int32_t, N> b) {
3211#if HWY_TARGET == HWY_SSSE3
3214 return Vec128<int32_t, N>{_mm_max_epi32(
a.raw, b.raw)};
3218HWY_API Vec128<int64_t, N>
Max(
const Vec128<int64_t, N>
a,
3219 const Vec128<int64_t, N> b) {
3220#if HWY_TARGET <= HWY_AVX3
3221 return Vec128<int64_t, N>{_mm_max_epi64(
a.raw, b.raw)};
3229HWY_API Vec128<float, N>
Max(
const Vec128<float, N>
a,
3230 const Vec128<float, N> b) {
3231 return Vec128<float, N>{_mm_max_ps(
a.raw, b.raw)};
3245template <
typename T,
size_t N>
3248 _mm_stream_si128(
reinterpret_cast<__m128i*
>(aligned),
v.raw);
3253 _mm_stream_ps(aligned,
v.raw);
3258 _mm_stream_pd(aligned,
v.raw);
3271#if HWY_TARGET <= HWY_AVX3
3274template <
typename T,
size_t N>
3279 _mm_i32scatter_epi32(base,
offset.raw,
v.raw, 1);
3281 const __mmask8 mask = (1u <<
N) - 1;
3282 _mm_mask_i32scatter_epi32(base, mask,
offset.raw,
v.raw, 1);
3285template <
typename T,
size_t N>
3290 _mm_i32scatter_epi32(base, index.
raw,
v.raw, 4);
3292 const __mmask8 mask = (1u <<
N) - 1;
3293 _mm_mask_i32scatter_epi32(base, mask, index.
raw,
v.raw, 4);
3297template <
typename T,
size_t N>
3302 _mm_i64scatter_epi64(base,
offset.raw,
v.raw, 1);
3304 const __mmask8 mask = (1u <<
N) - 1;
3305 _mm_mask_i64scatter_epi64(base, mask,
offset.raw,
v.raw, 1);
3308template <
typename T,
size_t N>
3313 _mm_i64scatter_epi64(base, index.
raw,
v.raw, 8);
3315 const __mmask8 mask = (1u <<
N) - 1;
3316 _mm_mask_i64scatter_epi64(base, mask, index.
raw,
v.raw, 8);
3322template <
typename T,
size_t N,
typename Offset>
3326 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
3329template <
typename T,
size_t N,
typename Index>
3331 const Vec128<Index, N> index) {
3332 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
3341 _mm_i32scatter_ps(base,
offset.raw,
v.raw, 1);
3343 const __mmask8 mask = (1u <<
N) - 1;
3344 _mm_mask_i32scatter_ps(base, mask,
offset.raw,
v.raw, 1);
3352 _mm_i32scatter_ps(base, index.
raw,
v.raw, 4);
3354 const __mmask8 mask = (1u <<
N) - 1;
3355 _mm_mask_i32scatter_ps(base, mask, index.
raw,
v.raw, 4);
3364 _mm_i64scatter_pd(base,
offset.raw,
v.raw, 1);
3366 const __mmask8 mask = (1u <<
N) - 1;
3367 _mm_mask_i64scatter_pd(base, mask,
offset.raw,
v.raw, 1);
3375 _mm_i64scatter_pd(base, index.
raw,
v.raw, 8);
3377 const __mmask8 mask = (1u <<
N) - 1;
3378 _mm_mask_i64scatter_pd(base, mask, index.
raw,
v.raw, 8);
3383template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
3386 const Vec128<Offset, N>
offset) {
3387 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
3389 alignas(16) T lanes[
N];
3392 alignas(16) Offset offset_lanes[
N];
3395 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
3396 for (
size_t i = 0; i <
N; ++i) {
3401template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
3403 const Vec128<Index, N> index) {
3404 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
3406 alignas(16) T lanes[
N];
3409 alignas(16) Index index_lanes[
N];
3410 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
3412 for (
size_t i = 0; i <
N; ++i) {
3413 base[index_lanes[i]] = lanes[i];
3421#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3423template <
typename T,
size_t N,
typename Offset>
3426 const Vec128<Offset, N>
offset) {
3427 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
3429 alignas(16) Offset offset_lanes[
N];
3432 alignas(16) T lanes[
N];
3433 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
3434 for (
size_t i = 0; i <
N; ++i) {
3437 return Load(
d, lanes);
3440template <
typename T,
size_t N,
typename Index>
3443 const Vec128<Index, N> index) {
3444 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
3446 alignas(16) Index index_lanes[
N];
3447 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
3449 alignas(16) T lanes[
N];
3450 for (
size_t i = 0; i <
N; ++i) {
3451 lanes[i] = base[index_lanes[i]];
3453 return Load(
d, lanes);
3460template <
typename T,
size_t N>
3464 const Vec128<int32_t, N>
offset) {
3465 return Vec128<T, N>{_mm_i32gather_epi32(
3466 reinterpret_cast<const int32_t*
>(base),
offset.raw, 1)};
3468template <
typename T,
size_t N>
3472 const Vec128<int32_t, N> index) {
3473 return Vec128<T, N>{_mm_i32gather_epi32(
3474 reinterpret_cast<const int32_t*
>(base), index.raw, 4)};
3477template <
typename T,
size_t N>
3481 const Vec128<int64_t, N>
offset) {
3482 return Vec128<T, N>{_mm_i64gather_epi64(
3485template <
typename T,
size_t N>
3489 const Vec128<int64_t, N> index) {
3490 return Vec128<T, N>{_mm_i64gather_epi64(
3491 reinterpret_cast<const GatherIndex64*
>(base), index.raw, 8)};
3496template <
typename T,
size_t N,
typename Offset>
3498 const Vec128<Offset, N>
offset) {
3501template <
typename T,
size_t N,
typename Index>
3503 const Vec128<Index, N> index) {
3510 const Vec128<int32_t, N>
offset) {
3511 return Vec128<float, N>{_mm_i32gather_ps(base,
offset.raw, 1)};
3516 const Vec128<int32_t, N> index) {
3517 return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
3523 const Vec128<int64_t, N>
offset) {
3524 return Vec128<double, N>{_mm_i64gather_pd(base,
offset.raw, 1)};
3529 const Vec128<int64_t, N> index) {
3530 return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
3542template <
typename T,
size_t N>
3545 return Vec128<T,
N / 2>{
v.raw};
3548template <
typename T,
size_t N>
3555template <
int kBytes,
typename T,
size_t N>
3557 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
3558 return Vec128<T, N>{_mm_slli_si128(
v.raw, kBytes)};
3561template <
int kBytes,
typename T,
size_t N>
3568template <
int kLanes,
typename T,
size_t N>
3574template <
int kLanes,
typename T,
size_t N>
3580template <
int kBytes,
typename T,
size_t N>
3582 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
3584 if (
N != 16 /
sizeof(T)) {
3585 const Vec128<T> vfull{
v.raw};
3588 return Vec128<T, N>{_mm_srli_si128(
v.raw, kBytes)};
3592template <
int kLanes,
typename T,
size_t N>
3601template <
typename T>
3603 return Vec64<T>{_mm_unpackhi_epi64(
v.raw,
v.raw)};
3606 return Vec128<float, 2>{_mm_movehl_ps(
v.raw,
v.raw)};
3613template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3620 return Vec128<T, (
N + 1) / 2>{upper.raw};
3627template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
3629 static_assert(kLane <
N,
"Lane index out of bounds");
3630#if HWY_TARGET == HWY_SSSE3
3631 const int pair = _mm_extract_epi16(
v.raw, kLane / 2);
3632 constexpr int kShift = kLane & 1 ? 8 : 0;
3633 return static_cast<T
>((pair >> kShift) & 0xFF);
3635 return static_cast<T
>(_mm_extract_epi8(
v.raw, kLane) & 0xFF);
3639template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3641 static_assert(kLane <
N,
"Lane index out of bounds");
3642 return static_cast<T
>(_mm_extract_epi16(
v.raw, kLane) & 0xFFFF);
3645template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3647 static_assert(kLane <
N,
"Lane index out of bounds");
3648#if HWY_TARGET == HWY_SSSE3
3649 alignas(16) T lanes[4];
3651 return lanes[kLane];
3653 return static_cast<T
>(_mm_extract_epi32(
v.raw, kLane));
3657template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3659 static_assert(kLane <
N,
"Lane index out of bounds");
3660#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
3661 alignas(16) T lanes[2];
3663 return lanes[kLane];
3665 return static_cast<T
>(_mm_extract_epi64(
v.raw, kLane));
3669template <
size_t kLane,
size_t N>
3671 static_assert(kLane <
N,
"Lane index out of bounds");
3672#if HWY_TARGET == HWY_SSSE3
3673 alignas(16)
float lanes[4];
3675 return lanes[kLane];
3678 const int32_t
bits = _mm_extract_ps(
v.raw, kLane);
3686template <
size_t kLane>
3688 static_assert(kLane == 0,
"Lane index out of bounds");
3692template <
size_t kLane>
3694 static_assert(kLane < 2,
"Lane index out of bounds");
3703template <
typename T>
3710template <
typename T>
3712#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3713 if (__builtin_constant_p(i)) {
3722 alignas(16) T lanes[2];
3727template <
typename T>
3729#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3730 if (__builtin_constant_p(i)) {
3743 alignas(16) T lanes[4];
3748template <
typename T>
3750#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3751 if (__builtin_constant_p(i)) {
3772 alignas(16) T lanes[8];
3777template <
typename T>
3779#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3780 if (__builtin_constant_p(i)) {
3817 alignas(16) T lanes[16];
3826template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
3828 static_assert(kLane <
N,
"Lane index out of bounds");
3829#if HWY_TARGET == HWY_SSSE3
3831 alignas(16) T lanes[16];
3834 return Load(
d, lanes);
3836 return Vec128<T, N>{_mm_insert_epi8(
v.raw, t, kLane)};
3840template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3842 static_assert(kLane <
N,
"Lane index out of bounds");
3843 return Vec128<T, N>{_mm_insert_epi16(
v.raw, t, kLane)};
3846template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3848 static_assert(kLane <
N,
"Lane index out of bounds");
3849#if HWY_TARGET == HWY_SSSE3
3850 alignas(16) T lanes[4];
3854 return Load(
d, lanes);
3858 return Vec128<T, N>{_mm_insert_epi32(
v.raw, ti, kLane)};
3862template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3864 static_assert(kLane <
N,
"Lane index out of bounds");
3865#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
3867 alignas(16) T lanes[2];
3870 return Load(
d, lanes);
3874 return Vec128<T, N>{_mm_insert_epi64(
v.raw, ti, kLane)};
3878template <
size_t kLane,
size_t N>
3880 static_assert(kLane <
N,
"Lane index out of bounds");
3881#if HWY_TARGET == HWY_SSSE3
3883 alignas(16)
float lanes[4];
3886 return Load(
d, lanes);
3888 return Vec128<float, N>{_mm_insert_ps(
v.raw, _mm_set_ss(t), kLane << 4)};
3893template <
size_t kLane>
3895 static_assert(kLane == 0,
"Lane index out of bounds");
3899template <
size_t kLane>
3901 static_assert(kLane < 2,
"Lane index out of bounds");
3915template <
typename T>
3922template <
typename T>
3924#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3925 if (__builtin_constant_p(i)) {
3935 alignas(16) T lanes[2];
3938 return Load(
d, lanes);
3941template <
typename T>
3943#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3944 if (__builtin_constant_p(i)) {
3958 alignas(16) T lanes[4];
3961 return Load(
d, lanes);
3964template <
typename T>
3966#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3967 if (__builtin_constant_p(i)) {
3989 alignas(16) T lanes[8];
3992 return Load(
d, lanes);
3995template <
typename T>
3997#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
3998 if (__builtin_constant_p(i)) {
4036 alignas(16) T lanes[16];
4039 return Load(
d, lanes);
4044template <
int kBytes,
typename T,
class V = Vec128<T>>
4047 return BitCast(
d, Vec128<uint8_t>{_mm_alignr_epi8(
4051template <
int kBytes,
typename T,
size_t N,
HWY_IF_LE64(T,
N),
4052 class V = Vec128<T, N>>
4054 constexpr size_t kSize =
N *
sizeof(T);
4055 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
4058 using V8 =
VFromD<
decltype(d_full8)>;
4059 const V8 hi8{
BitCast(d8, hi).raw};
4069template <
int kLane,
size_t N>
4071 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
4073 const __m128i lo = _mm_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
4076 const __m128i hi = _mm_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
4080template <
int kLane,
size_t N>
4082 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
4085template <
int kLane,
size_t N>
4087 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
4092template <
int kLane,
size_t N>
4094 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
4096 const __m128i lo = _mm_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
4099 const __m128i hi = _mm_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
4103template <
int kLane,
size_t N>
4105 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
4108template <
int kLane,
size_t N>
4110 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
4115template <
int kLane,
size_t N>
4117 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
4120template <
int kLane,
size_t N>
4122 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
4129template <
typename T,
size_t N = 16 /
sizeof(T)>
4137 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
4138#if HWY_IS_DEBUG_BUILD
4139 const Rebind<TI,
decltype(
d)> di;
4144#if HWY_TARGET <= HWY_AVX2
4149 using V8 =
VFromD<
decltype(d8)>;
4150 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
4151 0, 1, 2, 3, 0, 1, 2, 3};
4154 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
4155 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
4169 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
4170#if HWY_IS_DEBUG_BUILD
4171 const Rebind<TI,
decltype(
d)> di;
4173 AllTrue(di, Lt(vec,
Set(di,
static_cast<TI
>(
N)))));
4179 return Indices128<T, N>{vec.raw};
4182template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
4184 const Rebind<TI,
decltype(
d)> di;
4188template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4190#if HWY_TARGET <= HWY_AVX2
4200template <
size_t N, HWY_IF_GE64(
float, N)>
4203#if HWY_TARGET <= HWY_AVX2
4214template <
typename T>
4220template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4224#if HWY_TARGET <= HWY_AVX2
4244#if HWY_TARGET <= HWY_AVX2
4262template <
typename T>
4270template <
typename T>
4276template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4281template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4287template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4293template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4295#if HWY_TARGET <= HWY_AVX3
4296 if (
N == 1)
return v;
4302 alignas(16)
constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
4303 const Vec128<int16_t, N> idx =
Load(di, kReverse + (
N == 8 ? 0 : 4));
4304 return BitCast(
d, Vec128<int16_t, N>{
4305 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
4315template <
typename T>
4320template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
4322 alignas(16)
const T kShuffle[16] = {1, 0, 3, 2, 5, 4, 7, 6,
4323 9, 8, 11, 10, 13, 12, 15, 14};
4327template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4333template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4338template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4345template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4350 return BitCast(
d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
4351 BitCast(di,
v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
4354#if HWY_TARGET <= HWY_AVX3
4355 alignas(16)
constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
4356 const Vec128<int16_t, N> idx =
Load(di, kReverse4);
4357 return BitCast(
d, Vec128<int16_t, N>{
4358 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
4366template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4371template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4378template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4380#if HWY_TARGET <= HWY_AVX3
4382 alignas(32)
constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
4383 15, 14, 13, 12, 11, 10, 9, 8};
4384 const Vec128<int16_t, N> idx =
Load(di, kReverse8);
4385 return BitCast(
d, Vec128<int16_t, N>{
4386 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
4393template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4404template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
4409template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
4414template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
4419template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
4425template <
size_t N, HWY_IF_LE128(
int8_t, N)>
4430template <
size_t N, HWY_IF_LE128(
int16_t, N)>
4435template <
size_t N, HWY_IF_LE128(
int32_t, N)>
4440template <
size_t N, HWY_IF_LE128(
int64_t, N)>
4446template <
size_t N, HWY_IF_LE128(
float, N)>
4448 const Vec128<float, N> b) {
4449 return Vec128<float, N>{_mm_unpacklo_ps(
a.raw, b.raw)};
4451template <
size_t N, HWY_IF_LE128(
double, N)>
4503 const Vec128<float> b) {
4504 return Vec128<float>{_mm_unpackhi_ps(
a.raw, b.raw)};
4514template <
typename T,
class V = Vec128<T>>
4520template <
typename T,
size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
4522 const Half<
decltype(
d)> d2;
4530template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4534template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4539template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4549template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4550HWY_API Vec128<T, N>
Combine(Simd<T, N, 0>
d, Vec128<T, N / 2> hi_half,
4551 Vec128<T, N / 2> lo_half) {
4552 const Half<
decltype(
d)> d2;
4556 const VU lo{
BitCast(du2, lo_half).raw};
4557 const VU hi{
BitCast(du2, hi_half).raw};
4566template <
typename T>
4572template <
typename T>
4581template <
typename T>
4586template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4594template <
typename T>
4601template <
typename T>
4608template <
typename T>
4610 const Vec128<T> lo) {
4615template <
typename T>
4618#if HWY_TARGET == HWY_SSSE3
4621 _MM_SHUFFLE2(1, 0))});
4630#if HWY_TARGET == HWY_SSSE3
4642#if HWY_TARGET == HWY_SSSE3
4652template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4655 const Half<
decltype(
d)> d2;
4659template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4662 const Half<
decltype(
d)> d2;
4666template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4668 const Vec128<T, N> lo) {
4669 const Half<
decltype(
d)> d2;
4673template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4676 const Half<
decltype(
d)> d2;
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
                                      _MM_SHUFFLE(3, 1, 3, 1))});
template <typename T>
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  const Vec128<uint16_t> mask = Set(dw, 0x00FF);
  const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
  const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
  return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_SSE4
  const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
  const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
  const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
  return Vec128<T>{_mm_packus_epi32(uL.raw, uH.raw)};
  alignas(16) const T kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
  const Vec128<T> shuf = BitCast(d, Load(d, kCompactEvenU16));
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
                                      _MM_SHUFFLE(2, 0, 2, 0))});
template <typename T>
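// --- Usage sketch (illustrative, not part of this header): ConcatEven and
// ConcatOdd deinterleave adjacent lanes. A minimal example, assuming the
// usual hwy/highway.h + foreach_target setup, inside HWY_NAMESPACE scope:
//   const Full128<uint16_t> d;
//   const auto v = Iota(d, 0);               // 0 1 2 3 4 5 6 7
//   const auto even = ConcatEven(d, v, v);   // 0 2 4 6 | 0 2 4 6
//   const auto odd  = ConcatOdd(d, v, v);    // 1 3 5 7 | 1 3 5 7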
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
      _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
      _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET == HWY_SSSE3
  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
#if HWY_TARGET == HWY_SSSE3
  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
#if HWY_TARGET == HWY_SSSE3
      d, Vec128<double, N>{_mm_shuffle_pd(
HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
template <typename T, size_t N>
template <typename T, size_t N>
#if HWY_TARGET > HWY_AVX3
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  const Rebind<float, decltype(dw)> df;
  const auto zero = Zero(d);
  const auto upper = exp + Set(d, 0x3F80);
  const auto f0 = ZipLower(dw, zero, upper);
  const auto f1 = ZipUpper(dw, zero, upper);
  const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
  const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
  return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  const auto f = exp + Set(d, 0x3F800000);
  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
template <typename T, size_t N>
template <typename T, size_t N>
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  const auto mul = detail::Pow2(Set(d32, 32) - bits);
                                         const Vec128<uint64_t> bits) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
#if HWY_TARGET > HWY_AVX3
template <class DI, class V>
HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
  const auto count = BitCast(du, count_i);
  const auto abs = BitCast(du, v ^ sign);
  return BitCast(di, abs >> count) ^ sign;
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
                                     const Vec128<uint64_t> b) {
  alignas(16) uint64_t mul[2];
                                    const Vec128<uint64_t> b) {
  alignas(16) uint64_t mul[2];
template <class V, size_t N, class D16 = Simd<bfloat16_t, 2 * N, 0>>
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
    Simd<int32_t, N, 0>, Vec128<int16_t, 2 * N> a,
    Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
    Vec128<int32_t, N>&) {
  return sum0 + Vec128<int32_t, N>{_mm_madd_epi16(a.raw, b.raw)};
    Vec128<int32_t, N>) {
  return Add(sum0, sum1);
#if HWY_TARGET == HWY_SSSE3
  const __m128i zero = _mm_setzero_si128();
                                        const Vec128<uint16_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
  return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
#if HWY_TARGET == HWY_SSSE3
#if HWY_TARGET == HWY_SSSE3
  const __m128i zero = _mm_setzero_si128();
  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
                                       const Vec128<int8_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
  return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
                                       const Vec128<int16_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
  return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
                                       const Vec128<int32_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
  return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
                                       const Vec128<int8_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
  return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
#define HWY_INLINE_F16 HWY_NOINLINE
#define HWY_INLINE_F16 HWY_INLINE
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
      Set(df32, 1.0f / 16384 / 1024));
  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
                                           const Vec128<bfloat16_t, N> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
                                        const Vec128<int32_t, N> v) {
#if HWY_TARGET == HWY_SSSE3
  const Simd<int32_t, N, 0> di32;
  const Simd<uint16_t, N * 2, 0> du16;
  const auto clamped = Or(zero_if_neg, too_big);
  alignas(16) constexpr uint16_t kLower2Bytes[16] = {
      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
  const auto lo2 = Load(du16, kLower2Bytes);
  return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
                                       const Vec128<int32_t, N> v) {
  return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
                                       const Vec128<int32_t, N> v) {
  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
  return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
                                       const Vec128<int16_t, N> v) {
  return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
                                      const Vec128<int32_t, N> v) {
  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
  return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
                                      const Vec128<int16_t, N> v) {
  return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
                                         const Vec128<float, N> v) {
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
  const Rebind<uint32_t, decltype(df16)> du;
  const auto bits32 = BitCast(du, v);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);
  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
  return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
                                            const Vec128<float, N> v) {
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Rebind<uint16_t, decltype(dbf16)> du16;
    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
  const Repartition<uint32_t, decltype(dbf16)> du32;
                                              Vec128<int32_t, 1> a,
                                              Vec128<int32_t, 1> b) {
  const Half<decltype(dn)> dnh;
  const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
  const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
                                              Vec128<int32_t, 2> a,
                                              Vec128<int32_t, 2> b) {
  const Half<decltype(dn)> dnh;
  const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
  const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
                                           Vec128<int32_t> a,
                                           Vec128<int32_t> b) {
  return Vec128<int16_t>{_mm_packs_epi32(a.raw, b.raw)};
                                      const Vec128<double, N> v) {
  return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
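// --- Usage sketch (illustrative, not part of this header): DemoteTo narrows
// with saturation; chaining two steps takes i32 lanes down to u8. Assumes
// HWY_NAMESPACE scope:
//   const Full128<int32_t> d32;
//   const Rebind<int16_t, decltype(d32)> d16;
//   const Rebind<uint8_t, decltype(d32)> d8;
//   const auto v16 = DemoteTo(d16, Set(d32, 70000));  // saturates to 32767
//   const auto v8  = DemoteTo(d8, v16);               // saturates to 255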
    -> decltype(Zero(d)) {
  return Min(v, Set(d, 2147483647.0));
template <class DI, class DF = RebindToFloat<DI>>
    decltype(Zero(di).raw) converted_raw)
  const auto converted = VFromD<DI>{converted_raw};
  const auto sign_wrong = AndNot(BitCast(di, original), converted);
#if HWY_COMPILER_GCC_ACTUAL
                                      const Vec128<double, N> v) {
  return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
  const Simd<uint32_t, N, 0> d32;
  const Simd<uint8_t, N * 4, 0> d8;
  alignas(16) static constexpr uint32_t k8From32[4] = {
      0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
template <typename From, typename To,
                                       const Vec128<uint64_t, 2> v) {
  alignas(16) static constexpr uint8_t kMap[16] = {0, 8, 0, 8, 0, 8, 0, 8,
                                                   0, 8, 0, 8, 0, 8, 0, 8};
                                        const Vec128<uint64_t, 2> v) {
  alignas(16) static constexpr uint16_t kMap[8] = {
      0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
                                        const Vec128<uint64_t, 2> v) {
  return Vec128<uint32_t, 2>{_mm_shuffle_epi32(v.raw, 0x88)};
template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
                                       const Vec128<uint32_t, N> v) {
  alignas(16) static constexpr uint8_t kMap[16] = {
      0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
      0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
                                        const Vec128<uint32_t, N> v) {
template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
                                       const Vec128<uint16_t, N> v) {
                                    const Vec128<int32_t, N> v) {
  return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
#if HWY_TARGET <= HWY_AVX3
  const auto msk_lo = Set(du32, 0xFFFF);
  const auto cnst2_16_flt = Set(df, 65536.0f);
#if HWY_TARGET <= HWY_AVX3
  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
  const auto k52 = Set(d32, 0x43300000);
  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
  return (v_upper - k84_63_52) + v_lower;
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(d64)>;
  const VU msk_lo = Set(d64, 0xFFFFFFFF);
  const auto cnst2_32_dbl = Set(dd, 4294967296.0);
  const VU v_lo = And(v, msk_lo);
  auto uint64_to_double128_fast = [&dd](VU w) HWY_ATTR {
    return BitCast(dd, w) - Set(dd, 0x0010000000000000);
  };
  const auto v_lo_dbl = uint64_to_double128_fast(v_lo);
  return MulAdd(cnst2_32_dbl, uint64_to_double128_fast(v_hi), v_lo_dbl);
                                      const Vec128<float, N> v) {
#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
#elif HWY_ARCH_X86_64
  const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
  const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
  using VI = VFromD<decltype(di)>;
  const VI k0 = Zero(di);
  const VI k1 = Set(di, 1);
  const VI k51 = Set(di, 51);
  const VI exp = biased_exp - Set(di, 0x3FF);
  const auto in_range = exp < Set(di, 63);
  const VI shift_mnt = Max(k51 - exp, k0);
  const VI shift_int = Max(exp - k51, k0);
  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
  const VI shifted = int52 << shift_int;
  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
  const VI magnitude = IfThenElse(in_range, restored, limit);
  return (magnitude ^ sign_mask) - sign_mask;
#if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
  const Simd<int32_t, N, 0> di;
#if HWY_TARGET == HWY_SSSE3
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only for float");
  const Simd<T, N, 0> df;
  const auto added = large + v;
  const auto rounded = added - large;
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only for float");
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only for float");
  const Simd<T, N, 0> df;
  const auto int_f = ConvertTo(df, integer);
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only for float");
  const auto int_f = ConvertTo(df, integer);
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only for float");
  const auto int_f = ConvertTo(df, integer);
  return Vec128<float, N>{
      _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
  return Vec128<double, N>{
      _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
  return Vec128<float, N>{
      _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
  return Vec128<double, N>{
      _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{
      _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
  return Vec128<double, N>{
      _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
  return Vec128<float, N>{
      _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
  return Vec128<double, N>{
      _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
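// --- Usage sketch (illustrative, not part of this header): the four rounding
// modes. SSE4+ uses _mm_round_ps/pd directly; the SSSE3 fallback above adds
// and subtracts a large power of two instead. In HWY_NAMESPACE scope:
//   const Full128<float> d;
//   const auto v = Set(d, -1.5f);
//   Round(v);  // -2 (round half to even)
//   Trunc(v);  // -1
//   Ceil(v);   // -1
//   Floor(v);  // -2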
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only for float");
  const Simd<T, N, 0> d;
template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only for float");
  const Simd<T, N, 0> d;
  const VFromD<decltype(di)> exp =
#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#define HWY_NATIVE_AES
HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
}
HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
                                     Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
}
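// --- Usage sketch (illustrative, not part of this header): AESRound performs
// one encryption round (ShiftRows/SubBytes/MixColumns, then XOR round key),
// matching _mm_aesenc_si128; AESLastRound omits MixColumns. For example:
//   Vec128<uint8_t> state = ...;  // 16-byte AES state
//   state = AESRound(state, round_key1);
//   state = AESLastRound(state, round_key_final);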
#if HWY_TARGET > HWY_AVX3
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                             1, 1, 1, 1, 1, 1, 1, 1};
  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
template <typename T, size_t N, HWY_IF_LE128(T, N)>
#if HWY_TARGET <= HWY_AVX3
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  mask_bits &= (1ull << N) - 1;
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  mask_bits &= (1ull << N) - 1;
template <typename T>
#if HWY_TARGET <= HWY_AVX3
  enum { value = (sizeof(T) == 8) };
  enum { value = (sizeof(T) != 1) };
#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N>
  constexpr size_t kNumBytes = (N + 7) / 8;
  const int mask_bits = (1 << N) - 1;
  bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
template <typename T, size_t N>
                          const Mask128<T, N> mask) {
  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
template <typename T, size_t N>
                          const Mask128<T, N> mask) {
  const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
template <typename T, size_t N>
                          const Mask128<T, N> mask) {
  const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
template <typename T, size_t N>
HWY_API bool AllFalse(const Simd<T, N, 0>, const Mask128<T, N> mask) {
  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
  return mask_bits == 0;
template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N, 0>, const Mask128<T, N> mask) {
  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
  return mask_bits == (1u << N) - 1;
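// --- Usage sketch (illustrative, not part of this header): masks round-trip
// through a packed bit representation (bit i = lane i). In HWY_NAMESPACE
// scope:
//   const Full128<uint32_t> d;
//   const auto m = FirstN(d, 2);                       // lanes 0,1 true
//   uint8_t bits[1];
//   const size_t written = StoreMaskBits(d, m, bits);  // bits[0] == 0x03
//   const auto m2 = LoadMaskBits(d, bits);             // same mask again
//   HWY_DASSERT(written == 1 && CountTrue(d, m2) == 2);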
template <typename T>
template <size_t N, HWY_IF_GE64(float, N)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(16) constexpr uint8_t u8_indices[64] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const auto index = Load(d8, u8_indices + 16 * mask.raw);
template <typename T>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(16) constexpr uint64_t packed_array[16] = {0x00000010, 0x00000001,
                                                     0x00000010, 0x00000010};
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(16) constexpr uint64_t shifts[2] = {0, 4};
  const auto indices = Indices128<T>{(packed >> Load(du64, shifts)).raw};
                                            Mask128<uint64_t>) {
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
template <size_t N, HWY_IF_LE128(float, N)>
  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
template <size_t N, HWY_IF_LE128(double, N)>
  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
  if (N != 16 / sizeof(T)) {
  const Vec128<T, N> compressed = Compress(v, m);
#if HWY_MEM_OPS_MIGHT_FAULT
  alignas(16) T buf[N];
  memcpy(unaligned, buf, count * sizeof(T));
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
template <typename T, size_t N>
                                 const Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  return U64FromInt(_mm_movemask_epi8(sign_bits));
template <typename T, size_t N>
                                 const Mask128<T, N> mask) {
  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
  return U64FromInt(_mm_movemask_epi8(sign_bits));
template <typename T, size_t N>
                                 const Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const Simd<float, N, 0> df;
  return U64FromInt(_mm_movemask_ps(sign_bits.raw));
template <typename T, size_t N>
                                 const Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const Simd<double, N, 0> df;
  return U64FromInt(_mm_movemask_pd(sign_bits.raw));
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t mask_bits) {
  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
template <typename T, size_t N>
template <typename T, size_t N>
                              const Mask128<T, N> mask, uint8_t* bits) {
  constexpr size_t kNumBytes = (N + 7) / 8;
template <typename T, size_t N>
HWY_API bool AllFalse(const Simd<T, N, 0>, const Mask128<T, N> mask) {
template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N, 0>, const Mask128<T, N> mask) {
  constexpr uint64_t kAllBits =
template <typename T, size_t N>
                         const Mask128<T, N> mask) {
template <typename T, size_t N>
                         const Mask128<T, N> mask) {
template <typename T, size_t N>
                                 const Mask128<T, N> mask) {
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
  const Rebind<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N, 0> du;
  alignas(16) constexpr uint8_t table[2048] = {
      0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
      2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
      2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
      4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
      2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
      2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
      4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
      2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
      6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
      2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
      4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
      2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
      2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
      4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
      2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
      6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
      2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
      4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
      2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
      8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
      2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
      4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
      2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
      6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
      2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
      4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
      2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
      2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
      4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
      2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
      6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
      2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
      4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
      2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
      8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
      2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
      4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
      2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
      6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
      2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
      4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
      2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
      10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
      2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
      4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
      2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
      6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
      2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
      4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
      2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
      8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
      2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
      4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
      2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
      6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
      2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
      4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
      2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
      14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
      2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
      4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
      2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
      6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
      2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
      4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
      2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
      8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
      2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
      4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
      2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
      6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
      2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
      4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
      2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
      10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
      2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
      4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
      2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
      6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
      2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
      4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
      2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
      8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
      2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
      4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
      2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
      6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
      2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
      4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
      2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
      12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
      2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
      4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
      2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
      6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
      2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
      4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
      2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
      8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
      2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
      4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
      2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
      6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
      2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
      4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
      2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
      10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
      2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
      4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
      2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
      6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
      2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
      4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
      2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
      8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
      2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
      4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
      2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
      6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
      2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
      4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
      2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
                                           uint64_t mask_bits) {
  const Rebind<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N, 0> du;
  alignas(16) constexpr uint8_t table[2048] = {
      0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
      0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
      0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
      0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
      0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
      0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
      0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
      0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
      0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
      0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
      0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
      0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
      0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
      0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
      0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
      0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
      0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
      0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
      0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
      0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
      0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
      0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
      0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
      0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
      0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
      0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
      0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
      0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
      0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
      0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
      0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
      0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
      0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
      0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
      0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
      0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
      0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
      0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
      0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
      0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
      0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
      0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
      0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
      0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
      0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
      0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
      0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
      0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
      0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
      0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
      0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
      0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
      0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
      0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
      0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
      0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
      0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
      0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
      0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
      0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
      0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
      0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
      0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
      0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
      0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
      0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
      0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
      0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
      0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
      0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
      0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
      0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
      0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
      0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
      0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
      0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
      0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
      0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
      0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
      0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
      0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
      0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
      0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
      0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
      0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
      0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
      0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
      0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
      0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
      0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
      0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
      0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
      0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
      0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
      0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
      0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
      0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
      0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
      0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
      0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
      0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
      0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
      0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
      0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
      0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
      0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
      0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
      0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
      0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
      0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
      0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
      0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
      0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
      0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
      0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
      0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
      0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
      0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
      0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
      0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
      0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
      0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
      0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
      0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
      0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
      0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
      0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
      0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
  alignas(16) constexpr uint8_t u8_indices[256] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
      0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
      0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
      4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
      4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
                                           uint64_t mask_bits) {
  alignas(16) constexpr uint8_t u8_indices[256] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
      6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
      14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
      2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
      2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
      6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
      12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
  alignas(16) constexpr uint8_t u8_indices[64] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
                                           uint64_t mask_bits) {
  alignas(16) constexpr uint8_t u8_indices[64] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
  const Simd<T, N, 0> d;
  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
  const Simd<T, N, 0> d;
  const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits));
template <typename T>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskL, maskH);
template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
template <typename T>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskH, maskL);
template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
  if (N < 16 / sizeof(T)) {
                                     Mask128<uint64_t>) {
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  mask_bits &= (1ull << N) - 1;
  return detail::CompressBits(v, mask_bits);
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
  const size_t count = PopCount(mask_bits);
  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
  StoreU(compressed, d, unaligned);
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
  const size_t count = PopCount(mask_bits);
  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  mask_bits &= (1ull << N) - 1;
  const size_t count = PopCount(mask_bits);
  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
  StoreU(compressed, d, unaligned);
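// --- Usage sketch (illustrative, not part of this header): left-packing
// selected lanes, the typical use of CompressStore in a filtering loop. In
// HWY_NAMESPACE scope:
//   HWY_ATTR size_t KeepPositive(const float* in, float* out) {
//     const Full128<float> d;
//     const auto v = LoadU(d, in);
//     return CompressStore(v, v > Zero(d), d, out);  // #lanes written
//   }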
template <typename T>
                                    const Vec128<T, 1> v) {
template <typename T>
                                    const Vec128<T, 1> v) {
template <typename T>
                                    const Vec128<T, 1> v) {
template <typename T>
                                    const Vec128<T, 2> v10) {
template <typename T>
                                    const Vec128<T, 2> v10) {
template <typename T>
                                    const Vec128<T, 2> v10) {
template <typename T>
                                 const Vec128<T> v3210) {
  const Vec128<T> v31_20_31_20 = v3210 + v1032;
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return v20_31_20_31 + v31_20_31_20;
template <typename T>
                                 const Vec128<T> v3210) {
  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Min(v20_31_20_31, v31_20_31_20);
template <typename T>
                                 const Vec128<T> v3210) {
  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
template <typename T>
                                 const Vec128<T> v10) {
template <typename T>
                                 const Vec128<T> v10) {
  return Min(v10, v01);
template <typename T>
                                 const Vec128<T> v10) {
  return Max(v10, v01);
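// --- Usage sketch (illustrative, not part of this header): SumOfLanes
// broadcasts the horizontal sum to every lane; GetLane extracts a scalar. In
// HWY_NAMESPACE scope:
//   const Full128<float> d;
//   const auto v = Iota(d, 1.0f);                 // 1 2 3 4
//   const float sum = GetLane(SumOfLanes(d, v));  // 10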
template <size_t N, HWY_IF_GE32(uint16_t, N)>
                                             Vec128<uint16_t, N> v) {
  const Simd<uint16_t, N, 0> d;
template <size_t N, HWY_IF_GE32(int16_t, N)>
                                            Vec128<int16_t, N> v) {
  const Simd<int16_t, N, 0> d;
  return Set(d, static_cast<uint8_t>(GetLane(sums) & 0xFF));
template <size_t N, HWY_IF_GE64(int8_t, N)>
  const auto is_neg = v < Zero(d);
#if HWY_TARGET <= HWY_SSE4
  using V = decltype(v);
#elif HWY_TARGET == HWY_SSSE3
template <size_t N, HWY_IF_GE64(uint8_t, N)>
template <size_t N, HWY_IF_GE64(uint8_t, N)>
                                           const Vec128<uint8_t, N> v) {
template <size_t N, HWY_IF_GE64(int8_t, N)>
  const auto mask = SignBit(du);
template <size_t N, HWY_IF_GE64(int8_t, N)>
  const auto mask = SignBit(du);
template <size_t N, HWY_IF_GE32(uint16_t, N)>
template <size_t N, HWY_IF_GE32(int16_t, N)>
                                            Vec128<int16_t, N> v) {
  const Simd<int16_t, N, 0> d;
template <size_t N, HWY_IF_GE32(uint16_t, N)>
                                             Vec128<uint16_t, N> v) {
  const Simd<uint16_t, N, 0> d;
template <size_t N, HWY_IF_GE32(int16_t, N)>
                                            Vec128<int16_t, N> v) {
  const Simd<int16_t, N, 0> d;
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <class D, class V = VFromD<D>>
  const auto eqHL = Eq(a, b);
  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
template <class D, class V = VFromD<D>>
  return And(eqHL, eqLH);
template <class D, class V = VFromD<D>>
  return Or(neHL, neLH);
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
template <class D, class V = VFromD<D>>
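// --- Usage sketch (illustrative, not part of this header): Lt128 compares
// 128-bit numbers held as u64 lane pairs (lane 1 = upper half) and returns a
// mask covering both lanes of each pair. In HWY_NAMESPACE scope:
//   HWY_ATTR bool U128Less(const uint64_t* a, const uint64_t* b) {
//     const Full128<uint64_t> d;
//     return AllTrue(d, Lt128(d, LoadU(d, a), LoadU(d, b)));
//   }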
uint8_t buf
Definition BitIO.h:84
size_t offset
Definition BitIO.h:80
uint32_t x
Definition BlockExec.h:38
uint8_t * bits
Definition TileProcessor.h:59
#define HWY_RESTRICT
Definition base.h:64
#define HWY_IF_LANE_SIZE(T, bytes)
Definition base.h:420
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:78
#define HWY_IF_LE64(T, N)
Definition base.h:407
#define HWY_API
Definition base.h:129
#define HWY_IF_LE128(T, N)
Definition base.h:406
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_INLINE
Definition base.h:70
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:79
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_MAYBE_UNUSED
Definition base.h:82
#define HWY_ASSERT(condition)
Definition base.h:192
Definition x86_128-inl.h:137
Raw raw
Definition arm_neon-inl.h:835
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition arm_neon-inl.h:827
static Mask128< T, N > FromBits(uint64_t mask_bits)
Definition x86_128-inl.h:140
Definition x86_128-inl.h:70
T PrivateT
Definition arm_neon-inl.h:782
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition x86_128-inl.h:82
Raw raw
Definition arm_neon-inl.h:814
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition x86_128-inl.h:88
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition x86_128-inl.h:97
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition x86_128-inl.h:94
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition x86_128-inl.h:79
typename detail::Raw128< T, N >::type Raw
Definition arm_neon-inl.h:779
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition x86_128-inl.h:91
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition x86_128-inl.h:85
#define HWY_AVX3_DL
Definition detect_targets.h:65
#define HWY_TARGET
Definition detect_targets.h:380
uint32_t a
only used by MQ decoder
Definition mqc.h:48
HWY_API Vec128< T, N > Neg(hwy::NonFloatTag, Vec128< T, N > v)
Definition emu128-inl.h:726
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2413
HWY_INLINE void MaybeUnpoison(T *HWY_RESTRICT unaligned, size_t count)
Definition x86_128-inl.h:648
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition x86_128-inl.h:718
HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b)
Definition x86_128-inl.h:7355
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5447
HWY_INLINE Vec128< T, N > Abs(SignedTag, Vec128< T, N > a)
Definition emu128-inl.h:633
HWY_INLINE auto FixConversionOverflow(DI di, VFromD< DF > original, decltype(Zero(di).raw) converted_raw) -> VFromD< DI >
Definition x86_128-inl.h:5571
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2451
HWY_INLINE auto ClampF64ToI32Max(Simd< double, N, 0 > d, decltype(Zero(d)) v) -> decltype(Zero(d))
Definition x86_128-inl.h:5560
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition x86_128-inl.h:1570
HWY_INLINE Mask128< T, N > ExclusiveNeither(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:963
HWY_API void ScalarMaskedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition x86_128-inl.h:2205
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:535
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:815
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition x86_256-inl.h:2612
HWY_API Vec128< uint16_t, N > Shl(hwy::UnsignedTag, Vec128< uint16_t, N > v, Vec128< uint16_t, N > bits)
Definition x86_128-inl.h:5009
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition x86_128-inl.h:3286
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition wasm_128-inl.h:1688
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:3023
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition wasm_128-inl.h:1844
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition x86_128-inl.h:3275
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition wasm_128-inl.h:130
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:888
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:663
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5063
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:889
HWY_INLINE Vec128< T, 1 > SumOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5058
HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b)
Definition x86_128-inl.h:7363
HWY_INLINE Vec128< T, N > Sub(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:545
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:852
HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b)
Definition x86_128-inl.h:7347
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3418
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:861
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:3051
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2432
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition arm_neon-inl.h:2080
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:671
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition x86_128-inl.h:670
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition x86_128-inl.h:760
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MaxU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition x86_128-inl.h:3147
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:2990
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5068
HWY_INLINE Vec128< T > ZeroExtendVector(hwy::NonFloatTag, Full128< T >, Vec64< T > lo)
Definition x86_128-inl.h:4567
constexpr uint64_t OnlyActive(uint64_t bits)
Definition arm_neon-inl.h:5589
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:3038
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4235
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition arm_neon-inl.h:5364
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition x86_256-inl.h:2604
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:926
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:1406
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MinU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition x86_128-inl.h:3051
static bool SignBit(float f)
Definition scalar-inl.h:601
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
Simd< T, 16/sizeof(T), 0 > Full128
Definition emu128-inl.h:31
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
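FirstN builds a mask whose first num lanes are active, which combines with MaskedLoad and BlendedStore (both listed here) to process a partial final vector. A sketch under our own assumptions (hypothetical function AddOneToTail; remaining < Lanes(d)):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Add 1.0f to the trailing `remaining` elements starting at x + i.
void AddOneToTail(float* HWY_RESTRICT x, size_t i, size_t remaining) {
  const hn::ScalableTag<float> d;
  const auto m = hn::FirstN(d, remaining);       // first `remaining` lanes active
  const auto v = hn::MaskedLoad(m, d, x + i);    // inactive lanes read as zero
  hn::BlendedStore(v + hn::Set(d, 1.0f), m, d, x + i);  // writes active lanes only
}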
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
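MulAdd computes mul * x + add per lane (fused where the target supports it). A minimal axpy-style sketch; the name Axpy and the divisibility assumption are ours:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// y[i] += a * x[i]; assumes n % Lanes(d) == 0 (no tail handling).
void Axpy(float a, const float* HWY_RESTRICT x, float* HWY_RESTRICT y, size_t n) {
  const hn::ScalableTag<float> d;
  const auto va = hn::Set(d, a);
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::Store(hn::MulAdd(va, hn::Load(d, x + i), hn::Load(d, y + i)),
              d, y + i);
  }
}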
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4453
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2137
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
HWY_API Vec128< T, N > ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
Vec128< T, 4/sizeof(T)> Vec32
Definition arm_neon-inl.h:821
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
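ReorderWidenMulAccumulate widens bfloat16 lane pairs, multiplies, and accumulates into two float vectors whose lane order is unspecified; because the total across both accumulators is order-independent, a full reduction is still safe. A sketch under our assumptions (hypothetical name DotBF16; n a multiple of the bf16 vector length):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Dot product of two bfloat16 arrays, accumulated in float.
float DotBF16(const hwy::bfloat16_t* HWY_RESTRICT a,
              const hwy::bfloat16_t* HWY_RESTRICT b, size_t n) {
  const hn::ScalableTag<float> df32;
  const hn::Repartition<hwy::bfloat16_t, decltype(df32)> dbf16;
  auto sum0 = hn::Zero(df32);
  auto sum1 = hn::Zero(df32);  // updated via reference parameter
  for (size_t i = 0; i < n; i += hn::Lanes(dbf16)) {
    sum0 = hn::ReorderWidenMulAccumulate(df32, hn::Load(dbf16, a + i),
                                         hn::Load(dbf16, b + i), sum0, sum1);
  }
  return hn::GetLane(hn::SumOfLanes(df32, sum0 + sum1));
}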
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
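CountTrue returns how many mask lanes are active, pairing naturally with the comparison operators listed here. A small sketch (the name NumAbove is ours; Full128<float> fixes the vector at four lanes):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// How many of the four float lanes at p exceed `threshold`?
size_t NumAbove(const float* HWY_RESTRICT p, float threshold) {
  const hn::Full128<float> d;
  const auto v = hn::LoadU(d, p);  // unaligned load of 4 floats
  return hn::CountTrue(d, v > hn::Set(d, threshold));
}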
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition x86_256-inl.h:4417
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
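IfThenElse selects per lane between two vectors according to a mask, giving branch-free conditionals. A minimal ReLU sketch (the name ReluInPlace and the divisibility assumption are ours); ZeroIfNegative, also listed here, would express the same in one op:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// In-place ReLU: negative lanes become zero. Assumes n % Lanes(d) == 0.
void ReluInPlace(float* HWY_RESTRICT x, size_t n) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, x + i);
    const auto neg = v < hn::Zero(d);  // per-lane comparison mask
    hn::Store(hn::IfThenElse(neg, hn::Zero(d), v), d, x + i);
  }
}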
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
HWY_API Vec128< T, N > ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
Vec128< T, 8/sizeof(T)> Vec64
Definition arm_neon-inl.h:818
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
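Iota fills lanes with first, first+1, first+2, ..., which is handy for generating arithmetic ramps without a scalar loop. A sketch combining it with MulAdd (the name FillRamp and the divisibility assumption are ours):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// x[i] = start + i * step; assumes n % Lanes(d) == 0.
void FillRamp(float* HWY_RESTRICT x, size_t n, float start, float step) {
  const hn::ScalableTag<float> d;
  const size_t lanes = hn::Lanes(d);
  auto idx = hn::Iota(d, 0.0f);  // {0, 1, 2, ...}
  const auto vstep = hn::Set(d, step);
  const auto vstart = hn::Set(d, start);
  for (size_t i = 0; i < n; i += lanes) {
    hn::Store(hn::MulAdd(idx, vstep, vstart), d, x + i);
    idx = idx + hn::Set(d, static_cast<float>(lanes));  // advance the ramp
  }
}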
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:842
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
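Lt128 and the Min128/Max128 helpers listed nearby treat each pair of uint64_t lanes as one unsigned 128-bit integer (lower lane = low half). A sketch under our assumptions (hypothetical name Min128Keys; n counts u64 lanes and is a multiple of Lanes(d), which is at least 2 here):

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Per-pair minimum of 128-bit keys stored as little-endian u64 pairs.
void Min128Keys(const uint64_t* HWY_RESTRICT a, const uint64_t* HWY_RESTRICT b,
                uint64_t* HWY_RESTRICT out, size_t n) {
  const hn::ScalableTag<uint64_t> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::Store(hn::Min128(d, hn::Load(d, a + i), hn::Load(d, b + i)),
              d, out + i);
  }
}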
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
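BitCast reinterprets a vector's bits under another lane type of the same total width, enabling integer bit tricks on floats. A sketch that clears sign bits via AndNot on the u32 view (the name AbsInPlace and the divisibility assumption are ours):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// |x[i]| computed by masking off the sign bit; assumes n % Lanes(df) == 0.
void AbsInPlace(float* HWY_RESTRICT x, size_t n) {
  const hn::ScalableTag<float> df;
  const hn::RebindToUnsigned<decltype(df)> du;  // u32 lanes, same count
  const auto sign = hn::Set(du, 0x80000000u);
  for (size_t i = 0; i < n; i += hn::Lanes(df)) {
    const auto bits = hn::BitCast(du, hn::Load(df, x + i));
    // AndNot(a, b) computes ~a & b, i.e. clears the sign bit here.
    hn::Store(hn::BitCast(df, hn::AndNot(sign, bits)), df, x + i);
  }
}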
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:881
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:223
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4442
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
Simd< T, 8/sizeof(T), 0 > Full64
Definition ops/shared-inl.h:240
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D)
Definition ops/shared-inl.h:271
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
typename V::PrivateT TFromV
Definition arm_neon-inl.h:845
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
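CompressStore packs the lanes selected by a mask contiguously into memory and returns how many were written, the core of SIMD stream compaction. A sketch (the name CopyPositive and the divisibility assumption are ours; note CompressStore may overwrite lanes past the packed elements, so out needs a full vector of slack):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Copy only positive values, packed; returns the count written.
size_t CopyPositive(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                    size_t n) {
  const hn::ScalableTag<float> d;
  size_t written = 0;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, in + i);
    written += hn::CompressStore(v, v > hn::Zero(d), d, out + written);
  }
  return written;
}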
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
long long int GatherIndex64
Definition x86_128-inl.h:3268
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:214
typename D::Twice Twice
Definition ops/shared-inl.h:231
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
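Load and Store, together with lane-wise Min/Max and Set, cover the common load-transform-store loop shape. A minimal clamp sketch (the name ClampArray and the divisibility assumption are ours):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Clamp every element to [lo, hi]; assumes n % Lanes(d) == 0.
void ClampArray(float* HWY_RESTRICT x, size_t n, float lo, float hi) {
  const hn::ScalableTag<float> d;
  const auto vlo = hn::Set(d, lo);
  const auto vhi = hn::Set(d, hi);
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::Store(hn::Min(hn::Max(hn::Load(d, x + i), vlo), vhi), d, x + i);
  }
}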
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition x86_256-inl.h:4429
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
decltype(FirstN(D(), 0)) MFromD
Definition arm_sve-inl.h:276
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
constexpr T MantissaEnd()
Definition base.h:753
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:806
constexpr MakeSigned< T > MaxExponentTimes2()
Definition base.h:728
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:924
HWY_API constexpr bool IsSigned()
Definition base.h:642
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition base.h:961
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:607
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 :0x400)>
Definition base.h:627
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed+R::is_float)<< 8)>
Definition base.h:619
HWY_API size_t PopCount(uint64_t x)
Definition base.h:865
constexpr int MantissaBits()
Definition base.h:712
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition base.h:796
constexpr MakeSigned< T > MaxExponentField()
Definition base.h:778
typename EnableIfT< Condition >::type EnableIf
Definition base.h:383
HWY_API constexpr bool IsFloat()
Definition base.h:635
HWY_API constexpr T LimitsMax()
Definition base.h:656
typename detail::Relations< T >::Wide MakeWide
Definition base.h:601
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:595
#define HWY_ALIGN
Definition set_macros-inl.h:83
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
#define HWY_ATTR
Definition set_macros-inl.h:443
#define HWY_INLINE_F16
Definition x86_128-inl.h:5361