29#if HWY_COMPILER_GCC_ACTUAL
37#if HWY_COMPILER_CLANGCL
43#include <avx2intrin.h>
44#include <bmi2intrin.h>
45#include <f16cintrin.h>
55#include <sanitizer/msan_interface.h>
87 static constexpr size_t kPrivateN = 32 /
sizeof(T);
92 return *
this = (*
this * other);
95 return *
this = (*
this / other);
98 return *
this = (*
this + other);
101 return *
this = (*
this - other);
104 return *
this = (*
this & other);
107 return *
this = (*
this | other);
110 return *
this = (*
this ^ other);
116#if HWY_TARGET <= HWY_AVX3
121template <
size_t size>
164using Full256 = Simd<T, 32 /
sizeof(T), 0>;
173 return _mm256_castpd_si256(
v);
202template <
typename T,
typename FromT>
212 return Vec256<T>{_mm256_setzero_si256()};
233 _mm256_set1_epi64x(
static_cast<long long>(t))};
246 _mm256_set1_epi64x(
static_cast<long long>(t))};
263 return Vec256<T>{_mm256_undefined_si256()};
280 return Vec256<T>{_mm256_and_si256(
a.raw, b.raw)};
294HWY_API Vec256<T>
AndNot(Vec256<T> not_mask, Vec256<T> mask) {
295 return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
309HWY_API Vec256<T>
Or(Vec256<T>
a, Vec256<T> b) {
310 return Vec256<T>{_mm256_or_si256(
a.raw, b.raw)};
324 return Vec256<T>{_mm256_xor_si256(
a.raw, b.raw)};
338#if HWY_TARGET <= HWY_AVX3
341 Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
349HWY_API Vec256<T>
Xor3(Vec256<T>
x1, Vec256<T> x2, Vec256<T> x3) {
350#if HWY_TARGET <= HWY_AVX3
353 using VU =
VFromD<
decltype(du)>;
354 const __m256i ret = _mm256_ternarylogic_epi64(
364HWY_API Vec256<T>
Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
365#if HWY_TARGET <= HWY_AVX3
368 using VU =
VFromD<
decltype(du)>;
369 const __m256i ret = _mm256_ternarylogic_epi64(
373 return Or(o1,
Or(o2, o3));
379HWY_API Vec256<T>
OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
380#if HWY_TARGET <= HWY_AVX3
383 using VU =
VFromD<
decltype(du)>;
384 const __m256i ret = _mm256_ternarylogic_epi64(
388 return Or(o,
And(a1, a2));
395#if HWY_TARGET <= HWY_AVX3
398 using VU =
VFromD<
decltype(du)>;
427#if HWY_TARGET == HWY_AVX3_DL
429#ifdef HWY_NATIVE_POPCNT
430#undef HWY_NATIVE_POPCNT
432#define HWY_NATIVE_POPCNT
468HWY_API Vec256<T>
CopySign(
const Vec256<T> magn,
const Vec256<T> sign) {
469 static_assert(
IsFloat<T>(),
"Only makes sense for floating-point");
474#if HWY_TARGET <= HWY_AVX3
486 const __m256i out = _mm256_ternarylogic_epi32(
496#if HWY_TARGET <= HWY_AVX3
506#if HWY_TARGET <= HWY_AVX3
639#if HWY_COMPILER_HAS_MASK_INTRINSICS
648#if HWY_COMPILER_HAS_MASK_INTRINSICS
657#if HWY_COMPILER_HAS_MASK_INTRINSICS
666#if HWY_COMPILER_HAS_MASK_INTRINSICS
676#if HWY_COMPILER_HAS_MASK_INTRINSICS
685#if HWY_COMPILER_HAS_MASK_INTRINSICS
694#if HWY_COMPILER_HAS_MASK_INTRINSICS
703#if HWY_COMPILER_HAS_MASK_INTRINSICS
713#if HWY_COMPILER_HAS_MASK_INTRINSICS
722#if HWY_COMPILER_HAS_MASK_INTRINSICS
731#if HWY_COMPILER_HAS_MASK_INTRINSICS
740#if HWY_COMPILER_HAS_MASK_INTRINSICS
750#if HWY_COMPILER_HAS_MASK_INTRINSICS
759#if HWY_COMPILER_HAS_MASK_INTRINSICS
768#if HWY_COMPILER_HAS_MASK_INTRINSICS
777#if HWY_COMPILER_HAS_MASK_INTRINSICS
787#if HWY_COMPILER_HAS_MASK_INTRINSICS
790 return Mask256<T>{
static_cast<__mmask32
>(~(
a.raw ^ b.
raw) & 0xFFFFFFFF)};
796#if HWY_COMPILER_HAS_MASK_INTRINSICS
799 return Mask256<T>{
static_cast<__mmask16
>(~(
a.raw ^ b.
raw) & 0xFFFF)};
805#if HWY_COMPILER_HAS_MASK_INTRINSICS
808 return Mask256<T>{
static_cast<__mmask8
>(~(
a.raw ^ b.
raw) & 0xFF)};
814#if HWY_COMPILER_HAS_MASK_INTRINSICS
815 return Mask256<T>{
static_cast<__mmask8
>(_kxnor_mask8(
a.raw, b.
raw) & 0xF)};
824HWY_API Mask256<T>
And(
const Mask256<T>
a, Mask256<T> b) {
834HWY_API Mask256<T>
Or(
const Mask256<T>
a, Mask256<T> b) {
839HWY_API Mask256<T>
Xor(
const Mask256<T>
a, Mask256<T> b) {
846 constexpr size_t N = 32 /
sizeof(T);
862 return Mask256<T>{
v.raw};
867 return Vec256<T>{
v.raw};
872 return Vec256<T>{
v.raw};
880 const Vec256<T> no) {
881 return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
884 const Vec256<float> yes,
885 const Vec256<float> no) {
886 return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
889 const Vec256<double> yes,
890 const Vec256<double> no) {
891 return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
922HWY_API Mask256<T>
And(
const Mask256<T>
a, Mask256<T> b) {
934HWY_API Mask256<T>
Or(
const Mask256<T>
a, Mask256<T> b) {
940HWY_API Mask256<T>
Xor(
const Mask256<T>
a, Mask256<T> b) {
955#if HWY_TARGET <= HWY_AVX3
959template <
typename TFrom,
typename TTo>
961 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
962 return Mask256<TTo>{m.raw};
998template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1002template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1004 return Mask256<T>{_mm256_cmpeq_epi16_mask(
a.raw, b.raw)};
1006template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1008 return Mask256<T>{_mm256_cmpeq_epi32_mask(
a.raw, b.raw)};
1010template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1012 return Mask256<T>{_mm256_cmpeq_epi64_mask(
a.raw, b.raw)};
1025template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1029template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1031 return Mask256<T>{_mm256_cmpneq_epi16_mask(
a.raw, b.raw)};
1033template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1035 return Mask256<T>{_mm256_cmpneq_epi32_mask(
a.raw, b.raw)};
1037template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1039 return Mask256<T>{_mm256_cmpneq_epi64_mask(
a.raw, b.raw)};
1101template <
typename T>
1105template <
typename T>
1109template <
typename T>
1113template <
typename T>
1120template <
typename T>
1132template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1137template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1139 return Vec256<T>{_mm256_movm_epi16(
v.raw)};
1142template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1144 return Vec256<T>{_mm256_movm_epi32(
v.raw)};
1147template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1149 return Vec256<T>{_mm256_movm_epi64(
v.raw)};
1153 return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(
v.raw))};
1160template <
typename T>
1169template <
typename TFrom,
typename TTo>
1171 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1175template <
typename T>
1178 return (
v & bit) == bit;
1183template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1185 return Mask256<T>{_mm256_cmpeq_epi8(
a.raw, b.raw)};
1188template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1190 return Mask256<T>{_mm256_cmpeq_epi16(
a.raw, b.raw)};
1193template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1195 return Mask256<T>{_mm256_cmpeq_epi32(
a.raw, b.raw)};
1198template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1200 return Mask256<T>{_mm256_cmpeq_epi64(
a.raw, b.raw)};
1204 const Vec256<float> b) {
1205 return Mask256<float>{_mm256_cmp_ps(
a.raw, b.raw, _CMP_EQ_OQ)};
1209 const Vec256<double> b) {
1210 return Mask256<double>{_mm256_cmp_pd(
a.raw, b.raw, _CMP_EQ_OQ)};
1215template <
typename T>
1220 const Vec256<float> b) {
1221 return Mask256<float>{_mm256_cmp_ps(
a.raw, b.raw, _CMP_NEQ_OQ)};
1224 const Vec256<double> b) {
1225 return Mask256<double>{_mm256_cmp_pd(
a.raw, b.raw, _CMP_NEQ_OQ)};
1236#if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
1237#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
1239#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
1244#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
1245 using i8x32 =
signed char __attribute__((__vector_size__(32)));
1246 return Mask256<int8_t>{
static_cast<__m256i
>(
reinterpret_cast<i8x32
>(
a.raw) >
1247 reinterpret_cast<i8x32
>(b.raw))};
1249 return Mask256<int8_t>{_mm256_cmpgt_epi8(
a.raw, b.raw)};
1253 Vec256<int16_t> b) {
1254 return Mask256<int16_t>{_mm256_cmpgt_epi16(
a.raw, b.raw)};
1257 Vec256<int32_t> b) {
1258 return Mask256<int32_t>{_mm256_cmpgt_epi32(
a.raw, b.raw)};
1261 Vec256<int64_t> b) {
1262 return Mask256<int64_t>{_mm256_cmpgt_epi64(
a.raw, b.raw)};
1265template <
typename T>
1275 return Mask256<float>{_mm256_cmp_ps(
a.raw, b.raw, _CMP_GT_OQ)};
1279 return Mask256<double>{_mm256_cmp_pd(
a.raw, b.raw, _CMP_GT_OQ)};
1284template <
typename T>
1292 const Vec256<float> b) {
1293 return Mask256<float>{_mm256_cmp_ps(
a.raw, b.raw, _CMP_GE_OQ)};
1296 const Vec256<double> b) {
1297 return Mask256<double>{_mm256_cmp_pd(
a.raw, b.raw, _CMP_GE_OQ)};
1304template <
typename T>
1309template <
typename T>
1330#if HWY_TARGET <= HWY_AVX3
1335 const auto msb =
Set(du, 1ull << 63);
1352#if HWY_TARGET <= HWY_AVX3
1383#if HWY_TARGET <= HWY_AVX3
1388 const auto msb =
Set(du, 1ull << 63);
1405#if HWY_TARGET <= HWY_AVX3
1422template <
typename T>
1424#if HWY_TARGET <= HWY_AVX3
1426 constexpr size_t N = 32 /
sizeof(T);
1428 const uint64_t all = (1ull <<
N) - 1;
1432 const uint32_t all =
static_cast<uint32_t
>((1ull <<
N) - 1);
1435 (n > 255) ? all : _bzhi_u32(all,
static_cast<uint32_t
>(n)));
1541 return Vec256<uint64_t>{_mm256_sad_epu8(
v.raw, _mm256_setzero_si256())};
1610#if HWY_COMPILER_MSVC
1667HWY_API Vec256<int64_t>
MulEven(Vec256<int32_t>
a, Vec256<int32_t> b) {
1668 return Vec256<int64_t>{_mm256_mul_epi32(
a.raw, b.raw)};
1670HWY_API Vec256<uint64_t>
MulEven(Vec256<uint32_t>
a, Vec256<uint32_t> b) {
1671 return Vec256<uint64_t>{_mm256_mul_epu32(
a.raw, b.raw)};
1706template <
int kBits,
typename T, HWY_IF_LANE_SIZE(T, 1)>
1713 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
1738 return shifted &
Set(d8, 0xFF >> kBits);
1756 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
1757 return (shifted ^ shifted_sign) - shifted_sign;
1766 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
1767#if HWY_TARGET <= HWY_AVX3
1770 if (kBits == 0)
return v;
1777 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
1778#if HWY_TARGET <= HWY_AVX3
1781 if (kBits == 0)
return v;
1801#if HWY_TARGET == HWY_AVX2
1810#if HWY_TARGET <= HWY_AVX3
1817 return right | sign;
1822#if HWY_TARGET <= HWY_AVX3
1837template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1839 static_assert(
IsSigned<T>(),
"Only works for signed/float");
1848template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
1850 static_assert(
IsSigned<T>(),
"Only works for signed/float");
1886template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1891 return shifted &
Set(d8,
static_cast<T
>((0xFF <<
bits) & 0xFF));
1913 return shifted &
Set(d8,
static_cast<uint8_t
>(0xFF >>
bits));
1927#if HWY_TARGET <= HWY_AVX3
1934 return right | sign;
1942 const auto shifted_sign =
1944 return (shifted ^ shifted_sign) - shifted_sign;
1952template <
typename T>
1958template <
typename T>
1965template <
typename T>
1990 return Vec256<float>{_mm256_rcp_ps(
v.raw)};
1994HWY_API Vec256<float>
AbsDiff(
const Vec256<float>
a,
const Vec256<float> b) {
2001HWY_API Vec256<float>
MulAdd(
const Vec256<float> mul,
const Vec256<float>
x,
2002 const Vec256<float> add) {
2003#ifdef HWY_DISABLE_BMI2_FMA
2004 return mul *
x + add;
2006 return Vec256<float>{_mm256_fmadd_ps(mul.raw,
x.raw, add.raw)};
2011#ifdef HWY_DISABLE_BMI2_FMA
2012 return mul *
x + add;
2020 const Vec256<float> add) {
2021#ifdef HWY_DISABLE_BMI2_FMA
2022 return add - mul *
x;
2024 return Vec256<float>{_mm256_fnmadd_ps(mul.raw,
x.raw, add.raw)};
2030#ifdef HWY_DISABLE_BMI2_FMA
2031 return add - mul *
x;
2038HWY_API Vec256<float>
MulSub(
const Vec256<float> mul,
const Vec256<float>
x,
2039 const Vec256<float> sub) {
2040#ifdef HWY_DISABLE_BMI2_FMA
2041 return mul *
x - sub;
2043 return Vec256<float>{_mm256_fmsub_ps(mul.raw,
x.raw, sub.raw)};
2048#ifdef HWY_DISABLE_BMI2_FMA
2049 return mul *
x - sub;
2057 const Vec256<float> sub) {
2058#ifdef HWY_DISABLE_BMI2_FMA
2059 return Neg(mul *
x) - sub;
2061 return Vec256<float>{_mm256_fnmsub_ps(mul.raw,
x.raw, sub.raw)};
2067#ifdef HWY_DISABLE_BMI2_FMA
2068 return Neg(mul *
x) - sub;
2086 return Vec256<float>{_mm256_rsqrt_ps(
v.raw)};
2093 return Vec256<float>{
2094 _mm256_round_ps(
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2098 _mm256_round_pd(
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2103 return Vec256<float>{
2104 _mm256_round_ps(
v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2108 _mm256_round_pd(
v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2113 return Vec256<float>{
2114 _mm256_round_ps(
v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2118 _mm256_round_pd(
v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2123 return Vec256<float>{
2124 _mm256_round_ps(
v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2128 _mm256_round_pd(
v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2134#if HWY_TARGET <= HWY_AVX3
2141#if HWY_TARGET <= HWY_AVX3
2148#if HWY_TARGET <= HWY_AVX3
2168template <
typename T>
2170 static_assert(
IsFloat<T>(),
"Only for float");
2179template <
typename T>
2181 static_assert(
IsFloat<T>(),
"Only for float");
2190 const VFromD<
decltype(di)> exp =
2201template <
typename T>
2204 _mm256_load_si256(
reinterpret_cast<const __m256i*
>(aligned))};
2215template <
typename T>
2217 return Vec256<T>{_mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(p))};
2230#if HWY_TARGET <= HWY_AVX3
2232template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2238template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2241 return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};
2244template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2247 return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};
2250template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2253 return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};
2269template <
typename T, hwy::EnableIf<sizeof(T) <= 2>* =
nullptr>
2270HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
2271 const T* HWY_RESTRICT p) {
2272 return IfThenElseZero(m, LoadU(d, p));
2275template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2278 auto pi =
reinterpret_cast<const int*
>(p);
2279 return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};
2282template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2285 auto pi =
reinterpret_cast<const long long*
>(p);
2286 return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};
2291 const Vec256<int32_t> mi =
2293 return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
2298 const Vec256<int64_t> mi =
2300 return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
2309template <
typename T>
2311#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
2319 _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
2321 return Vec256<T>{_mm256_broadcastsi128_si256(
LoadU(
Full128<T>(), p).raw)};
2326#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
2329 _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
2331 return Vec256<float>{_mm256_broadcast_ps(
reinterpret_cast<const __m128*
>(p))};
2336#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
2339 _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
2342 _mm256_broadcast_pd(
reinterpret_cast<const __m128d*
>(p))};
2348template <
typename T>
2350 _mm256_store_si256(
reinterpret_cast<__m256i*
>(aligned),
v.raw);
2354 _mm256_store_ps(aligned,
v.raw);
2358 _mm256_store_pd(aligned,
v.raw);
2361template <
typename T>
2363 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(p),
v.raw);
2367 _mm256_storeu_ps(p,
v.raw);
2371 _mm256_storeu_pd(p,
v.raw);
2376#if HWY_TARGET <= HWY_AVX3
2378template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2381 _mm256_mask_storeu_epi8(p, m.
raw,
v.raw);
2384template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2387 _mm256_mask_storeu_epi16(p, m.raw,
v.raw);
2390template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2393 _mm256_mask_storeu_epi32(p, m.raw,
v.raw);
2396template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2399 _mm256_mask_storeu_epi64(p, m.raw,
v.raw);
2404 _mm256_mask_storeu_ps(p, m.
raw,
v.raw);
2409 _mm256_mask_storeu_pd(p, m.
raw,
v.raw);
2423template <
typename T, hwy::EnableIf<sizeof(T) <= 2>* =
nullptr>
2424HWY_API
void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
2425 T* HWY_RESTRICT p) {
2429 const RebindToUn
signed<decltype(d)> du;
2430 using TU = TFromD<decltype(du)>;
2431 alignas(32) TU buf[32 / sizeof(T)];
2432 alignas(32) TU mask[32 / sizeof(T)];
2433 Store(BitCast(du, v), du, buf);
2434 Store(BitCast(du, VecFromMask(d, m)), du, mask);
2435 for (
size_t i = 0; i < 32 / sizeof(T); ++i) {
2437 CopySameSize(buf + i, p + i);
2442template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2445 auto pi =
reinterpret_cast<int*
>(p);
2446 _mm256_maskstore_epi32(pi, m.raw,
v.raw);
2449template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2452 auto pi =
reinterpret_cast<long long*
>(p);
2453 _mm256_maskstore_epi64(pi, m.raw,
v.raw);
2458 const Vec256<int32_t> mi =
2460 _mm256_maskstore_ps(p, mi.raw,
v.raw);
2465 const Vec256<int64_t> mi =
2467 _mm256_maskstore_pd(p, mi.raw,
v.raw);
2474template <
typename T>
2477 _mm256_stream_si256(
reinterpret_cast<__m256i*
>(aligned),
v.raw);
2481 _mm256_stream_ps(aligned,
v.raw);
2485 _mm256_stream_pd(aligned,
v.raw);
2494#if HWY_TARGET <= HWY_AVX3
2497template <
typename T>
2501 _mm256_i32scatter_epi32(base,
offset.raw,
v.raw, 1);
2503template <
typename T>
2507 _mm256_i32scatter_epi32(base, index.
raw,
v.raw, 4);
2510template <
typename T>
2514 _mm256_i64scatter_epi64(base,
offset.raw,
v.raw, 1);
2516template <
typename T>
2520 _mm256_i64scatter_epi64(base, index.
raw,
v.raw, 8);
2525template <
typename T,
typename Offset>
2527 const Vec256<Offset>
offset) {
2528 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2531template <
typename T,
typename Index>
2533 const Vec256<Index> index) {
2534 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2541 _mm256_i32scatter_ps(base,
offset.raw,
v.raw, 1);
2546 _mm256_i32scatter_ps(base, index.
raw,
v.raw, 4);
2552 _mm256_i64scatter_pd(base,
offset.raw,
v.raw, 1);
2557 _mm256_i64scatter_pd(base, index.
raw,
v.raw, 8);
2562template <
typename T,
typename Offset>
2564 const Vec256<Offset>
offset) {
2565 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2567 constexpr size_t N = 32 /
sizeof(T);
2568 alignas(32) T lanes[
N];
2571 alignas(32) Offset offset_lanes[
N];
2574 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
2575 for (
size_t i = 0; i <
N; ++i) {
2580template <
typename T,
typename Index>
2582 const Vec256<Index> index) {
2583 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2585 constexpr size_t N = 32 /
sizeof(T);
2586 alignas(32) T lanes[
N];
2589 alignas(32) Index index_lanes[
N];
2592 for (
size_t i = 0; i <
N; ++i) {
2593 base[index_lanes[i]] = lanes[i];
2603template <
typename T>
2608 return Vec256<T>{_mm256_i32gather_epi32(
2609 reinterpret_cast<const int32_t*
>(base),
offset.raw, 1)};
2611template <
typename T>
2616 return Vec256<T>{_mm256_i32gather_epi32(
2617 reinterpret_cast<const int32_t*
>(base), index.
raw, 4)};
2620template <
typename T>
2625 return Vec256<T>{_mm256_i64gather_epi64(
2628template <
typename T>
2633 return Vec256<T>{_mm256_i64gather_epi64(
2639template <
typename T,
typename Offset>
2641 const Vec256<Offset>
offset) {
2642 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2645template <
typename T,
typename Index>
2647 const Vec256<Index> index) {
2648 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2680template <
typename T>
2682 return Vec128<T>{_mm256_castsi256_si128(
v.raw)};
2691template <
typename T>
2698template <
typename T>
2700 return Vec128<T>{_mm256_extracti128_si256(
v.raw, 1)};
2710template <
typename T>
2714 alignas(32) T lanes[32 /
sizeof(T)];
2720template <
typename T>
2724 alignas(64) T lanes[64 /
sizeof(T)];
2727 return Load(
d, lanes);
2731template <
typename T>
2749#if !defined(HWY_HAVE_ZEXT)
2750#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \
2751 (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
2752 (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000)
2753#define HWY_HAVE_ZEXT 1
2755#define HWY_HAVE_ZEXT 0
2759template <
typename T>
2762return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
2764 return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
2772 return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.
raw, 0)};
2786template <
typename T>
2789 return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
2804template <
int kBytes,
typename T>
2806 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2808 return Vec256<T>{_mm256_slli_si256(
v.raw, kBytes)};
2811template <
int kBytes,
typename T>
2818template <
int kLanes,
typename T>
2824template <
int kLanes,
typename T>
2831template <
int kBytes,
typename T>
2833 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2835 return Vec256<T>{_mm256_srli_si256(
v.raw, kBytes)};
2839template <
int kLanes,
typename T>
2848template <
int kBytes,
typename T,
class V = Vec256<T>>
2851 return BitCast(
d, Vec256<uint8_t>{_mm256_alignr_epi8(
2860 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
2862 const __m256i lo = _mm256_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
2866 _mm256_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
2872 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2877 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2884 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
2886 const __m256i lo = _mm256_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
2890 _mm256_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
2896 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2901 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2908 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2913 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2926template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2928 return Vec256<T>{_mm256_shuffle_epi32(
v.raw, 0xB1)};
2937template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2941 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
2945template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2949 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
2953template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2957 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
3021template <
typename T>
3027template <
typename T,
typename TI, HWY_IF_LANE_SIZE(T, 4)>
3029 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
3030#if HWY_IS_DEBUG_BUILD
3033 AllTrue(di, Lt(vec,
Set(di,
static_cast<TI
>(32 /
sizeof(T))))));
3039template <
typename T,
typename TI, HWY_IF_LANE_SIZE(T, 8)>
3041 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
3042 const Rebind<TI,
decltype(
d)> di;
3044#if HWY_IS_DEBUG_BUILD
3046 AllTrue(di, Lt(idx64,
Set(di,
static_cast<TI
>(32 /
sizeof(T))))));
3049#if HWY_TARGET <= HWY_AVX3
3051 return Indices256<T>{idx64.raw};
3055 const Vec256<TI> dup =
3056 BitCast(di, Vec256<float>{_mm256_moveldup_ps(
BitCast(df, idx64).raw)});
3058 const Vec256<TI> idx32 = dup + dup +
Set(di, TI(1) << 32);
3059 return Indices256<T>{idx32.raw};
3063template <
typename T,
typename TI>
3065 const Rebind<TI,
decltype(
d)> di;
3069template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3071 return Vec256<T>{_mm256_permutevar8x32_epi32(
v.raw, idx.
raw)};
3074template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3076#if HWY_TARGET <= HWY_AVX3
3077 return Vec256<T>{_mm256_permutexvar_epi64(idx.raw,
v.raw)};
3079 return Vec256<T>{_mm256_permutevar8x32_epi32(
v.raw, idx.raw)};
3090#if HWY_TARGET <= HWY_AVX3
3102template <
typename T>
3104 return Vec256<T>{_mm256_permute2x128_si256(
v.raw,
v.raw, 0x01)};
3117template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3119 alignas(32)
constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
3123template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3125 alignas(32)
constexpr int64_t kReverse[4] = {3, 2, 1, 0};
3129template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3131#if HWY_TARGET <= HWY_AVX3
3133 alignas(32)
constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
3134 7, 6, 5, 4, 3, 2, 1, 0};
3135 const Vec256<int16_t> idx =
Load(di, kReverse);
3137 _mm256_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
3147template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3153template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3158template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3165template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3167#if HWY_TARGET <= HWY_AVX3
3169 alignas(32)
constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4,
3170 11, 10, 9, 8, 15, 14, 13, 12};
3171 const Vec256<int16_t> idx =
Load(di, kReverse4);
3173 _mm256_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
3180template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3185template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3193template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3195#if HWY_TARGET <= HWY_AVX3
3197 alignas(32)
constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
3198 15, 14, 13, 12, 11, 10, 9, 8};
3199 const Vec256<int16_t> idx =
Load(di, kReverse8);
3201 _mm256_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
3208template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3213template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3317template <
typename T,
class V = Vec256<T>>
3326template <
typename T,
typename TW = MakeW
ide<T>>
3330template <
typename T,
typename TW = MakeW
ide<T>>
3335template <
typename T,
typename TW = MakeW
ide<T>>
3347template <
typename T>
3349 const Vec256<T> lo) {
3350 const Half<
decltype(
d)> d2;
3351 return Vec256<T>{_mm256_inserti128_si256(lo.raw,
LowerHalf(d2, hi).raw, 1)};
3355 const Half<
decltype(
d)> d2;
3361 const Half<
decltype(
d)> d2;
3366template <
typename T>
3368 const Vec256<T> lo) {
3369 return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
3383template <
typename T>
3385 const Vec256<T> lo) {
3386 return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
3400template <
typename T>
3402 const Vec256<T> lo) {
3403 return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
3418template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
3421#if HWY_TARGET == HWY_AVX3_DL
3422 alignas(32)
constexpr uint8_t kIdx[32] = {
3423 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
3424 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
3427 __mmask32{0xFFFFFFFFu},
BitCast(du, hi).raw)});
3433 const __m256i u8 = _mm256_packus_epi16(uL.
raw, uH.raw);
3434 return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
3438template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3441#if HWY_TARGET <= HWY_AVX3
3442 alignas(32)
constexpr uint16_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
3443 17, 19, 21, 23, 25, 27, 29, 31};
3444 return BitCast(
d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi16(
3446 __mmask16{0xFFFF},
BitCast(du, hi).raw)});
3452 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
3453 return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
3457template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3460#if HWY_TARGET <= HWY_AVX3
3461 alignas(32)
constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3462 return BitCast(
d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3463 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3467 const Vec256<float> v3131{_mm256_shuffle_ps(
3468 BitCast(df, lo).raw,
BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
3469 return Vec256<T>{_mm256_permute4x64_epi64(
BitCast(du, v3131).raw,
3470 _MM_SHUFFLE(3, 1, 2, 0))};
3477#if HWY_TARGET <= HWY_AVX3
3478 alignas(32)
constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3480 __mmask8{0xFF}, hi.
raw)};
3483 _mm256_shuffle_ps(lo.
raw, hi.
raw, _MM_SHUFFLE(3, 1, 3, 1))};
3485 BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3489template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3492#if HWY_TARGET <= HWY_AVX3
3493 alignas(64)
constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3494 return BitCast(
d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3495 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3499 const Vec256<double> v31{
3500 _mm256_shuffle_pd(
BitCast(df, lo).raw,
BitCast(df, hi).raw, 15)};
3502 _mm256_permute4x64_epi64(
BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3508#if HWY_TARGET <= HWY_AVX3
3510 alignas(64)
constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3512 __mmask8{0xFF}, hi.
raw)};
3517 _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3523template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
3526#if HWY_TARGET == HWY_AVX3_DL
3527 alignas(64)
constexpr uint8_t kIdx[32] = {
3528 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3529 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
3532 __mmask32{0xFFFFFFFFu},
BitCast(du, hi).raw)});
3539 const __m256i u8 = _mm256_packus_epi16(uL.
raw, uH.
raw);
3540 return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
3544template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3547#if HWY_TARGET <= HWY_AVX3
3548 alignas(64)
constexpr uint16_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3549 16, 18, 20, 22, 24, 26, 28, 30};
3550 return BitCast(
d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi16(
3552 __mmask16{0xFFFF},
BitCast(du, hi).raw)});
3556 const Vec256<uint32_t> mask =
Set(dw, 0x0000FFFF);
3557 const Vec256<uint32_t> uH =
And(
BitCast(dw, hi), mask);
3558 const Vec256<uint32_t> uL =
And(
BitCast(dw, lo), mask);
3559 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
3560 return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
3564template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3567#if HWY_TARGET <= HWY_AVX3
3568 alignas(64)
constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3569 return BitCast(
d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3570 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3574 const Vec256<float> v2020{_mm256_shuffle_ps(
3575 BitCast(df, lo).raw,
BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
3576 return Vec256<T>{_mm256_permute4x64_epi64(
BitCast(du, v2020).raw,
3577 _MM_SHUFFLE(3, 1, 2, 0))};
3585#if HWY_TARGET <= HWY_AVX3
3586 alignas(64)
constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3588 __mmask8{0xFF}, hi.
raw)};
3591 _mm256_shuffle_ps(lo.
raw, hi.
raw, _MM_SHUFFLE(2, 0, 2, 0))};
3593 BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3598template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3601#if HWY_TARGET <= HWY_AVX3
3602 alignas(64)
constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3603 return BitCast(
d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3604 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3608 const Vec256<double> v20{
3611 _mm256_permute4x64_epi64(
BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3618#if HWY_TARGET <= HWY_AVX3
3620 alignas(64)
constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3622 __mmask8{0xFF}, hi.
raw)};
3627 _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3633template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3635 return Vec256<T>{_mm256_shuffle_epi32(
v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3639 _mm256_shuffle_ps(
v.raw,
v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3642template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3649template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3651 return Vec256<T>{_mm256_shuffle_epi32(
v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3655 _mm256_shuffle_ps(
v.raw,
v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3658template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3667template <
typename T>
3672 alignas(32)
constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3673 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3676template <
typename T>
3681template <
typename T>
3686template <
typename T>
3694template <
typename T>
3708template <
typename T>
3723template <
typename T>
3731template <
typename T,
typename TI>
3733 const Vec256<TI> from) {
3734 return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
3738template <
typename T,
typename TI,
size_t NI>
3740 const Vec128<TI, NI> from) {
3749template <
typename T,
size_t N,
typename TI>
3751 const Vec256<TI> from) {
3763#if HWY_TARGET > HWY_AVX3 && !HWY_IDE
3766template <
typename T>
3768 static_assert(
sizeof(T) == 2,
"Only for 16-bit");
3771 const Rebind<float,
decltype(dw)> df;
3772 const auto zero =
Zero(
d);
3775 const auto upper = exp +
Set(
d, 0x3F80);
3777 const auto f0 =
ZipLower(dw, zero, upper);
3778 const auto f1 =
ZipUpper(dw, zero, upper);
3781 const Vec256<int32_t> bits0{_mm256_cvttps_epi32(
BitCast(df, f0).raw)};
3782 const Vec256<int32_t> bits1{_mm256_cvttps_epi32(
BitCast(df, f1).raw)};
3783 return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
3790#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
3807template <
typename T>
3818template <
typename T>
3826#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
3846#if HWY_TARGET <= HWY_AVX3
3858#if HWY_TARGET <= HWY_AVX3
3866 const Vec256<uint64_t> b) {
3869 const auto maskL =
Set(du64, 0xFFFFFFFFULL);
3871 const auto b32 =
BitCast(du32, b);
3879 const auto aLbL =
MulEven(a32, b32);
3880 const auto w3 = aLbL & maskL;
3883 const auto w2 = t2 & maskL;
3886 const auto t =
MulEven(a32, bH) + w2;
3889 const auto mulH =
MulEven(aH, bH) + w1 + k;
3895 const Vec256<uint64_t> b) {
3898 const auto maskL =
Set(du64, 0xFFFFFFFFULL);
3900 const auto b32 =
BitCast(du32, b);
3906 const auto aLbL =
MulEven(a32, b32);
3907 const auto w3 = aLbL & maskL;
3910 const auto w2 = t2 & maskL;
3913 const auto t =
MulEven(a32, bH) + w2;
3916 const auto mulH =
MulEven(aH, bH) + w1 + k;
4006 const Vec256<int32_t>
v) {
4007 const __m256i u16 = _mm256_packus_epi32(
v.raw,
v.raw);
4010 return Vec128<uint16_t>{
4011 _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
4015 const Vec256<int32_t>
v) {
4016 const __m256i i16 = _mm256_packs_epi32(
v.raw,
v.raw);
4017 return Vec128<int16_t>{
4018 _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
4022 const Vec256<int32_t>
v) {
4023 const __m256i u16_blocks = _mm256_packus_epi32(
v.raw,
v.raw);
4025 const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
4026 const __m128i u16 = _mm256_castsi256_si128(u16_concat);
4029 const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
4030 return Vec128<uint8_t, 8>{_mm_packus_epi16(i16, i16)};
4034 const Vec256<int16_t>
v) {
4035 const __m256i u8 = _mm256_packus_epi16(
v.raw,
v.raw);
4036 return Vec128<uint8_t>{
4037 _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
4041 const Vec256<int32_t>
v) {
4042 const __m256i i16_blocks = _mm256_packs_epi32(
v.raw,
v.raw);
4044 const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
4045 const __m128i i16 = _mm256_castsi256_si128(i16_concat);
4046 return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)};
4050 const Vec256<int16_t>
v) {
4051 const __m256i i8 = _mm256_packs_epi16(
v.raw,
v.raw);
4052 return Vec128<int8_t>{
4053 _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
4062 const Vec256<float>
v) {
4063#ifdef HWY_DISABLE_F16C
4065 const Rebind<uint32_t,
decltype(df16)> du;
4067 const auto bits32 =
BitCast(du,
v);
4070 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
4072 const auto k15 =
Set(di, 15);
4073 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
4074 const auto is_tiny = exp <
Set(di, -24);
4076 const auto is_subnormal = exp <
Set(di, -14);
4077 const auto biased_exp16 =
4079 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
4080 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
4081 (mantissa32 >> (
Set(du, 13) + sub_exp));
4086 const auto normal16 = sign16 |
ShiftLeft<10>(biased_exp16) | mantissa16;
4091 return Vec128<float16_t>{_mm256_cvtps_ph(
v.raw, _MM_FROUND_NO_EXC)};
4098 const Vec256<float>
v) {
4100 const Rebind<int32_t,
decltype(dbf16)> di32;
4101 const Rebind<uint32_t,
decltype(dbf16)> du32;
4102 const Rebind<uint16_t,
decltype(dbf16)> du16;
4108 Vec256<float>
a, Vec256<float> b) {
4111 const Repartition<uint32_t,
decltype(dbf16)> du32;
4117 Vec256<int32_t>
a, Vec256<int32_t> b) {
4118 return Vec256<int16_t>{_mm256_packs_epi32(
a.raw, b.raw)};
4127 const Vec256<double>
v) {
4129 return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
4135 alignas(32)
static constexpr uint32_t k8From32[8] = {
4136 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
4151template <u
int32_t LO, u
int32_t HI,
typename T>
4155#if HWY_TARGET <= HWY_AVX3_DL
4156 alignas(32)
constexpr uint32_t kMap[8] = {
4157 LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
4158 const auto result = _mm256_permutexvar_epi8(
v.raw,
Load(d32, kMap).raw);
4160 alignas(32)
static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u,
4163 const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
4174template <u
int16_t LO, u
int16_t HI,
typename T>
4178#if HWY_TARGET <= HWY_AVX3_DL
4179 alignas(32)
constexpr uint16_t kMap[16] = {
4180 LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
4181 const auto result = _mm256_permutexvar_epi8(
v.raw,
Load(d16, kMap).raw);
4184 constexpr uint16_t ff =
static_cast<uint16_t
>(~0u);
4185 alignas(32)
static constexpr uint16_t kMap[16] = {
4186 LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
4188 const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
4189 const auto half = _mm256_castsi256_si128(mixed);
4199#if HWY_TARGET <= HWY_AVX3_DL
4200 alignas(32)
constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, 0, 0, 0, 0};
4201 const auto result = _mm256_permutexvar_epi8(
v.raw,
Load(d32, kMap).raw);
4204 alignas(32)
static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
4205 0x0800FFFFu, ~0u, ~0u, ~0u};
4209 const auto result = lo | hi;
4223 alignas(32)
constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
4255#if HWY_TARGET <= HWY_AVX3
4264 const auto k84_63 =
Set(d64, 0x4530000080000000ULL);
4268 const auto k52 =
Set(d32, 0x43300000);
4271 const auto k84_63_52 =
BitCast(dd,
Set(d64, 0x4530000080100000ULL));
4272 return (v_upper - k84_63_52) + v_lower;
4278#if HWY_TARGET <= HWY_AVX3
4285 const auto msk_lo =
Set(du32, 0xFFFF);
4286 const auto cnst2_16_flt =
Set(df, 65536.0f);
4298#if HWY_TARGET <= HWY_AVX3
4303 using VU =
VFromD<
decltype(d64)>;
4305 const VU msk_lo =
Set(d64, 0xFFFFFFFFULL);
4306 const auto cnst2_32_dbl =
Set(dd, 4294967296.0);
4309 const VU v_lo =
And(
v, msk_lo);
4315 return BitCast(dd,
w) -
Set(dd, 0x0010000000000000);
4318 const auto v_lo_dbl = uint64_to_double256_fast(v_lo);
4319 return MulAdd(cnst2_32_dbl, uint64_to_double256_fast(v_hi), v_lo_dbl);
4329#if HWY_TARGET <= HWY_AVX3
4332 using VI =
decltype(
Zero(di));
4333 const VI k0 =
Zero(di);
4334 const VI k1 =
Set(di, 1);
4335 const VI k51 =
Set(di, 51);
4339 const VI exp = biased_exp -
Set(di, 0x3FF);
4340 const auto in_range = exp <
Set(di, 63);
4348 const VI shift_mnt =
Max(k51 - exp, k0);
4349 const VI shift_int =
Max(exp - k51, k0);
4350 const VI mantissa =
BitCast(di,
v) &
Set(di, (1ULL << 52) - 1);
4352 const VI int52 = (mantissa |
Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4354 const VI shifted = int52 << shift_int;
4356 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4361 const VI magnitude =
IfThenElse(in_range, restored, limit);
4364 return (magnitude ^ sign_mask) - sign_mask;
4376#ifdef HWY_DISABLE_F16C
4383 const auto mantissa = bits16 &
Set(du32, 0x3FF);
4384 const auto subnormal =
4386 Set(df32, 1.0f / 16384 / 1024));
4388 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
4389 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
4390 const auto normal =
ShiftLeft<23>(biased_exp32) | mantissa32;
4391 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
4401 const Rebind<uint16_t,
decltype(df32)> du16;
4408#if !defined(HWY_DISABLE_PCLMUL_AES)
4411#ifdef HWY_NATIVE_AES
4412#undef HWY_NATIVE_AES
4414#define HWY_NATIVE_AES
4419#if HWY_TARGET == HWY_AVX3_DL
4423 const Half<
decltype(
d)> d2;
4431#if HWY_TARGET == HWY_AVX3_DL
4435 const Half<
decltype(
d)> d2;
4443#if HWY_TARGET == HWY_AVX3_DL
4447 const Half<
decltype(
d)> d2;
4454#if HWY_TARGET == HWY_AVX3_DL
4458 const Half<
decltype(
d)> d2;
4469template <
typename T,
typename T2>
4472 for (
size_t i = 0; i < 32 /
sizeof(T); ++i) {
4476 return Load(
d, lanes);
4479#if HWY_TARGET <= HWY_AVX3
4484template <
typename T>
4487 constexpr size_t N = 32 /
sizeof(T);
4488 constexpr size_t kNumBytes = (
N + 7) / 8;
4490 uint64_t mask_bits = 0;
4494 mask_bits &= (1ull <<
N) - 1;
4503template <
typename T>
4506 constexpr size_t N = 32 /
sizeof(T);
4507 constexpr size_t kNumBytes = (
N + 7) / 8;
4513 const int mask_bits =
static_cast<int>((1ull <<
N) - 1);
4514 bits[0] =
static_cast<uint8_t
>(
bits[0] & mask_bits);
4521template <
typename T>
4523 return PopCount(
static_cast<uint64_t
>(mask.raw));
4526template <
typename T>
4528 const Mask256<T> mask) {
4532template <
typename T>
4542template <
typename T>
4544#if HWY_COMPILER_HAS_MASK_INTRINSICS
4545 return _kortestz_mask32_u8(mask.
raw, mask.
raw);
4547 return mask.
raw == 0;
4550template <
typename T>
4552#if HWY_COMPILER_HAS_MASK_INTRINSICS
4553 return _kortestz_mask16_u8(mask.
raw, mask.
raw);
4555 return mask.
raw == 0;
4558template <
typename T>
4560#if HWY_COMPILER_HAS_MASK_INTRINSICS
4561 return _kortestz_mask8_u8(mask.
raw, mask.
raw);
4563 return mask.
raw == 0;
4566template <
typename T>
4568 return (uint64_t{mask.
raw} & 0xF) == 0;
4573template <
typename T>
4580template <
typename T>
4582#if HWY_COMPILER_HAS_MASK_INTRINSICS
4583 return _kortestc_mask32_u8(mask.
raw, mask.
raw);
4585 return mask.
raw == 0xFFFFFFFFu;
4588template <
typename T>
4590#if HWY_COMPILER_HAS_MASK_INTRINSICS
4591 return _kortestc_mask16_u8(mask.
raw, mask.
raw);
4593 return mask.
raw == 0xFFFFu;
4596template <
typename T>
4598#if HWY_COMPILER_HAS_MASK_INTRINSICS
4599 return _kortestc_mask8_u8(mask.
raw, mask.
raw);
4601 return mask.
raw == 0xFFu;
4604template <
typename T>
4607 return mask.
raw == 0xFu;
4612template <
typename T>
4621template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4623 return Vec256<T>{_mm256_maskz_compress_epi32(mask.
raw,
v.raw)};
4630template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4633 alignas(16)
constexpr uint64_t packed_array[16] = {
4635 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
4636 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
4637 0x00001032, 0x00001320, 0x00000321, 0x00003210};
4643 const auto packed =
Set(du64, packed_array[mask.raw]);
4644 alignas(64)
constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4645 const auto indices = Indices256<T>{(packed >>
Load(du64, shifts)).raw};
4653template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4656 alignas(16)
constexpr uint64_t packed_array[16] = {
4658 0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
4659 0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
4660 0x00003210, 0x00003201, 0x00003210, 0x00003210};
4666 const auto packed =
Set(du64, packed_array[mask.
raw]);
4667 alignas(32)
constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4676template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4679 _mm256_mask_compressstoreu_epi32(unaligned, mask.
raw,
v.raw);
4680 const size_t count =
PopCount(uint64_t{mask.
raw});
4685template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4688 _mm256_mask_compressstoreu_epi64(unaligned, mask.raw,
v.raw);
4689 const size_t count =
PopCount(uint64_t{mask.raw} & 0xFull);
4697 _mm256_mask_compressstoreu_ps(unaligned, mask.
raw,
v.raw);
4698 const size_t count =
PopCount(uint64_t{mask.
raw});
4706 _mm256_mask_compressstoreu_pd(unaligned, mask.
raw,
v.raw);
4707 const size_t count =
PopCount(uint64_t{mask.
raw} & 0xFull);
4714template <
typename T>
4731template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
4744template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4748 const auto vbits =
BitCast(du,
Set(du32,
static_cast<uint32_t
>(mask_bits)));
4752 alignas(32)
constexpr uint64_t kRep8[4] = {
4753 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
4754 0x0303030303030303ull};
4757 alignas(32)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4758 1, 2, 4, 8, 16, 32, 64, 128};
4762template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4765 alignas(32)
constexpr uint16_t kBit[16] = {
4766 1, 2, 4, 8, 16, 32, 64, 128,
4767 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
4768 const auto vmask_bits =
Set(du,
static_cast<uint16_t
>(mask_bits));
4772template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4775 alignas(32)
constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4776 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
4780template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4783 alignas(32)
constexpr uint64_t kBit[8] = {1, 2, 4, 8};
4790template <
typename T>
4793 constexpr size_t N = 32 /
sizeof(T);
4794 constexpr size_t kNumBytes = (
N + 7) / 8;
4796 uint64_t mask_bits = 0;
4800 mask_bits &= (1ull <<
N) - 1;
4803 return detail::LoadMaskBits256(
d, mask_bits);
4810template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4816 return static_cast<uint32_t
>(_mm256_movemask_epi8(sign_bits));
4819template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4828 return _pext_u64(sign_bits8, 0xAAAAAAAAull);
4833 const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
4835 const auto compressed =
4836 _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
4837 return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
4841template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4846 return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
4849template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4854 return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
4860template <
typename T>
4863 constexpr size_t N = 32 /
sizeof(T);
4864 constexpr size_t kNumBytes = (
N + 7) / 8;
4875template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4882template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4888template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4894template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4896 constexpr uint64_t kAllBits = (1ull << (32 /
sizeof(T))) - 1;
4900template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4906template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4911template <
typename T>
4913 const Mask256<T> mask) {
4918template <
typename T>
4920 const Mask256<T> mask) {
4929template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4937 alignas(16)
constexpr uint32_t packed_array[256] = {
4939 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
4940 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
4941 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
4942 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
4943 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
4944 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
4945 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
4946 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
4947 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
4948 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
4949 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
4950 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
4951 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
4952 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
4953 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
4954 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
4955 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
4956 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
4957 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
4958 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
4959 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
4960 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
4961 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
4962 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
4963 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
4964 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
4965 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
4966 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
4967 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
4968 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
4969 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
4970 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
4971 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
4972 0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
4973 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
4974 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
4975 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
4976 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
4977 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
4978 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
4979 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
4980 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
4981 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
4987 const auto packed =
Set(d32, packed_array[mask_bits]);
4988 alignas(32)
constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4989 return packed >>
Load(d32, shifts);
4992template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4999 alignas(32)
constexpr uint32_t u32_indices[128] = {
5001 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
5002 10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
5003 12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
5004 10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
5005 14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
5006 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
5007 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
5008 10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
5009 return Load(d32, u32_indices + 8 * mask_bits);
5012template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
5014 uint64_t mask_bits) {
5021 alignas(16)
constexpr uint32_t packed_array[256] = {
5023 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
5024 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
5025 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
5026 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
5027 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
5028 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
5029 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
5030 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
5031 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
5032 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
5033 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
5034 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
5035 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
5036 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
5037 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
5038 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
5039 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
5040 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
5041 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
5042 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
5043 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
5044 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
5045 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
5046 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
5047 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
5048 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
5049 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
5050 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
5051 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
5052 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
5053 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
5054 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
5055 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
5056 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
5057 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
5058 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
5059 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
5060 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
5061 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
5062 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
5063 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
5064 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
5065 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
5071 const auto packed =
Set(d32, packed_array[mask_bits]);
5072 alignas(32)
constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
5073 return packed >>
Load(d32, shifts);
5076template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
5078 uint64_t mask_bits) {
5084 alignas(32)
constexpr uint32_t u32_indices[128] = {
5086 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
5087 8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
5088 8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
5089 8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
5090 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
5091 8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
5092 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
5093 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
5094 return Load(d32, u32_indices + 8 * mask_bits);
5096template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
5101 HWY_DASSERT(mask_bits < (1ull << (32 /
sizeof(T))));
5104 const Indices256<uint32_t> indices{IndicesFromBits(
d, mask_bits).raw};
5110template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
5115 const Half<
decltype(du)> duh;
5116 const auto half0 =
LowerHalf(duh, vu16);
5117 const auto half1 =
UpperHalf(duh, vu16);
5119 const uint64_t mask_bits0 = mask_bits & 0xFF;
5120 const uint64_t mask_bits1 = mask_bits >> 8;
5121 const auto compressed0 = detail::CompressBits(half0, mask_bits0);
5122 const auto compressed1 = detail::CompressBits(half1, mask_bits1);
5124 alignas(32) uint16_t all_true[16] = {};
5126 const size_t num_true0 =
PopCount(mask_bits0);
5127 Store(compressed0, duh, all_true);
5128 StoreU(compressed1, duh, all_true + num_true0);
5134 alignas(32) uint16_t all_false[16] = {};
5135 const size_t num_true1 =
PopCount(mask_bits1);
5136 Store(compressed1, duh, all_false + 8);
5137 StoreU(compressed0, duh, all_false + num_true1);
5139 const auto mask =
FirstN(du, num_true0 + num_true1);
5148template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)>
5153 HWY_DASSERT(mask_bits < (1ull << (32 /
sizeof(T))));
5156 const Indices256<uint32_t> indices{IndicesFromNotBits(
d, mask_bits).raw};
5162template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
5170template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5175template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5181 Mask256<uint64_t> mask) {
5185template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5187 constexpr size_t N = 32 /
sizeof(T);
5188 constexpr size_t kNumBytes = (
N + 7) / 8;
5190 uint64_t mask_bits = 0;
5194 mask_bits &= (1ull <<
N) - 1;
5202template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5206 const size_t count =
PopCount(mask_bits);
5212template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)>
5216 const size_t count =
PopCount(mask_bits);
5219 HWY_DASSERT(mask_bits < (1ull << (32 /
sizeof(T))));
5222 const Vec256<uint32_t> idx_and_mask = detail::IndicesFromBits(
d, mask_bits);
5226 const Mask256<MakeUnsigned<T>> mask_u{mask32.raw};
5228 const Vec256<T> compressed =
5230 Indices256<uint32_t>{idx_and_mask.raw}));
5237template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
5241 const size_t count =
PopCount(mask_bits);
5244#if HWY_MEM_OPS_MIGHT_FAULT
5247 alignas(32) T
buf[16];
5249 memcpy(unaligned,
buf, count *
sizeof(T));
5256template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5259 constexpr size_t N = 32 /
sizeof(T);
5260 constexpr size_t kNumBytes = (
N + 7) / 8;
5262 uint64_t mask_bits = 0;
5266 mask_bits &= (1ull <<
N) - 1;
5268 const size_t count =
PopCount(mask_bits);
// ------------------------------ LoadInterleaved3/4 (128-bit block helpers)

namespace detail {

template <typename T>
HWY_API void LoadTransposedBlocks3(Full256<T> d,
                                   const T* HWY_RESTRICT unaligned,
                                   Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
  constexpr size_t N = 32 / sizeof(T);
  const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);  // blocks 1,0
  const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);  // blocks 3,2
  const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);  // blocks 5,4
  // Gather blocks (3,0), (4,1), (5,2) so that A/B/C each hold one in-order
  // pair of the three interleaved streams.
  A = ConcatUpperLower(d, v32, v10);
  B = ConcatLowerUpper(d, v54, v10);
  C = ConcatUpperLower(d, v54, v32);
}
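// Usage sketch (not part of the original header): this helper backs
// LoadInterleaved3 (declared in generic_ops-inl.h), which deinterleaves e.g.
// packed RGB pixels. `LoadRGB` is an illustrative name.
template <class D>
HWY_MAYBE_UNUSED void LoadRGB(D d, const TFromD<D>* HWY_RESTRICT rgb,
                              VFromD<D>& r, VFromD<D>& g, VFromD<D>& b) {
  LoadInterleaved3(d, rgb, r, g, b);
}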
template <typename T>
HWY_API void LoadTransposedBlocks4(Full256<T> d,
                                   const T* HWY_RESTRICT unaligned,
                                   Vec256<T>& A, Vec256<T>& B, Vec256<T>& C,
                                   Vec256<T>& D) {
  constexpr size_t N = 32 / sizeof(T);
  const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);  // blocks 1,0
  const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);  // blocks 3,2
  const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);  // blocks 5,4
  const Vec256<T> v76 = LoadU(d, unaligned + 3 * N);  // blocks 7,6
  A = ConcatLowerLower(d, v54, v10);  // blocks 4,0
  B = ConcatUpperUpper(d, v54, v10);  // blocks 5,1
  C = ConcatLowerLower(d, v76, v32);  // blocks 6,2
  D = ConcatUpperUpper(d, v76, v32);  // blocks 7,3
}
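// Usage sketch (not part of the original header): the 4-vector variant backs
// LoadInterleaved4, e.g. for RGBA pixels. `LoadRGBA` is an illustrative name.
template <class D>
HWY_MAYBE_UNUSED void LoadRGBA(D d, const TFromD<D>* HWY_RESTRICT rgba,
                               VFromD<D>& r, VFromD<D>& g, VFromD<D>& b,
                               VFromD<D>& a) {
  LoadInterleaved4(d, rgba, r, g, b, a);
}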
// ------------------------------ StoreInterleaved2/3/4 (128-bit block helpers)

template <typename T>
HWY_API void StoreTransposedBlocks2(const Vec256<T> i, const Vec256<T> j,
                                    Full256<T> d, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  const auto out0 = ConcatLowerLower(d, j, i);  // blocks j0,i0
  const auto out1 = ConcatUpperUpper(d, j, i);  // blocks j1,i1
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
}
template <typename T>
HWY_API void StoreTransposedBlocks3(const Vec256<T> i, const Vec256<T> j,
                                    const Vec256<T> k, Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  const auto out0 = ConcatLowerLower(d, j, i);  // blocks j0,i0
  const auto out1 = ConcatUpperLower(d, i, k);  // blocks i1,k0
  const auto out2 = ConcatUpperUpper(d, k, j);  // blocks k1,j1
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);
}
template <typename T>
HWY_API void StoreTransposedBlocks4(const Vec256<T> i, const Vec256<T> j,
                                    const Vec256<T> k, const Vec256<T> l,
                                    Full256<T> d, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  // Write the lower blocks of all four inputs, then the upper blocks.
  const auto out0 = ConcatLowerLower(d, j, i);  // blocks j0,i0
  const auto out1 = ConcatLowerLower(d, l, k);  // blocks l0,k0
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  const auto out2 = ConcatUpperUpper(d, j, i);  // blocks j1,i1
  const auto out3 = ConcatUpperUpper(d, l, k);  // blocks l1,k1
  StoreU(out2, d, unaligned + 2 * N);
  StoreU(out3, d, unaligned + 3 * N);
}
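// Usage sketch (not part of the original header): these helpers back
// StoreInterleaved2/3/4 (declared in generic_ops-inl.h), e.g. re-interleaving
// planar RGB into packed pixels. `StoreRGB` is an illustrative name.
template <class D>
HWY_MAYBE_UNUSED void StoreRGB(D d, VFromD<D> r, VFromD<D> g, VFromD<D> b,
                               TFromD<D>* HWY_RESTRICT rgb) {
  StoreInterleaved3(r, g, b, d, rgb);
}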
// ------------------------------ Reductions

// Returns the result of the reduction, broadcast to all lanes. "v3210" names
// the four 32-bit lanes of each 128-bit block; callers have already combined
// the two blocks.
template <typename T>
HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = v3210 + v1032;
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = Min(v3210, v1032);
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = Max(v3210, v1032);
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
}

// For 64-bit lanes, a single swap of the two lanes in each block suffices.
template <typename T>
HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return v10 + v01;
}
template <typename T>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return Max(v10, v01);
}

}  // namespace detail
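// Worked example (not part of the original header): label one block's u32
// lanes {a,b,c,d} (lane 0 = a). Shuffle1032 yields {c,d,a,b}, so the first
// add gives {a+c, b+d, a+c, b+d}. Shuffle0321 rotates this to
// {b+d, a+c, b+d, a+c}, and the second add leaves a+b+c+d in every lane.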
// Public reductions: combine the two 128-bit blocks, then reduce within one
// block using the helpers above. The result is broadcast to all lanes.
template <typename T>
HWY_API Vec256<T> SumOfLanes(Full256<T> /* tag */, const Vec256<T> vHL) {
  const Vec256<T> vLH = SwapAdjacentBlocks(vHL);
  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
}
template <typename T>
HWY_API Vec256<T> MinOfLanes(Full256<T> /* tag */, const Vec256<T> vHL) {
  const Vec256<T> vLH = SwapAdjacentBlocks(vHL);
  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
}
template <typename T>
HWY_API Vec256<T> MaxOfLanes(Full256<T> /* tag */, const Vec256<T> vHL) {
  const Vec256<T> vLH = SwapAdjacentBlocks(vHL);
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
}
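// Usage sketch (not part of the original header): reductions broadcast their
// result, so extract it with GetLane. `SumArray8` is an illustrative name.
HWY_MAYBE_UNUSED float SumArray8(const float* HWY_RESTRICT p) {
  const Full256<float> d;
  return GetLane(SumOfLanes(d, LoadU(d, p)));  // sum of 8 consecutive floats
}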