28#if HWY_COMPILER_GCC_ACTUAL
35#if HWY_COMPILER_CLANGCL
43#include <avx2intrin.h>
44#include <f16cintrin.h>
47#include <avx512fintrin.h>
48#include <avx512vlintrin.h>
49#include <avx512bwintrin.h>
50#include <avx512dqintrin.h>
51#include <avx512vlbwintrin.h>
52#include <avx512vldqintrin.h>
53#include <avx512bitalgintrin.h>
54#include <avx512vlbitalgintrin.h>
55#include <avx512vpopcntdqintrin.h>
56#include <avx512vpopcntdqvlintrin.h>
64#include <sanitizer/msan_interface.h>
122 return *
this = (*
this * other);
125 return *
this = (*
this / other);
128 return *
this = (*
this + other);
131 return *
this = (*
this - other);
134 return *
this = (*
this & other);
137 return *
this = (*
this | other);
140 return *
this = (*
this ^ other);
163 return _mm512_castpd_si512(
v);
192template <
typename T,
typename FromT>
202 return Vec512<T>{_mm512_setzero_si512()};
223 _mm512_set1_epi64(
static_cast<long long>(t))};
236 _mm512_set1_epi64(
static_cast<long long>(t))};
253 return Vec512<T>{_mm512_undefined_epi32()};
273 Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
339 using VU =
VFromD<
decltype(du)>;
340 const __m512i ret = _mm512_ternarylogic_epi64(
350 using VU =
VFromD<
decltype(du)>;
351 const __m512i ret = _mm512_ternarylogic_epi64(
361 using VU =
VFromD<
decltype(du)>;
362 const __m512i ret = _mm512_ternarylogic_epi64(
372 using VU =
VFromD<
decltype(du)>;
398#if HWY_TARGET == HWY_AVX3_DL
400#ifdef HWY_NATIVE_POPCNT
401#undef HWY_NATIVE_POPCNT
403#define HWY_NATIVE_POPCNT
440 static_assert(
IsFloat<T>(),
"Only makes sense for floating-point");
456 const __m512i out = _mm512_ternarylogic_epi32(
482template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
485 const uint32_t all = ~uint32_t{0};
487 m.raw =
static_cast<decltype(m.raw)
>((n > 255) ? all : _bzhi_u32(all, n));
491template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
493 const uint64_t
bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0};
494 return Mask512<T>{
static_cast<__mmask64
>(
bits)};
504 const uint64_t all = ~uint64_t{0};
506 m.
raw =
static_cast<decltype(m.raw)
>((n > 255) ? all : _bzhi_u64(all, n));
509 return detail::FirstN<T>(n);
646 static_assert(
IsSigned<T>(),
"Only works for signed/float");
651template <
typename T, HWY_IF_FLOAT(T)>
881template <
int kBits,
typename T, HWY_IF_LANE_SIZE(T, 1)>
888 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
913 return shifted &
Set(d8, 0xFF >> kBits);
936 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
937 return (shifted ^ shifted_sign) - shifted_sign;
944 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
950 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
981template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
986 return shifted &
Set(d8,
static_cast<T
>((0xFF <<
bits) & 0xFF));
1008 return shifted &
Set(d8,
static_cast<uint8_t
>(0xFF >>
bits));
1029 const auto shifted_sign =
1031 return (shifted ^ shifted_sign) - shifted_sign;
1052template <
typename T, HWY_IF_SIGNED(T)>
1193#ifdef HWY_NATIVE_I64MULLO
1194#undef HWY_NATIVE_I64MULLO
1196#define HWY_NATIVE_I64MULLO
1238template <
typename T, HWY_IF_FLOAT(T)>
1243template <
typename T, HWY_IF_NOT_FLOAT(T)>
1344 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1348 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1354 _mm512_roundscale_ps(
v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1358 _mm512_roundscale_pd(
v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1364 _mm512_roundscale_ps(
v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1368 _mm512_roundscale_pd(
v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1374 _mm512_roundscale_ps(
v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1378 _mm512_roundscale_pd(
v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1387template <
typename TFrom,
typename TTo>
1389 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1395template <
typename T>
1400template <
typename T>
1405template <
typename T>
1410template <
typename T>
1418template <
typename T>
1426template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1430template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1432 return Mask512<T>{_mm512_cmpeq_epi16_mask(
a.raw, b.raw)};
1434template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1436 return Mask512<T>{_mm512_cmpeq_epi32_mask(
a.raw, b.raw)};
1438template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1440 return Mask512<T>{_mm512_cmpeq_epi64_mask(
a.raw, b.raw)};
1453template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1457template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1459 return Mask512<T>{_mm512_cmpneq_epi16_mask(
a.raw, b.raw)};
1461template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1463 return Mask512<T>{_mm512_cmpneq_epi32_mask(
a.raw, b.raw)};
1465template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1467 return Mask512<T>{_mm512_cmpneq_epi64_mask(
a.raw, b.raw)};
1524template <
typename T>
1529template <
typename T>
1538template <
typename T>
1542template <
typename T>
1546template <
typename T>
1550template <
typename T>
1557template <
typename T>
1590 return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(
v.raw))};
1603template <
typename T>
1612template <
typename T>
1614#if HWY_COMPILER_HAS_MASK_INTRINSICS
1620template <
typename T>
1622#if HWY_COMPILER_HAS_MASK_INTRINSICS
1628template <
typename T>
1630#if HWY_COMPILER_HAS_MASK_INTRINSICS
1633 return Mask512<T>{
static_cast<uint16_t
>(~m.raw & 0xFFFF)};
1636template <
typename T>
1638#if HWY_COMPILER_HAS_MASK_INTRINSICS
1641 return Mask512<T>{
static_cast<uint8_t
>(~m.raw & 0xFF)};
1645template <
typename T>
1648#if HWY_COMPILER_HAS_MASK_INTRINSICS
1654template <
typename T>
1657#if HWY_COMPILER_HAS_MASK_INTRINSICS
1663template <
typename T>
1666#if HWY_COMPILER_HAS_MASK_INTRINSICS
1672template <
typename T>
1675#if HWY_COMPILER_HAS_MASK_INTRINSICS
1682template <
typename T>
1685#if HWY_COMPILER_HAS_MASK_INTRINSICS
1691template <
typename T>
1694#if HWY_COMPILER_HAS_MASK_INTRINSICS
1700template <
typename T>
1703#if HWY_COMPILER_HAS_MASK_INTRINSICS
1709template <
typename T>
1712#if HWY_COMPILER_HAS_MASK_INTRINSICS
1719template <
typename T>
1722#if HWY_COMPILER_HAS_MASK_INTRINSICS
1728template <
typename T>
1731#if HWY_COMPILER_HAS_MASK_INTRINSICS
1737template <
typename T>
1740#if HWY_COMPILER_HAS_MASK_INTRINSICS
1746template <
typename T>
1749#if HWY_COMPILER_HAS_MASK_INTRINSICS
1756template <
typename T>
1759#if HWY_COMPILER_HAS_MASK_INTRINSICS
1765template <
typename T>
1768#if HWY_COMPILER_HAS_MASK_INTRINSICS
1774template <
typename T>
1777#if HWY_COMPILER_HAS_MASK_INTRINSICS
1783template <
typename T>
1786#if HWY_COMPILER_HAS_MASK_INTRINSICS
1793template <
typename T>
1796#if HWY_COMPILER_HAS_MASK_INTRINSICS
1802template <
typename T>
1805#if HWY_COMPILER_HAS_MASK_INTRINSICS
1808 return Mask512<T>{
static_cast<__mmask32
>(~(
a.raw ^ b.
raw) & 0xFFFFFFFF)};
1811template <
typename T>
1814#if HWY_COMPILER_HAS_MASK_INTRINSICS
1817 return Mask512<T>{
static_cast<__mmask16
>(~(
a.raw ^ b.
raw) & 0xFFFF)};
1820template <
typename T>
1823#if HWY_COMPILER_HAS_MASK_INTRINSICS
1826 return Mask512<T>{
static_cast<__mmask8
>(~(
a.raw ^ b.
raw) & 0xFF)};
1832template <
typename T>
1837template <
typename T>
1842template <
typename T>
1847template <
typename T>
1852template <
typename T>
1857template <
typename T>
1909template <
typename T>
1911 return Vec512<T>{_mm512_load_si512(aligned)};
1922template <
typename T>
1924 return Vec512<T>{_mm512_loadu_si512(p)};
1937template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1943template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1946 return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
1949template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1952 return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
1955template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1958 return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
1975template <
typename T>
1979 return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
1983 const __m128 x4 = _mm_loadu_ps(p);
1989 const __m128d x2 = _mm_loadu_pd(p);
1995template <
typename T>
1998 _mm512_store_si512(
reinterpret_cast<__m512i*
>(aligned),
v.raw);
2002 _mm512_store_ps(aligned,
v.raw);
2006 _mm512_store_pd(aligned,
v.raw);
2009template <
typename T>
2012 _mm512_storeu_si512(
reinterpret_cast<__m512i*
>(p),
v.raw);
2016 _mm512_storeu_ps(p,
v.raw);
2020 _mm512_storeu_pd(p,
v.raw);
2025template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2028 _mm512_mask_storeu_epi8(p, m.
raw,
v.raw);
2031template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2034 _mm512_mask_storeu_epi16(p, m.raw,
v.raw);
2037template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2040 _mm512_mask_storeu_epi32(p, m.raw,
v.raw);
2043template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2046 _mm512_mask_storeu_epi64(p, m.raw,
v.raw);
2051 _mm512_mask_storeu_ps(p, m.
raw,
v.raw);
2056 _mm512_mask_storeu_pd(p, m.
raw,
v.raw);
2061template <
typename T>
2064 _mm512_stream_si512(
reinterpret_cast<__m512i*
>(aligned),
v.raw);
2068 _mm512_stream_ps(aligned,
v.raw);
2072 _mm512_stream_pd(aligned,
v.raw);
2083template <
typename T>
2087 _mm512_i32scatter_epi32(base,
offset.raw,
v.raw, 1);
2089template <
typename T>
2093 _mm512_i32scatter_epi32(base, index.
raw,
v.raw, 4);
2096template <
typename T>
2100 _mm512_i64scatter_epi64(base,
offset.raw,
v.raw, 1);
2102template <
typename T>
2106 _mm512_i64scatter_epi64(base, index.
raw,
v.raw, 8);
2111template <
typename T,
typename Offset>
2114 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2117template <
typename T,
typename Index>
2120 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2127 _mm512_i32scatter_ps(base,
offset.raw,
v.raw, 1);
2132 _mm512_i32scatter_ps(base, index.
raw,
v.raw, 4);
2138 _mm512_i64scatter_pd(base,
offset.raw,
v.raw, 1);
2143 _mm512_i64scatter_pd(base, index.
raw,
v.raw, 8);
2150template <
typename T>
2157template <
typename T>
2162 return Vec512<T>{_mm512_i32gather_epi32(index.
raw, base, 4)};
2165template <
typename T>
2172template <
typename T>
2177 return Vec512<T>{_mm512_i64gather_epi64(index.
raw, base, 8)};
2182template <
typename T,
typename Offset>
2185 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2188template <
typename T,
typename Index>
2191 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2223template <
typename T>
2225 return Vec256<T>{_mm512_castsi512_si256(
v.raw)};
2234template <
typename T>
2241template <
typename T>
2243 return Vec256<T>{_mm512_extracti32x8_epi32(
v.raw, 1)};
2253template <
typename T>
2257 alignas(64) T lanes[64 /
sizeof(T)];
2263template <
typename T>
2267 alignas(64) T lanes[64 /
sizeof(T)];
2270 return Load(
d, lanes);
2274template <
typename T>
2281template <
typename T>
2286 return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.
raw, 0)};
2308template <
typename T>
2311 return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.
raw, 1)};
2326template <
int kBytes,
typename T>
2328 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2329 return Vec512<T>{_mm512_bslli_epi128(
v.raw, kBytes)};
2332template <
int kBytes,
typename T>
2339template <
int kLanes,
typename T>
2345template <
int kLanes,
typename T>
2351template <
int kBytes,
typename T>
2353 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2354 return Vec512<T>{_mm512_bsrli_epi128(
v.raw, kBytes)};
2358template <
int kLanes,
typename T>
2366template <
int kBytes,
typename T,
class V = Vec512<T>>
2378 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
2380 const __m512i lo = _mm512_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
2384 _mm512_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
2390 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2391 constexpr _MM_PERM_ENUM perm =
static_cast<_MM_PERM_ENUM
>(0x55 * kLane);
2396 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2397 constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2404 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
2406 const __m512i lo = _mm512_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
2410 _mm512_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
2416 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2417 constexpr _MM_PERM_ENUM perm =
static_cast<_MM_PERM_ENUM
>(0x55 * kLane);
2422 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2423 constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2430 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2431 constexpr _MM_PERM_ENUM perm =
static_cast<_MM_PERM_ENUM
>(0x55 * kLane);
2436 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2437 constexpr _MM_PERM_ENUM perm =
static_cast<_MM_PERM_ENUM
>(0xFF * kLane);
2450template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2452 return Vec512<T>{_mm512_shuffle_epi32(
v.raw, _MM_PERM_CDAB)};
2460template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2468template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2476template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2544template <
typename T>
2549template <
typename T,
typename TI>
2551 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
2552#if HWY_IS_DEBUG_BUILD
2555 AllTrue(di, Lt(vec,
Set(di,
static_cast<TI
>(64 /
sizeof(T))))));
2560template <
typename T,
typename TI>
2562 const Rebind<TI,
decltype(
d)> di;
2566template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2571template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2573 return Vec512<T>{_mm512_permutexvar_epi64(idx.raw,
v.raw)};
2587template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2590 alignas(64)
constexpr int16_t kReverse[32] = {
2591 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
2592 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
2595 _mm512_permutexvar_epi16(idx.
raw,
BitCast(di,
v).raw)});
2598template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2600 alignas(64)
constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
2601 7, 6, 5, 4, 3, 2, 1, 0};
2605template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2607 alignas(64)
constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2613template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2619template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2624template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2631template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2634 alignas(64)
constexpr int16_t kReverse4[32] = {
2635 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
2636 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
2639 _mm512_permutexvar_epi16(idx.
raw,
BitCast(di,
v).raw)});
2642template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2647template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2649 return Vec512<T>{_mm512_permutex_epi64(
v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2652 return Vec512<double>{_mm512_permutex_pd(
v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2657template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2660 alignas(64)
constexpr int16_t kReverse8[32] = {
2661 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
2662 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
2665 _mm512_permutexvar_epi16(idx.
raw,
BitCast(di,
v).raw)});
2668template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2671 alignas(64)
constexpr int32_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
2672 15, 14, 13, 12, 11, 10, 9, 8};
2673 const Vec512<int32_t> idx =
Load(di, kReverse8);
2675 _mm512_permutexvar_epi32(idx.raw,
BitCast(di,
v).raw)});
2678template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2782template <
typename T,
class V = Vec512<T>>
2791template <
typename T,
typename TW = MakeW
ide<T>>
2795template <
typename T,
typename TW = MakeW
ide<T>>
2800template <
typename T,
typename TW = MakeW
ide<T>>
2808template <
typename T>
2811 return Vec512<T>{_mm512_shuffle_i32x4(lo.
raw, hi.
raw, _MM_PERM_BABA)};
2825template <
typename T>
2828 return Vec512<T>{_mm512_shuffle_i32x4(lo.
raw, hi.
raw, _MM_PERM_DCDC)};
2842template <
typename T>
2845 return Vec512<T>{_mm512_shuffle_i32x4(lo.
raw, hi.
raw, _MM_PERM_BADC)};
2859template <
typename T>
2864 const __mmask32 mask = (0x0000FFFF);
2870 const __mmask16 mask = (0x00FF);
2876 const __mmask8 mask = (0x0F);
2882template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2885#if HWY_TARGET == HWY_AVX3_DL
2886 alignas(64)
constexpr uint8_t kIdx[64] = {
2887 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
2888 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51,
2889 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77,
2890 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103,
2891 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
2895 __mmask64{0xFFFFFFFFFFFFFFFFull},
BitCast(du, hi).raw)});
2904 alignas(64)
constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
2909template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2912 alignas(64)
constexpr uint16_t kIdx[32] = {
2913 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2914 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
2915 return BitCast(
d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
2917 __mmask32{0xFFFFFFFFu},
BitCast(du, hi).raw)});
2920template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2923 alignas(64)
constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2924 17, 19, 21, 23, 25, 27, 29, 31};
2925 return BitCast(
d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2927 __mmask16{0xFFFF},
BitCast(du, hi).raw)});
2933 alignas(64)
constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2934 17, 19, 21, 23, 25, 27, 29, 31};
2936 __mmask16{0xFFFF}, hi.
raw)};
2939template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2942 alignas(64)
constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2943 return BitCast(
d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2944 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
2951 alignas(64)
constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2953 __mmask8{0xFF}, hi.
raw)};
2958template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2961#if HWY_TARGET == HWY_AVX3_DL
2962 alignas(64)
constexpr uint8_t kIdx[64] = {
2963 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
2964 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
2965 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
2966 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
2967 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
2971 __mmask64{0xFFFFFFFFFFFFFFFFull},
BitCast(du, hi).raw)});
2981 alignas(64)
constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
2986template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2989 alignas(64)
constexpr uint16_t kIdx[32] = {
2990 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
2991 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
2992 return BitCast(
d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi16(
2994 __mmask32{0xFFFFFFFFu},
BitCast(du, hi).raw)});
2997template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3000 alignas(64)
constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3001 16, 18, 20, 22, 24, 26, 28, 30};
3002 return BitCast(
d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
3004 __mmask16{0xFFFF},
BitCast(du, hi).raw)});
3010 alignas(64)
constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3011 16, 18, 20, 22, 24, 26, 28, 30};
3013 __mmask16{0xFFFF}, hi.
raw)};
3016template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3019 alignas(64)
constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3020 return BitCast(
d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
3021 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3028 alignas(64)
constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3030 __mmask8{0xFF}, hi.
raw)};
3035template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3037 return Vec512<T>{_mm512_shuffle_epi32(
v.raw, _MM_PERM_CCAA)};
3043template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3050template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3052 return Vec512<T>{_mm512_shuffle_epi32(
v.raw, _MM_PERM_DDBB)};
3058template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3065template <
typename T>
3067 constexpr size_t s =
sizeof(T);
3068 constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
3074template <
typename T>
3076 return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.
raw, even.
raw)};
3081 _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.
raw, even.
raw)};
3086 _mm512_mask_blend_pd(__mmask8{0x33u}, odd.
raw, even.
raw)};
3091template <
typename T>
3093 return Vec512<T>{_mm512_shuffle_i32x4(
v.raw,
v.raw, _MM_PERM_CDAB)};
3097 return Vec512<float>{_mm512_shuffle_f32x4(
v.raw,
v.raw, _MM_PERM_CDAB)};
3106template <
typename T>
3108 return Vec512<T>{_mm512_shuffle_i32x4(
v.raw,
v.raw, _MM_PERM_ABCD)};
3111 return Vec512<float>{_mm512_shuffle_f32x4(
v.raw,
v.raw, _MM_PERM_ABCD)};
3121template <
typename T,
typename TI>
3127template <
typename T,
typename TI,
size_t NI>
3130 const Half<
decltype(d512)> d256;
3131 const Half<
decltype(d256)> d128;
3134 const auto from_512 =
3140template <
typename T,
typename TI>
3147template <
typename T,
size_t N,
typename TI>
3150 const Half<
decltype(d512)> d256;
3151 const Half<
decltype(d256)> d128;
3154 const auto bytes_512 =
3158template <
typename T,
typename TI>
3231 const Rebind<uint16_t,
decltype(df32)> du16;
3251 alignas(64)
static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3262 alignas(64)
static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3264 const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
3274 _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
3277 alignas(16)
static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
3279 const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
3288 alignas(64)
static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3290 const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3299 alignas(16)
static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
3300 0, 4, 8, 12, 0, 4, 8, 12};
3302 const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
3311 alignas(64)
static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3313 const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3329 const Rebind<int32_t,
decltype(dbf16)> di32;
3330 const Rebind<uint32_t,
decltype(dbf16)> du32;
3331 const Rebind<uint16_t,
decltype(dbf16)> du16;
3340 const Repartition<uint32_t,
decltype(dbf16)> du32;
3366 alignas(16)
static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
3370 alignas(16)
static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
3372 _mm512_permutexvar_epi32(
LoadDup128(d32, kIndex32).raw, quads.raw)};
3380#if HWY_TARGET == HWY_AVX3_DL
3383 alignas(16)
static constexpr uint8_t k8From64[16] = {
3384 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
3386 _mm512_permutexvar_epi8(
LoadDup128(d8, k8From64).raw,
v.raw)};
3390 alignas(64)
constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3391 0, 2, 4, 6, 8, 10, 12, 14};
3393 _mm512_permutexvar_epi32(
Load(d32, kEven).raw,
v.raw)};
3401 alignas(16)
static constexpr uint16_t k16From64[8] = {
3402 0, 4, 8, 12, 16, 20, 24, 28};
3404 _mm512_permutexvar_epi16(
LoadDup128(d16, k16From64).raw,
v.raw)};
3411 alignas(64)
constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3412 0, 2, 4, 6, 8, 10, 12, 14};
3414 _mm512_permutexvar_epi32(
Load(d32, kEven).raw,
v.raw)};
3420#if HWY_TARGET == HWY_AVX3_DL
3422 alignas(16)
static constexpr uint8_t k8From32[16] = {
3423 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
3425 _mm512_permutexvar_epi32(
LoadDup128(d8, k8From32).raw,
v.raw)};
3430 alignas(16)
static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
3434 alignas(16)
static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
3436 _mm512_permutexvar_epi32(
LoadDup128(d32, kIndex32).raw, quads.raw)};
3444 alignas(64)
static constexpr uint16_t k16From32[32] = {
3445 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3446 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
3448 _mm512_permutexvar_epi16(
Load(d16, k16From32).raw,
v.raw)};
3454#if HWY_TARGET == HWY_AVX3_DL
3456 alignas(64)
static constexpr uint8_t k8From16[64] = {
3457 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3458 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
3459 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3460 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
3462 _mm512_permutexvar_epi8(
Load(d8, k8From16).raw,
v.raw)};
3465 alignas(16)
static constexpr uint32_t k16From32[4] = {
3466 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
3468 alignas(64)
static constexpr uint32_t kIndex32[16] = {
3469 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
3471 _mm512_permutexvar_epi32(
Load(d32, kIndex32).raw, quads.raw)};
3513#if !defined(HWY_DISABLE_PCLMUL_AES)
3516#ifdef HWY_NATIVE_AES
3517#undef HWY_NATIVE_AES
3519#define HWY_NATIVE_AES
3524#if HWY_TARGET == HWY_AVX3_DL
3528 const Half<
decltype(
d)> d2;
3536#if HWY_TARGET == HWY_AVX3_DL
3540 const Half<
decltype(
d)> d2;
3548#if HWY_TARGET == HWY_AVX3_DL
3551 alignas(64) uint64_t
a[8];
3552 alignas(64) uint64_t b[8];
3557 for (
size_t i = 0; i < 8; i += 2) {
3566#if HWY_TARGET == HWY_AVX3_DL
3569 alignas(64) uint64_t
a[8];
3570 alignas(64) uint64_t b[8];
3575 for (
size_t i = 0; i < 8; i += 2) {
3588template <
typename T,
typename T2>
3591 for (
size_t i = 0; i < 64 /
sizeof(T); ++i) {
3595 return Load(
d, lanes);
3604template <
typename T>
3606#if HWY_COMPILER_HAS_MASK_INTRINSICS
3607 return _kortestz_mask64_u8(mask.
raw, mask.
raw);
3609 return mask.
raw == 0;
3612template <
typename T>
3614#if HWY_COMPILER_HAS_MASK_INTRINSICS
3615 return _kortestz_mask32_u8(mask.
raw, mask.
raw);
3617 return mask.
raw == 0;
3620template <
typename T>
3622#if HWY_COMPILER_HAS_MASK_INTRINSICS
3623 return _kortestz_mask16_u8(mask.
raw, mask.
raw);
3625 return mask.
raw == 0;
3628template <
typename T>
3630#if HWY_COMPILER_HAS_MASK_INTRINSICS
3631 return _kortestz_mask8_u8(mask.
raw, mask.
raw);
3633 return mask.
raw == 0;
3639template <
typename T>
3646template <
typename T>
3648#if HWY_COMPILER_HAS_MASK_INTRINSICS
3649 return _kortestc_mask64_u8(mask.
raw, mask.
raw);
3651 return mask.
raw == 0xFFFFFFFFFFFFFFFFull;
3654template <
typename T>
3656#if HWY_COMPILER_HAS_MASK_INTRINSICS
3657 return _kortestc_mask32_u8(mask.
raw, mask.
raw);
3659 return mask.
raw == 0xFFFFFFFFull;
3662template <
typename T>
3664#if HWY_COMPILER_HAS_MASK_INTRINSICS
3665 return _kortestc_mask16_u8(mask.
raw, mask.
raw);
3667 return mask.
raw == 0xFFFFull;
3670template <
typename T>
3672#if HWY_COMPILER_HAS_MASK_INTRINSICS
3673 return _kortestc_mask8_u8(mask.
raw, mask.
raw);
3675 return mask.
raw == 0xFFull;
3681template <
typename T>
3687template <
typename T>
3697template <
typename T>
3700 const size_t kNumBytes = 8 /
sizeof(T);
3706template <
typename T>
3711template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
3717template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
3719 const Mask512<T> mask) {
3723template <
typename T>
3733#ifdef HWY_NATIVE_COMPRESS8
3734#undef HWY_NATIVE_COMPRESS8
3736#define HWY_NATIVE_COMPRESS8
3741#if HWY_TARGET == HWY_AVX3_DL
3775 _mm_mask_compressstoreu_epi8(unaligned, mask.
raw,
v.raw);
3780 _mm256_mask_compressstoreu_epi8(unaligned, mask.
raw,
v.raw);
3785 _mm512_mask_compressstoreu_epi8(unaligned, mask.
raw,
v.raw);
3793 _mm_mask_compressstoreu_epi16(unaligned, mask.
raw,
v.raw);
3798 _mm256_mask_compressstoreu_epi16(unaligned, mask.
raw,
v.raw);
3803 _mm512_mask_compressstoreu_epi16(unaligned, mask.
raw,
v.raw);
3828 _mm_mask_compressstoreu_epi32(unaligned, mask.
raw,
v.raw);
3833 _mm256_mask_compressstoreu_epi32(unaligned, mask.
raw,
v.raw);
3838 _mm512_mask_compressstoreu_epi32(unaligned, mask.
raw,
v.raw);
3846 _mm_mask_compressstoreu_epi64(unaligned, mask.
raw,
v.raw);
3851 _mm256_mask_compressstoreu_epi64(unaligned, mask.
raw,
v.raw);
3856 _mm512_mask_compressstoreu_epi64(unaligned, mask.
raw,
v.raw);
3866 const Rebind<uint32_t,
decltype(
d)> d32;
3869 const uint64_t mask_bits{mask.
raw};
3871 using M32 =
MFromD<
decltype(d32)>;
3872 const M32 m0{
static_cast<typename M32::Raw
>(mask_bits)};
3880 const Rebind<int32_t,
decltype(
d)> di32;
3882 const MFromD<
decltype(du32)> mask32{
static_cast<__mmask8
>(mask.
raw)};
3892 const Rebind<int32_t,
decltype(
d)> di32;
3900template <
typename T,
size_t N>
3918 const uint64_t mask_bits{mask.
raw};
3919 const Half<
decltype(
d)> dh;
3920 const Rebind<uint32_t,
decltype(dh)> d32;
3935 const uint64_t mask_bits{mask.
raw};
3937 const Rebind<uint32_t,
decltype(dq)> d32;
3946 static_cast<uint16_t
>((mask_bits >> 16) & 0xFFFFu)};
3948 static_cast<uint16_t
>((mask_bits >> 32) & 0xFFFFu)};
3969 const Half<
decltype(
d)> dh;
3975 const uint64_t mask_bits{mask.
raw};
3976 const uint64_t maskL = mask_bits & 0xFFFF;
3977 const uint64_t maskH = mask_bits >> 16;
3987 StoreU(demoted0, dh, unaligned);
3992template <
typename T>
4010template <
class V,
class M, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x6)>
4015#if HWY_TARGET == HWY_AVX3_DL
4022template <
class V,
class M, HWY_IF_LANE_SIZE_V(V, 4)>
4030template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4033 alignas(16)
constexpr uint64_t packed_array[256] = {
4037 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
4038 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
4039 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
4040 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
4041 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
4042 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
4043 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
4044 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
4045 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
4046 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
4047 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
4048 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
4049 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
4050 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
4051 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
4052 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
4053 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
4054 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
4055 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
4056 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
4057 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
4058 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
4059 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
4060 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
4061 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
4062 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
4063 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
4064 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
4065 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
4066 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
4067 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
4068 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
4069 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
4070 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
4071 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
4072 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
4073 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
4074 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
4075 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
4076 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
4077 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
4078 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
4079 0x10765432, 0x17654320, 0x07654321, 0x76543210};
4085 const auto packed =
Set(du64, packed_array[mask.
raw]);
4086 alignas(64)
constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4093template <
class V,
class M, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
4098template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4101 alignas(16)
constexpr uint64_t packed_array[256] = {
4105 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
4106 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
4107 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
4108 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
4109 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
4110 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
4111 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
4112 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
4113 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
4114 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
4115 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
4116 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
4117 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
4118 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
4119 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
4120 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
4121 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
4122 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
4123 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
4124 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
4125 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
4126 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
4127 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
4128 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
4129 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
4130 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
4131 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
4132 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
4133 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
4134 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
4135 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
4136 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
4137 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
4138 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
4139 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
4140 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
4141 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
4142 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
4143 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
4144 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
4145 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
4146 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
4147 0x76543210, 0x76543201, 0x76543210, 0x76543210};
4153 const auto packed =
Set(du64, packed_array[mask.
raw]);
4154 alignas(64)
constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4161template <
class V,
class M, hwy::EnableIf<(sizeof(V) > 16)>* =
nullptr>
4174template <
class V,
class D, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x6)>
4180#if HWY_TARGET == HWY_AVX3_DL
4190template <
class V,
class D, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x110)>
4195 using TU =
TFromD<
decltype(du)>;
4196 TU*
HWY_RESTRICT pu =
reinterpret_cast<TU*
>(unaligned);
4207 _mm512_mask_compressstoreu_ps(unaligned, mask.
raw,
v.raw);
4208 const size_t count =
PopCount(uint64_t{mask.
raw});
4216 _mm512_mask_compressstoreu_pd(unaligned, mask.
raw,
v.raw);
4217 const size_t count =
PopCount(uint64_t{mask.
raw});
4223template <
class D,
typename T = TFromD<D>>
4251template <_MM_PERM_ENUM kPerm,
typename T>
4255template <_MM_PERM_ENUM kPerm>
4259template <_MM_PERM_ENUM kPerm>
4272template <
typename T>
4276 constexpr size_t N = 64 /
sizeof(T);
4299template <
typename T>
4304 constexpr size_t N = 64 /
sizeof(T);
4334template <
typename T>
4338 constexpr size_t N = 64 /
sizeof(T);
4341 const auto j1_i1_j0_i0 =
4343 const auto j3_i3_j2_i2 =
4345 StoreU(j1_i1_j0_i0,
d, unaligned + 0 *
N);
4346 StoreU(j3_i3_j2_i2,
d, unaligned + 1 *
N);
4357template <
typename T>
4361 constexpr size_t N = 64 /
sizeof(T);
4373 StoreU(out0,
d, unaligned + 0 *
N);
4374 StoreU(out1,
d, unaligned + 1 *
N);
4375 StoreU(out2,
d, unaligned + 2 *
N);
4388template <
typename T>
4392 constexpr size_t N = 64 /
sizeof(T);
4405 StoreU(out0,
d, unaligned + 0 *
N);
4406 StoreU(out1,
d, unaligned + 1 *
N);
4407 StoreU(out2,
d, unaligned + 2 *
N);
4408 StoreU(out3,
d, unaligned + 3 *
N);
4419 const auto maskL =
Set(du64, 0xFFFFFFFFULL);
4421 const auto b32 =
BitCast(du32, b);
4429 const auto aLbL =
MulEven(a32, b32);
4430 const auto w3 = aLbL & maskL;
4433 const auto w2 = t2 & maskL;
4436 const auto t =
MulEven(a32, bH) + w2;
4439 const auto mulH =
MulEven(aH, bH) + w1 + k;
4448 const auto maskL =
Set(du64, 0xFFFFFFFFULL);
4450 const auto b32 =
BitCast(du32, b);
4456 const auto aLbL =
MulEven(a32, b32);
4457 const auto w3 = aLbL & maskL;
4460 const auto w2 = t2 & maskL;
4463 const auto t =
MulEven(a32, bH) + w2;
4466 const auto mulH =
MulEven(aH, bH) + w1 + k;
4489 return Set(
d, _mm512_reduce_add_epi32(
v.raw));
4492 return Set(
d, _mm512_reduce_add_epi64(
v.raw));
4495 return Set(
d,
static_cast<uint32_t
>(_mm512_reduce_add_epi32(
v.raw)));
4498 return Set(
d,
static_cast<uint64_t
>(_mm512_reduce_add_epi64(
v.raw)));
4501 return Set(
d, _mm512_reduce_add_ps(
v.raw));
4504 return Set(
d, _mm512_reduce_add_pd(
v.raw));
4510 const auto sum =
SumOfLanes(d32, even + odd);
4519 const auto sum =
SumOfLanes(d32, even + odd);
4526 return Set(
d, _mm512_reduce_min_epi32(
v.raw));
4529 return Set(
d, _mm512_reduce_min_epi64(
v.raw));
4532 return Set(
d, _mm512_reduce_min_epu32(
v.raw));
4535 return Set(
d, _mm512_reduce_min_epu64(
v.raw));
4538 return Set(
d, _mm512_reduce_min_ps(
v.raw));
4541 return Set(
d, _mm512_reduce_min_pd(
v.raw));
4563 return Set(
d, _mm512_reduce_max_epi32(
v.raw));
4566 return Set(
d, _mm512_reduce_max_epi64(
v.raw));
4569 return Set(
d, _mm512_reduce_max_epu32(
v.raw));
4572 return Set(
d, _mm512_reduce_max_epu64(
v.raw));
4575 return Set(
d, _mm512_reduce_max_ps(
v.raw));
4578 return Set(
d, _mm512_reduce_max_pd(
v.raw));
uint8_t buf
Definition BitIO.h:84
size_t offset
Definition BitIO.h:80
uint32_t x
Definition BlockExec.h:38
uint8_t * bits
Definition TileProcessor.h:59
#define HWY_RESTRICT
Definition base.h:64
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:78
#define HWY_API
Definition base.h:129
#define HWY_INLINE
Definition base.h:70
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:79
#define HWY_DASSERT(condition)
Definition base.h:238
Definition x86_128-inl.h:137
Raw raw
Definition arm_neon-inl.h:835
Definition x86_128-inl.h:70
Raw raw
Definition arm_neon-inl.h:814
Definition x86_256-inl.h:82
Raw raw
Definition x86_256-inl.h:113
Definition x86_512-inl.h:112
HWY_INLINE Vec512 & operator/=(const Vec512 other)
Definition x86_512-inl.h:124
typename detail::Raw512< T >::type Raw
Definition x86_512-inl.h:113
Raw raw
Definition x86_512-inl.h:143
HWY_INLINE Vec512 & operator|=(const Vec512 other)
Definition x86_512-inl.h:136
T PrivateT
Definition x86_512-inl.h:116
HWY_INLINE Vec512 & operator+=(const Vec512 other)
Definition x86_512-inl.h:127
HWY_INLINE Vec512 & operator^=(const Vec512 other)
Definition x86_512-inl.h:139
HWY_INLINE Vec512 & operator-=(const Vec512 other)
Definition x86_512-inl.h:130
HWY_INLINE Vec512 & operator*=(const Vec512 other)
Definition x86_512-inl.h:121
static constexpr size_t kPrivateN
Definition x86_512-inl.h:117
HWY_INLINE Vec512 & operator&=(const Vec512 other)
Definition x86_512-inl.h:133
#define HWY_AVX3_DL
Definition detect_targets.h:65
#define HWY_TARGET
Definition detect_targets.h:380
uint32_t a
only used by MQ decoder
Definition mqc.h:48
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2413
HWY_INLINE void MaybeUnpoison(T *HWY_RESTRICT unaligned, size_t count)
Definition x86_128-inl.h:648
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition x86_128-inl.h:718
HWY_INLINE Vec128< uint8_t, N > EmuCompress(Vec128< uint8_t, N > v, Mask128< uint8_t, N > mask)
Definition x86_512-inl.h:3863
HWY_INLINE void NativeCompressStore(Vec128< uint8_t, N > v, Mask128< uint8_t, N > mask, Simd< uint8_t, N, 0 >, uint8_t *HWY_RESTRICT unaligned)
Definition x86_512-inl.h:3771
HWY_INLINE auto FixConversionOverflow(DI di, VFromD< DF > original, decltype(Zero(di).raw) converted_raw) -> VFromD< DI >
Definition x86_128-inl.h:5571
HWY_API void LoadTransposedBlocks3(Simd< T, N, 0 > d, const T *HWY_RESTRICT unaligned, V &A, V &B, V &C)
Definition generic_ops-inl.h:159
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2451
HWY_INLINE auto ClampF64ToI32Max(Simd< double, N, 0 > d, decltype(Zero(d)) v) -> decltype(Zero(d))
Definition x86_128-inl.h:5560
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition x86_128-inl.h:1570
HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:470
HWY_INLINE Mask128< T, N > ExclusiveNeither(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:963
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition wasm_128-inl.h:3661
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:815
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition x86_256-inl.h:2612
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition x86_128-inl.h:3286
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6153
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition x86_128-inl.h:3275
HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:862
Vec512< T > Shuffle128(const Vec512< T > lo, const Vec512< T > hi)
Definition x86_512-inl.h:4252
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition wasm_128-inl.h:130
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:888
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition x86_256-inl.h:4543
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:889
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:852
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5609
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:861
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2432
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition arm_neon-inl.h:2080
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition x86_128-inl.h:670
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition x86_128-inl.h:760
HWY_INLINE Vec128< uint8_t, N > NativeCompress(const Vec128< uint8_t, N > v, const Mask128< uint8_t, N > mask)
Definition x86_512-inl.h:3743
HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:505
HWY_API void LoadTransposedBlocks4(Simd< T, N, 0 > d, const T *HWY_RESTRICT unaligned, V &A, V &B, V &C, V &D)
Definition generic_ops-inl.h:340
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4235
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition x86_512-inl.h:1613
HWY_INLINE void EmuCompressStore(Vec128< T, N > v, Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition x86_512-inl.h:3901
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition x86_256-inl.h:2604
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:926
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:1406
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4453
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2137
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition x86_256-inl.h:4417
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:842
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:223
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
Simd< T, 64/sizeof(T), 0 > Full512
Definition x86_512-inl.h:154
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4442
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
N
Definition rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:214
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
const vfloat64m1_t v
Definition rvv-inl.h:1998
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition x86_256-inl.h:4429
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
decltype(FirstN(D(), 0)) MFromD
Definition arm_sve-inl.h:276
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
Definition aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:806
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:593
HWY_API constexpr bool IsSigned()
Definition base.h:642
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 :0x400)>
Definition base.h:627
HWY_API size_t PopCount(uint64_t x)
Definition base.h:865
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition base.h:796
HWY_API constexpr bool IsFloat()
Definition base.h:635
#define HWY_ALIGN
Definition set_macros-inl.h:83
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
Definition x86_512-inl.h:2545
__m512i raw
Definition x86_512-inl.h:2546
Definition x86_256-inl.h:143
Raw raw
Definition x86_256-inl.h:150
Definition x86_512-inl.h:148
typename detail::RawMask512< sizeof(T)>::type Raw
Definition x86_512-inl.h:149
Raw raw
Definition x86_512-inl.h:150
Definition ops/shared-inl.h:52
HWY_INLINE __m512d operator()(__m512i v)
Definition x86_512-inl.h:182
HWY_INLINE __m512 operator()(__m512i v)
Definition x86_512-inl.h:178
Definition x86_512-inl.h:173
HWY_INLINE __m512i operator()(__m512i v)
Definition x86_512-inl.h:174
__m512d type
Definition x86_512-inl.h:86
__m512 type
Definition x86_512-inl.h:82
Definition x86_512-inl.h:77
__m512i type
Definition x86_512-inl.h:78
__mmask64 type
Definition x86_512-inl.h:94
__mmask32 type
Definition x86_512-inl.h:98
__mmask16 type
Definition x86_512-inl.h:102
__mmask8 type
Definition x86_512-inl.h:106
Definition x86_512-inl.h:91
uint32_t x1
Definition t1_common.h:75