21#include <wasm_simd128.h>
26#ifdef HWY_WASM_OLD_NAMES
27#define wasm_i8x16_shuffle wasm_v8x16_shuffle
28#define wasm_i16x8_shuffle wasm_v16x8_shuffle
29#define wasm_i32x4_shuffle wasm_v32x4_shuffle
30#define wasm_i64x2_shuffle wasm_v64x2_shuffle
31#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
32#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
33#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
34#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
35#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
36#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
37#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
38#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
39#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
40#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
41#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
42#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
43#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
44#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
45#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
52#if HWY_TARGET == HWY_WASM_EMU256
70template <
typename T,
size_t N = 16 /
sizeof(T)>
76 static constexpr size_t kPrivateN =
N;
81 return *
this = (*
this * other);
84 return *
this = (*
this / other);
87 return *
this = (*
this + other);
90 return *
this = (*
this - other);
93 return *
this = (*
this & other);
96 return *
this = (*
this | other);
99 return *
this = (*
this ^ other);
106using Vec64 = Vec128<T, 8 /
sizeof(T)>;
109using Vec32 = Vec128<T, 4 /
sizeof(T)>;
115template <
typename T,
size_t N = 16 /
sizeof(T)>
124using TFromV =
typename V::PrivateT;
132 return static_cast<__v128_u
>(
v);
135 return static_cast<__v128_u
>(
v);
138template <
typename T,
size_t N>
145struct BitCastFromInteger128 {
149struct BitCastFromInteger128<float> {
153template <
typename T,
size_t N>
161template <
typename T,
size_t N,
typename FromT>
163 Vec128<FromT,
N *
sizeof(T) /
sizeof(FromT)>
v) {
170template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
174template <
size_t N, HWY_IF_LE128(
float, N)>
185template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
189template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
194template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
199template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
205template <
size_t N, HWY_IF_LE128(
int8_t, N)>
209template <
size_t N, HWY_IF_LE128(
int16_t, N)>
213template <
size_t N, HWY_IF_LE128(
int32_t, N)>
217template <
size_t N, HWY_IF_LE128(
int64_t, N)>
222template <
size_t N, HWY_IF_LE128(
float, N)>
231template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
239template <
typename T,
size_t N,
typename T2, HWY_IF_LE128(T, N)>
242 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
246 return Load(
d, lanes);
457template <
int kBits,
size_t N>
461template <
int kBits,
size_t N>
465template <
int kBits,
size_t N>
469template <
int kBits,
size_t N>
473template <
int kBits,
size_t N>
477template <
int kBits,
size_t N>
483template <
int kBits,
size_t N>
487template <
int kBits,
size_t N>
491template <
int kBits,
size_t N>
495template <
int kBits,
size_t N>
499template <
int kBits,
size_t N>
503template <
int kBits,
size_t N>
509template <
int kBits,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
516 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
519template <
int kBits,
size_t N>
525 return shifted &
Set(d8, 0xFF >> kBits);
528template <
int kBits,
size_t N>
533 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
534 return (shifted ^ shifted_sign) - shifted_sign;
538template <
int kBits,
typename T,
size_t N>
540 constexpr size_t kSizeInBits =
sizeof(T) * 8;
541 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
542 if (kBits == 0)
return v;
617template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
623 return shifted &
Set(d8,
static_cast<T
>((0xFF <<
bits) & 0xFF));
633 return shifted &
Set(d8, 0xFF >>
bits);
642 return (shifted ^ shifted_sign) - shifted_sign;
664HWY_API Vec128<uint64_t, N>
Min(Vec128<uint64_t, N>
a, Vec128<uint64_t, N> b) {
666 const uint64_t a0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(
a.raw, 0));
667 const uint64_t b0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0));
668 const uint64_t a1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(
a.raw, 1));
669 const uint64_t b1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1));
671 return Vec128<uint64_t, N>{wasm_v128_load(min)};
688HWY_API Vec128<int64_t, N>
Min(Vec128<int64_t, N>
a, Vec128<int64_t, N> b) {
689 alignas(16) int64_t min[4];
690 min[0] =
HWY_MIN(wasm_i64x2_extract_lane(
a.raw, 0),
691 wasm_i64x2_extract_lane(b.raw, 0));
692 min[1] =
HWY_MIN(wasm_i64x2_extract_lane(
a.raw, 1),
693 wasm_i64x2_extract_lane(b.raw, 1));
694 return Vec128<int64_t, N>{wasm_v128_load(min)};
721HWY_API Vec128<uint64_t, N>
Max(Vec128<uint64_t, N>
a, Vec128<uint64_t, N> b) {
723 const uint64_t a0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(
a.raw, 0));
724 const uint64_t b0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0));
725 const uint64_t a1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(
a.raw, 1));
726 const uint64_t b1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1));
728 return Vec128<uint64_t, N>{wasm_v128_load(max)};
745HWY_API Vec128<int64_t, N>
Max(Vec128<int64_t, N>
a, Vec128<int64_t, N> b) {
746 alignas(16) int64_t max[2];
747 max[0] =
HWY_MAX(wasm_i64x2_extract_lane(
a.raw, 0),
748 wasm_i64x2_extract_lane(b.raw, 0));
749 max[1] =
HWY_MAX(wasm_i64x2_extract_lane(
a.raw, 1),
750 wasm_i64x2_extract_lane(b.raw, 1));
751 return Vec128<int64_t, N>{wasm_v128_load(max)};
791 const Vec128<uint16_t, N> b) {
792 const auto l = wasm_u32x4_extmul_low_u16x8(
a.raw, b.raw);
793 const auto h = wasm_u32x4_extmul_high_u16x8(
a.raw, b.raw);
795 return Vec128<uint16_t, N>{
796 wasm_i16x8_shuffle(l,
h, 1, 3, 5, 7, 9, 11, 13, 15)};
800 const Vec128<int16_t, N> b) {
801 const auto l = wasm_i32x4_extmul_low_i16x8(
a.raw, b.raw);
802 const auto h = wasm_i32x4_extmul_high_i16x8(
a.raw, b.raw);
804 return Vec128<int16_t, N>{
805 wasm_i16x8_shuffle(l,
h, 1, 3, 5, 7, 9, 11, 13, 15)};
810 Vec128<int16_t, N> b) {
811 return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(
a.raw, b.raw)};
817 const Vec128<int32_t, N> b) {
818 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
819 const auto ae = wasm_v128_and(
a.raw, kEvenMask);
820 const auto be = wasm_v128_and(b.raw, kEvenMask);
821 return Vec128<int64_t, (
N + 1) / 2>{wasm_i64x2_mul(ae, be)};
825 const Vec128<uint32_t, N> b) {
826 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
827 const auto ae = wasm_v128_and(
a.raw, kEvenMask);
828 const auto be = wasm_v128_and(b.raw, kEvenMask);
829 return Vec128<uint64_t, (
N + 1) / 2>{wasm_i64x2_mul(ae, be)};
834template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
865 const Vec128<float, N> b) {
866 return Vec128<float, N>{wasm_f32x4_div(
a.raw, b.raw)};
872 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
879 const Vec128<float, N> b) {
888 const Vec128<float, N>
x,
889 const Vec128<float, N> add) {
890 return mul *
x + add;
896 const Vec128<float, N>
x,
897 const Vec128<float, N> add) {
898 return add - mul *
x;
904 const Vec128<float, N>
x,
905 const Vec128<float, N> sub) {
906 return mul *
x - sub;
912 const Vec128<float, N>
x,
913 const Vec128<float, N> sub) {
914 return Neg(mul) *
x - sub;
921HWY_API Vec128<float, N>
Sqrt(
const Vec128<float, N>
v) {
922 return Vec128<float, N>{wasm_f32x4_sqrt(
v.raw)};
929 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
930 return one /
Sqrt(
v);
938 return Vec128<float, N>{wasm_f32x4_nearest(
v.raw)};
944 return Vec128<float, N>{wasm_f32x4_trunc(
v.raw)};
949HWY_API Vec128<float, N>
Ceil(
const Vec128<float, N>
v) {
950 return Vec128<float, N>{wasm_f32x4_ceil(
v.raw)};
956 return Vec128<float, N>{wasm_f32x4_floor(
v.raw)};
960template <
typename T,
size_t N>
965template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
967 const Simd<T, N, 0>
d;
975template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
977 const Simd<T, N, 0>
d;
984 const VFromD<
decltype(di)> exp =
993template <
typename TFrom,
typename TTo,
size_t N>
995 Mask128<TFrom, N> m) {
996 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
997 return Mask128<TTo, N>{m.raw};
1000template <
typename T,
size_t N>
1003 return (
v & bit) == bit;
1156 const auto b32 =
BitCast(d32, b);
1158 const auto m_gt = a32 > b32;
1161 const auto m_eq = a32 == b32;
1162 const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
1165 const auto gt =
Or(lo_gt, m_gt);
1176template <
typename T,
size_t N>
1197template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1207template <
typename T,
size_t N>
1209 return Vec128<T, N>{wasm_v128_not(
v.raw)};
1214template <
typename T,
size_t N>
1215HWY_API Vec128<T, N>
And(Vec128<T, N>
a, Vec128<T, N> b) {
1216 return Vec128<T, N>{wasm_v128_and(
a.raw, b.raw)};
1222template <
typename T,
size_t N>
1223HWY_API Vec128<T, N>
AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
1224 return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
1229template <
typename T,
size_t N>
1230HWY_API Vec128<T, N>
Or(Vec128<T, N>
a, Vec128<T, N> b) {
1231 return Vec128<T, N>{wasm_v128_or(
a.raw, b.raw)};
1236template <
typename T,
size_t N>
1237HWY_API Vec128<T, N>
Xor(Vec128<T, N>
a, Vec128<T, N> b) {
1238 return Vec128<T, N>{wasm_v128_xor(
a.raw, b.raw)};
1243template <
typename T,
size_t N>
1244HWY_API Vec128<T, N>
Xor3(Vec128<T, N>
x1, Vec128<T, N> x2, Vec128<T, N> x3) {
1250template <
typename T,
size_t N>
1251HWY_API Vec128<T, N>
Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
1252 return Or(o1,
Or(o2, o3));
1257template <
typename T,
size_t N>
1258HWY_API Vec128<T, N>
OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
1259 return Or(o,
And(a1, a2));
1264template <
typename T,
size_t N>
1272template <
typename T,
size_t N>
1277template <
typename T,
size_t N>
1282template <
typename T,
size_t N>
1289template <
typename T,
size_t N>
1291 const Vec128<T, N> sign) {
1292 static_assert(
IsFloat<T>(),
"Only makes sense for floating-point");
1297template <
typename T,
size_t N>
1299 const Vec128<T, N> sign) {
1300 static_assert(
IsFloat<T>(),
"Only makes sense for floating-point");
1306template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
1319template <
typename T,
size_t N>
1321 return Mask128<T, N>{
v.raw};
1324template <
typename T,
size_t N>
1326 return Vec128<T, N>{
v.raw};
1330template <
typename T,
size_t N>
1333 return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
1337template <
typename T,
size_t N>
1343template <
typename T,
size_t N>
1348template <
typename T,
size_t N>
1351 static_assert(
IsSigned<T>(),
"Only works for signed/float");
1359template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1362 const auto zero =
Zero(
d);
1368template <
typename T,
size_t N>
1369HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
1373template <
typename T,
size_t N>
1374HWY_API Mask128<T, N>
And(
const Mask128<T, N>
a, Mask128<T, N> b) {
1375 const Simd<T, N, 0>
d;
1379template <
typename T,
size_t N>
1380HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N>
a, Mask128<T, N> b) {
1381 const Simd<T, N, 0>
d;
1385template <
typename T,
size_t N>
1386HWY_API Mask128<T, N>
Or(
const Mask128<T, N>
a, Mask128<T, N> b) {
1387 const Simd<T, N, 0>
d;
1391template <
typename T,
size_t N>
1392HWY_API Mask128<T, N>
Xor(
const Mask128<T, N>
a, Mask128<T, N> b) {
1393 const Simd<T, N, 0>
d;
1397template <
typename T,
size_t N>
1399 const Simd<T, N, 0>
d;
1413template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1438template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1467template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1470 alignas(16) T lanes[2];
1471 alignas(16) T bits_lanes[2];
1474 lanes[0] <<= bits_lanes[0];
1475 lanes[1] <<= bits_lanes[1];
1476 return Load(
d, lanes);
1481template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1506template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1539template <
typename T>
1541 return Vec128<T>{wasm_v128_load(aligned)};
1544template <
typename T,
size_t N>
1551template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1559template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1565template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1572template <
typename T>
1574 wasm_v128_store(aligned,
v.raw);
1578template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1585 *p = wasm_f32x4_extract_lane(
v.raw, 0);
1589template <
typename T,
size_t N>
1594template <
typename T,
size_t N>
1604template <
typename T,
size_t N>
1607 wasm_v128_store(aligned,
v.raw);
1612template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
1615 const Vec128<Offset, N>
offset) {
1616 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1618 alignas(16) T lanes[
N];
1621 alignas(16) Offset offset_lanes[
N];
1624 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
1625 for (
size_t i = 0; i <
N; ++i) {
1630template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
1632 const Vec128<Index, N> index) {
1633 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1635 alignas(16) T lanes[
N];
1638 alignas(16) Index index_lanes[
N];
1639 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
1641 for (
size_t i = 0; i <
N; ++i) {
1642 base[index_lanes[i]] = lanes[i];
1648template <
typename T,
size_t N,
typename Offset>
1651 const Vec128<Offset, N>
offset) {
1652 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1654 alignas(16) Offset offset_lanes[
N];
1657 alignas(16) T lanes[
N];
1658 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
1659 for (
size_t i = 0; i <
N; ++i) {
1662 return Load(
d, lanes);
1665template <
typename T,
size_t N,
typename Index>
1668 const Vec128<Index, N> index) {
1669 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1671 alignas(16) Index index_lanes[
N];
1672 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
1674 alignas(16) T lanes[
N];
1675 for (
size_t i = 0; i <
N; ++i) {
1676 lanes[i] = base[index_lanes[i]];
1678 return Load(
d, lanes);
1687template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1689 return static_cast<T
>(wasm_i8x16_extract_lane(
v.raw, kLane));
1691template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1693 return static_cast<T
>(wasm_i16x8_extract_lane(
v.raw, kLane));
1695template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1697 return static_cast<T
>(wasm_i32x4_extract_lane(
v.raw, kLane));
1699template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1701 return static_cast<T
>(wasm_i64x2_extract_lane(
v.raw, kLane));
1704template <
size_t kLane,
size_t N>
1706 return wasm_f32x4_extract_lane(
v.raw, kLane);
1714template <
typename T>
1721template <
typename T>
1723#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1724 if (__builtin_constant_p(i)) {
1733 alignas(16) T lanes[2];
1738template <
typename T>
1740#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1741 if (__builtin_constant_p(i)) {
1754 alignas(16) T lanes[4];
1759template <
typename T>
1761#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1762 if (__builtin_constant_p(i)) {
1783 alignas(16) T lanes[8];
1788template <
typename T>
1790#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1791 if (__builtin_constant_p(i)) {
1828 alignas(16) T lanes[16];
1834template <
typename T,
size_t N>
1843template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1845 static_assert(kLane <
N,
"Lane index out of bounds");
1847 wasm_i8x16_replace_lane(
v.raw, kLane,
static_cast<int8_t
>(t))};
1850template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1852 static_assert(kLane <
N,
"Lane index out of bounds");
1854 wasm_i16x8_replace_lane(
v.raw, kLane,
static_cast<int16_t
>(t))};
1857template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1859 static_assert(kLane <
N,
"Lane index out of bounds");
1860 return Vec128<T, N>{
1861 wasm_i32x4_replace_lane(
v.raw, kLane,
static_cast<int32_t
>(t))};
1864template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1866 static_assert(kLane <
N,
"Lane index out of bounds");
1867 return Vec128<T, N>{
1868 wasm_i64x2_replace_lane(
v.raw, kLane,
static_cast<int64_t
>(t))};
1871template <
size_t kLane,
size_t N>
1873 static_assert(kLane <
N,
"Lane index out of bounds");
1877template <
size_t kLane,
size_t N>
1879 static_assert(kLane < 2,
"Lane index out of bounds");
1888template <
typename T>
1895template <
typename T>
1897#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1898 if (__builtin_constant_p(i)) {
1908 alignas(16) T lanes[2];
1911 return Load(
d, lanes);
1914template <
typename T>
1916#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1917 if (__builtin_constant_p(i)) {
1931 alignas(16) T lanes[4];
1934 return Load(
d, lanes);
1937template <
typename T>
1939#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1940 if (__builtin_constant_p(i)) {
1962 alignas(16) T lanes[8];
1965 return Load(
d, lanes);
1968template <
typename T>
1970#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1971 if (__builtin_constant_p(i)) {
2009 alignas(16) T lanes[16];
2012 return Load(
d, lanes);
2017template <
typename T,
size_t N>
2020 return Vec128<T,
N / 2>{
v.raw};
2023template <
typename T,
size_t N>
2031template <
int kBytes,
typename T,
size_t N>
2033 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2034 const __i8x16 zero = wasm_i8x16_splat(0);
2040 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
2041 6, 7, 8, 9, 10, 11, 12, 13, 14)};
2044 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
2045 5, 6, 7, 8, 9, 10, 11, 12, 13)};
2048 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 0, 1, 2,
2049 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
2052 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 0, 1,
2053 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
2056 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 0,
2057 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
2060 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2061 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
2064 return Vec128<T, N>{wasm_i8x16_shuffle(
2065 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
2068 return Vec128<T, N>{wasm_i8x16_shuffle(
2069 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
2072 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2073 16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
2077 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2078 16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
2082 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2083 16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
2087 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2088 16, 16, 16, 16, 16, 16, 16, 0, 1,
2092 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2093 16, 16, 16, 16, 16, 16, 16, 16, 0,
2097 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2098 16, 16, 16, 16, 16, 16, 16, 16, 16,
2102 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2103 16, 16, 16, 16, 16, 16, 16, 16, 16,
2106 return Vec128<T, N>{zero};
2109template <
int kBytes,
typename T,
size_t N>
2116template <
int kLanes,
typename T,
size_t N>
2122template <
int kLanes,
typename T,
size_t N>
2131template <
int kBytes,
typename T,
size_t N>
2133 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2134 const __i8x16 zero = wasm_i8x16_splat(0);
2141 return wasm_i8x16_shuffle(
v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2142 12, 13, 14, 15, 16);
2145 return wasm_i8x16_shuffle(
v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2146 13, 14, 15, 16, 16);
2149 return wasm_i8x16_shuffle(
v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2150 13, 14, 15, 16, 16, 16);
2153 return wasm_i8x16_shuffle(
v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2154 14, 15, 16, 16, 16, 16);
2157 return wasm_i8x16_shuffle(
v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
2158 15, 16, 16, 16, 16, 16);
2161 return wasm_i8x16_shuffle(
v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2162 16, 16, 16, 16, 16, 16);
2165 return wasm_i8x16_shuffle(
v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2166 16, 16, 16, 16, 16, 16, 16);
2169 return wasm_i8x16_shuffle(
v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
2170 16, 16, 16, 16, 16, 16, 16);
2173 return wasm_i8x16_shuffle(
v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
2174 16, 16, 16, 16, 16, 16, 16);
2177 return wasm_i8x16_shuffle(
v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
2178 16, 16, 16, 16, 16, 16, 16);
2181 return wasm_i8x16_shuffle(
v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
2182 16, 16, 16, 16, 16, 16, 16);
2185 return wasm_i8x16_shuffle(
v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
2186 16, 16, 16, 16, 16, 16, 16);
2189 return wasm_i8x16_shuffle(
v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
2190 16, 16, 16, 16, 16, 16, 16);
2193 return wasm_i8x16_shuffle(
v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
2194 16, 16, 16, 16, 16, 16, 16);
2197 return wasm_i8x16_shuffle(
v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
2198 16, 16, 16, 16, 16, 16, 16);
2207template <
int kBytes,
typename T,
size_t N>
2210 if (
N != 16 /
sizeof(T)) {
2211 const Vec128<T> vfull{
v.raw};
2218template <
int kLanes,
typename T,
size_t N>
2227template <
typename T>
2229 return Vec64<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
2232 return Vec64<float>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
2236template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2243 return Vec128<T, (
N + 1) / 2>{upper.raw};
2248template <
int kBytes,
typename T,
class V = Vec128<T>>
2250 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2256 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2257 11, 12, 13, 14, 15, 16)};
2260 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2261 11, 12, 13, 14, 15, 16, 17)};
2264 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2265 12, 13, 14, 15, 16, 17, 18)};
2268 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2269 13, 14, 15, 16, 17, 18, 19)};
2272 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2273 14, 15, 16, 17, 18, 19, 20)};
2276 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
2277 14, 15, 16, 17, 18, 19, 20, 21)};
2280 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
2281 15, 16, 17, 18, 19, 20, 21, 22)};
2284 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
2285 16, 17, 18, 19, 20, 21, 22, 23)};
2288 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
2289 17, 18, 19, 20, 21, 22, 23, 24)};
2292 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
2293 17, 18, 19, 20, 21, 22, 23, 24, 25)};
2296 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
2297 18, 19, 20, 21, 22, 23, 24, 25, 26)};
2300 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
2301 19, 20, 21, 22, 23, 24, 25, 26, 27)};
2304 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
2305 20, 21, 22, 23, 24, 25, 26, 27, 28)};
2308 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
2309 21, 22, 23, 24, 25, 26, 27, 28, 29)};
2312 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
2313 22, 23, 24, 25, 26, 27, 28, 29, 30)};
2318template <
int kBytes,
typename T,
size_t N,
HWY_IF_LE64(T,
N),
2319 class V = Vec128<T, N>>
2321 constexpr size_t kSize =
N *
sizeof(T);
2322 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
2325 using V8 =
VFromD<
decltype(d_full8)>;
2326 const V8 hi8{
BitCast(d8, hi).raw};
2335template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2337 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2338 return Vec128<T, N>{wasm_i16x8_shuffle(
v.raw,
v.raw, kLane, kLane, kLane,
2339 kLane, kLane, kLane, kLane, kLane)};
2342template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2344 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2345 return Vec128<T, N>{
2346 wasm_i32x4_shuffle(
v.raw,
v.raw, kLane, kLane, kLane, kLane)};
2349template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2351 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2352 return Vec128<T, N>{wasm_i64x2_shuffle(
v.raw,
v.raw, kLane, kLane)};
2359template <
typename T,
size_t N,
typename TI,
size_t NI>
2361 const Vec128<TI, NI> from) {
2367 return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
2369 alignas(16) uint8_t control[16];
2370 alignas(16) uint8_t input[16];
2371 alignas(16) uint8_t output[16];
2372 wasm_v128_store(control, from.raw);
2373 wasm_v128_store(input, bytes.raw);
2374 for (
size_t i = 0; i < 16; ++i) {
2375 output[i] = control[i] < 16 ? input[control[i]] : 0;
2377 return Vec128<TI, NI>{wasm_v128_load(output)};
2381template <
typename T,
size_t N,
typename TI,
size_t NI>
2383 const Vec128<TI, NI> from) {
2384 const Simd<TI, NI, 0>
d;
2402template <
typename T,
size_t N>
2404 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2405 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2406 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 0, 3, 2)};
2412template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2414 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2416 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2417 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2419template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2421 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2423 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2425template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2427 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2428 return Vec128<T, N>{wasm_i32x4_shuffle(
a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
2431template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2433 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2435 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2436 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2438template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2440 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2442 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2444template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2446 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2447 return Vec128<T, N>{wasm_i32x4_shuffle(
a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
2450template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2452 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2454 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2455 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2457template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2459 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2461 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2463template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2465 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2466 return Vec128<T, N>{wasm_i32x4_shuffle(
a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
2472template <
typename T>
2474 static_assert(
sizeof(T) == 8,
"Only for 64-bit lanes");
2475 return Vec128<T>{wasm_i64x2_shuffle(
v.raw,
v.raw, 1, 0)};
2477template <
typename T>
2479 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2480 return Vec128<T>{wasm_i64x2_shuffle(
v.raw,
v.raw, 1, 0)};
2484template <
typename T>
2486 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2487 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 2, 3, 0)};
2491template <
typename T>
2493 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2494 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 3, 0, 1, 2)};
2498template <
typename T>
2500 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2501 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 3, 2, 1, 0)};
2507template <
typename T,
size_t N = 16 /
sizeof(T)>
2512template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
2514 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
2515#if HWY_IS_DEBUG_BUILD
2516 const Rebind<TI,
decltype(
d)> di;
2518 AllTrue(di, Lt(vec,
Set(di,
static_cast<TI
>(
N)))));
2522 using V8 =
VFromD<
decltype(d8)>;
2526 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"");
2527 if (
sizeof(T) == 4) {
2528 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
2529 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2530 const V8 lane_indices =
2532 const V8 byte_indices =
2534 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
2535 0, 1, 2, 3, 0, 1, 2, 3};
2538 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
2539 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2540 const V8 lane_indices =
2542 const V8 byte_indices =
2544 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
2545 0, 1, 2, 3, 4, 5, 6, 7};
2546 return Indices128<T, N>{
Add(byte_indices,
Load(d8, kByteOffsets)).raw};
2550template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
2552 const Rebind<TI,
decltype(
d)> di;
2556template <
typename T,
size_t N>
2560 const Rebind<TI,
decltype(
d)> di;
2567template <
typename T>
2573template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2575 return Vec128<T, 2>{
Shuffle2301(Vec128<T>{
v.raw}).raw};
2578template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2584template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2590template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2598template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2604template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2609template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2616template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2618 return BitCast(
d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(
v.raw,
v.raw, 3, 2,
2619 1, 0, 7, 6, 5, 4)});
2622template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2627template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2634template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2639template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2650 a.raw, b.
raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2656 wasm_i16x8_shuffle(
a.raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2673 a.raw, b.
raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2679 wasm_i16x8_shuffle(
a.raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2719 26, 11, 27, 12, 28, 13, 29, 14,
2726 wasm_i16x8_shuffle(
a.raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2743 26, 11, 27, 12, 28, 13, 29, 14,
2750 wasm_i16x8_shuffle(
a.raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2778template <
typename T,
class V = Vec128<T>>
2784template <
typename T,
size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
2786 const Half<
decltype(
d)> d2;
2794template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2798template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2803template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2813template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
2814HWY_API Vec128<T, N>
Combine(Simd<T, N, 0>
d, Vec128<T, N / 2> hi_half,
2815 Vec128<T, N / 2> lo_half) {
2816 const Half<
decltype(
d)> d2;
2820 const VU lo{
BitCast(du2, lo_half).raw};
2821 const VU hi{
BitCast(du2, hi_half).raw};
2827template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
2835template <
typename T>
2840template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2842 const Vec128<T, N> lo) {
2843 const Half<
decltype(
d)> d2;
2849template <
typename T>
2854template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2856 const Vec128<T, N> lo) {
2857 const Half<
decltype(
d)> d2;
2863template <
typename T>
2868template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2870 const Vec128<T, N> lo) {
2871 const Half<
decltype(
d)> d2;
2876template <
typename T,
size_t N>
2878 const Vec128<T, N> lo) {
2885template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2887 return Vec128<T>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 1, 3, 5, 7, 9, 11, 13, 15,
2888 17, 19, 21, 23, 25, 27, 29, 31)};
2892template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2896 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 1, 3, 5, 7, 17, 19, 21,
2897 23, 1, 3, 5, 7, 17, 19, 21, 23)};
2901template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2905 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
2906 19, 1, 3, 17, 19, 1, 3, 17, 19)};
2910template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2913 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
2917template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2921 return Vec128<T, 4>{
2922 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
2926template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2928 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2932template <
typename T>
2941template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2943 return Vec128<T>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 0, 2, 4, 6, 8, 10, 12, 14,
2944 16, 18, 20, 22, 24, 26, 28, 30)};
2948template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2952 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 0, 2, 4, 6, 16, 18, 20,
2953 22, 0, 2, 4, 6, 16, 18, 20, 22)};
2957template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2961 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16,
2962 18, 0, 2, 16, 18, 0, 2, 16, 18)};
2966template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2969 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
2973template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2977 return Vec128<T, 4>{
2978 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
2982template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2984 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2988template <
typename T>
2996template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2998 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 0, 0, 2, 2)};
3001template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3008template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3010 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 1, 3, 3)};
3013template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3022template <
typename T,
size_t N>
3027 alignas(16)
constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3028 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3031template <
typename T,
size_t N>
3035 wasm_i16x8_shuffle(
a.raw, b.
raw, 8, 1, 10, 3, 12, 5, 14, 7)};
3037template <
typename T,
size_t N>
3042template <
typename T,
size_t N>
3050template <
typename T,
size_t N>
3051HWY_API Vec128<T, N>
OddEven(
const Vec128<T, N>
a,
const Vec128<T, N> b) {
3061template <
typename T,
size_t N>
3068template <
typename T,
size_t N>
3076template <
typename T>
3086template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
3088 const Vec128<uint8_t, N>
v) {
3089 return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(
v.raw)};
3091template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
3093 const Vec128<uint8_t, N>
v) {
3094 return Vec128<uint32_t, N>{
3095 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
3097template <
size_t N, HWY_IF_LE128(
int16_t, N)>
3099 const Vec128<uint8_t, N>
v) {
3100 return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(
v.raw)};
3102template <
size_t N, HWY_IF_LE128(
int32_t, N)>
3104 const Vec128<uint8_t, N>
v) {
3105 return Vec128<int32_t, N>{
3106 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
3108template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
3113template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
3115 const Vec128<uint32_t, N>
v) {
3116 return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(
v.raw)};
3119template <
size_t N, HWY_IF_LE128(
int32_t, N)>
3121 const Vec128<uint16_t, N>
v) {
3122 return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(
v.raw)};
3126template <
size_t N, HWY_IF_LE128(
int16_t, N)>
3131template <
size_t N, HWY_IF_LE128(
int32_t, N)>
3135 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(
v.raw))};
3137template <
size_t N, HWY_IF_LE128(
int32_t, N)>
3142template <
size_t N, HWY_IF_LE128(
int64_t, N)>
3148template <
size_t N, HWY_IF_LE128(
double, N)>
3154template <
size_t N, HWY_IF_LE128(
float, N)>
3163 const auto mantissa = bits16 &
Set(du32, 0x3FF);
3164 const auto subnormal =
3166 Set(df32, 1.0f / 16384 / 1024));
3168 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
3169 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3170 const auto normal =
ShiftLeft<23>(biased_exp32) | mantissa32;
3171 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
3175template <
size_t N, HWY_IF_LE128(
float, N)>
3178 const Rebind<uint16_t,
decltype(df32)> du16;
3200 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3202 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3214 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3226 const Vec128<double, N>
v) {
3227 return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(
v.raw)};
3232 const Vec128<float, N>
v) {
3234 const Rebind<uint32_t,
decltype(du16)> du;
3236 const auto bits32 =
BitCast(du,
v);
3239 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
3241 const auto k15 =
Set(di, 15);
3242 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
3243 const auto is_tiny = exp <
Set(di, -24);
3245 const auto is_subnormal = exp <
Set(di, -14);
3246 const auto biased_exp16 =
3248 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
3249 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
3250 (mantissa32 >> (
Set(du, 13) + sub_exp));
3255 const auto normal16 = sign16 |
ShiftLeft<10>(biased_exp16) | mantissa16;
3257 return Vec128<float16_t, N>{
DemoteTo(du16, bits16).raw};
3262 const Vec128<float, N>
v) {
3263 const Rebind<int32_t,
decltype(dbf16)> di32;
3264 const Rebind<uint32_t,
decltype(dbf16)> du32;
3265 const Rebind<uint16_t,
decltype(dbf16)> du16;
3272 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N>
a, Vec128<float, N> b) {
3274 const Repartition<uint32_t,
decltype(dbf16)> du32;
3285 const Half<
decltype(dn)> dnh;
3294 const Half<
decltype(dn)> dnh;
3301 Vec128<int32_t>
a, Vec128<int32_t> b) {
3302 return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(
a.raw, b.raw)};
3308 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3309 return Vec128<uint8_t, N>{
3310 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3319 const Vec128<From, 1>
v) {
3322 return Vec128<To, 1>{v1.raw};
3349template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
3351 const Vec128<uint32_t, N>
v) {
3353 const auto v1 = Vec128<uint8_t>{
v.raw};
3356 return Vec128<uint8_t, N>{v3.raw};
3359template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
3361 const Vec128<uint32_t, N>
v) {
3363 const auto v1 = Vec128<uint16_t>{
v.raw};
3365 return Vec128<uint16_t, N>{v2.raw};
3368template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
3370 const Vec128<uint16_t, N>
v) {
3372 const auto v1 = Vec128<uint8_t>{
v.raw};
3374 return Vec128<uint8_t, N>{v2.raw};
3406 const DFromV<
decltype(
v)> du8;
3410 using VU16 =
VFromD<
decltype(du16)>;
3414 const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
3416 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
3418 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
3419 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
3420 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
3422 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
3423 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
3424 return And(
BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70),
Set(du64, 0xFFFF));
3431template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
3436 const Vec128<T, N> vbits{wasm_i32x4_splat(
static_cast<int32_t
>(
bits))};
3439 alignas(16)
constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
3440 1, 1, 1, 1, 1, 1, 1, 1};
3443 alignas(16)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
3444 1, 2, 4, 8, 16, 32, 64, 128};
3448template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3451 alignas(16)
constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3456template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3459 alignas(16)
constexpr uint32_t kBit[8] = {1, 2, 4, 8};
3464template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3467 alignas(16)
constexpr uint64_t kBit[8] = {1, 2};
3474template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
3477 uint64_t mask_bits = 0;
3487template <
typename T>
3489 const Mask128<T> mask) {
3490 alignas(16) uint64_t lanes[2];
3491 wasm_v128_store(lanes, mask.raw);
3493 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3494 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
3495 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
3500template <
typename T>
3503 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3504 return (
static_cast<uint64_t
>(wasm_i64x2_extract_lane(mask.
raw, 0)) *
3510template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
3513 uint64_t bytes =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(mask.
raw, 0));
3515 bytes &= (1ULL << (
N * 8)) - 1;
3516 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3517 return (bytes * kMagic) >> 56;
3520template <
typename T,
size_t N>
3524 const __i16x8 zero = wasm_i16x8_splat(0);
3529template <
typename T,
size_t N>
3532 const __i32x4 mask_i =
static_cast<__i32x4
>(mask.
raw);
3533 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
3534 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
3535 alignas(16) uint32_t lanes[4];
3536 wasm_v128_store(lanes, sliced_mask);
3537 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
3540template <
typename T,
size_t N>
3543 const __i64x2 mask_i =
static_cast<__i64x2
>(mask.
raw);
3544 const __i64x2 slice = wasm_i64x2_make(1, 2);
3545 const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
3546 alignas(16) uint64_t lanes[2];
3547 wasm_v128_store(lanes, sliced_mask);
3548 return lanes[0] | lanes[1];
3552template <
typename T,
size_t N>
3554 return ((
N *
sizeof(T)) == 16) ?
bits :
bits & ((1ull <<
N) - 1);
3561 (
N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
3562 : (
N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
3563 : (
N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
3564 : (
N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
3565 : (
N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
3566 : (
N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
3567 : (
N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
3568 : (
N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
3569 : (
N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
3570 : (
N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3572 : (
N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3574 : (
N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
3576 : (
N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
3578 : (
N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
3581 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
3583 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
3584 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
3587template <
typename T,
size_t N>
3592template <
typename T>
3597template <
typename T>
3602template <
typename T>
3604 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
3605 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
3606 alignas(16) uint64_t lanes[2];
3607 wasm_v128_store(lanes, shifted_bits);
3608 return PopCount(lanes[0] | lanes[1]);
3611template <
typename T>
3613 alignas(16) int64_t lanes[2];
3614 wasm_v128_store(lanes, m.raw);
3615 return static_cast<size_t>(-(lanes[0] + lanes[1]));
3621template <
typename T,
size_t N>
3623 const Mask128<T, N> mask, uint8_t*
bits) {
3625 const size_t kNumBytes = (
N + 7) / 8;
3630template <
typename T,
size_t N>
3636template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3644template <
typename T>
3650 return !wasm_i8x16_any_true(v8.raw);
3653 return (wasm_i64x2_extract_lane(m.
raw, 0) |
3654 wasm_i64x2_extract_lane(m.
raw, 1)) == 0;
3660template <
typename T>
3662 return wasm_i8x16_all_true(m.
raw);
3664template <
typename T>
3666 return wasm_i16x8_all_true(m.
raw);
3668template <
typename T>
3670 return wasm_i32x4_all_true(m.
raw);
3672template <
typename T>
3674 return wasm_i64x2_all_true(m.
raw);
3679template <
typename T,
size_t N>
3686template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3693template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3694HWY_API bool AllTrue(
const Simd<T, N, 0> ,
const Mask128<T, N> m) {
3700template <
typename T,
size_t N>
3702 const Mask128<T, N> mask) {
3707template <
typename T,
size_t N>
3709 const Mask128<T, N> mask) {
3718template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3722 const Rebind<uint8_t,
decltype(
d)> d8;
3730 alignas(16)
constexpr uint8_t table[256 * 8] = {
3732 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3733 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3734 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
3735 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3736 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
3737 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
3738 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
3739 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3740 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
3741 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
3742 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
3743 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
3744 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
3745 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
3746 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
3747 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3748 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
3749 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
3750 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
3751 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
3752 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
3753 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
3754 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
3755 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
3756 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
3757 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
3758 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
3759 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
3760 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
3761 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
3762 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
3763 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3764 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
3765 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
3766 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
3767 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
3768 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
3769 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
3770 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
3771 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
3772 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
3773 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
3774 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
3775 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
3776 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
3777 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
3778 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
3779 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
3780 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
3781 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
3782 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
3783 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
3784 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
3785 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
3786 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
3787 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
3788 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
3789 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
3790 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
3791 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
3792 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
3793 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
3794 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
3795 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3796 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
3797 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
3798 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
3799 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
3800 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
3801 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
3802 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
3803 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
3804 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
3805 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
3806 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
3807 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
3808 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
3809 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
3810 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
3811 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
3812 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
3813 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
3814 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
3815 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
3816 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
3817 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
3818 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
3819 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
3820 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
3821 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
3822 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
3823 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
3824 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
3825 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
3826 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
3827 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
3828 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
3829 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
3830 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
3831 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
3832 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
3833 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
3834 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
3835 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
3836 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
3837 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
3838 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
3839 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
3840 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
3841 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
3842 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
3843 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
3844 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
3845 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
3846 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
3847 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
3848 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
3849 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
3850 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
3851 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
3852 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
3853 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
3854 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
3855 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
3856 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
3857 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
3858 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
3859 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
3866template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3870 const Rebind<uint8_t,
decltype(
d)> d8;
3878 alignas(16)
constexpr uint8_t table[256 * 8] = {
3880 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
3881 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
3882 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
3883 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
3884 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
3885 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
3886 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
3887 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
3888 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
3889 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
3890 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
3891 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
3892 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
3893 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
3894 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
3895 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
3896 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
3897 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
3898 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
3899 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
3900 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
3901 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
3902 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
3903 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
3904 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
3905 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
3906 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
3907 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
3908 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
3909 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
3910 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
3911 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
3912 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
3913 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
3914 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
3915 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
3916 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
3917 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
3918 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
3919 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
3920 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
3921 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
3922 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
3923 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
3924 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
3925 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
3926 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
3927 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
3928 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
3929 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
3930 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
3931 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
3932 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
3933 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
3934 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
3935 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
3936 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
3937 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
3938 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
3939 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
3940 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
3941 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
3942 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
3943 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
3944 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
3945 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
3946 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
3947 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
3948 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
3949 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
3950 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
3951 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
3952 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
3953 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
3954 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
3955 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
3956 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
3957 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
3958 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
3959 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
3960 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
3961 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
3962 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
3963 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
3964 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
3965 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
3966 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
3967 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
3968 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
3969 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
3970 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
3971 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
3972 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
3973 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
3974 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
3975 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
3976 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
3977 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
3978 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
3979 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
3980 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
3981 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
3982 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
3983 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
3984 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
3985 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
3986 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
3987 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
3988 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
3989 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
3990 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
3991 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
3992 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
3993 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
3994 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
3995 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
3996 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
3997 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
3998 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
3999 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
4000 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
4001 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
4002 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
4003 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
4004 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
4005 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
4006 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
4007 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
4014template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4019 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
4021 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4022 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4023 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
4024 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4025 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
4026 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
4027 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
4028 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4029 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
4030 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
4031 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
4032 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
4033 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4034 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
4035 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
4036 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4037 const Simd<T, N, 0>
d;
4039 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
4042template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4047 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
4049 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
4050 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
4051 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
4052 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
4053 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
4054 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
4055 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4056 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4057 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
4058 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
4059 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
4060 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
4061 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4062 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
4064 const Simd<T, N, 0>
d;
4066 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
4069template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4074 alignas(16)
constexpr uint8_t u8_indices[4 * 16] = {
4076 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4077 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4078 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4079 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4081 const Simd<T, N, 0>
d;
4083 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
4086template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4091 alignas(16)
constexpr uint8_t u8_indices[4 * 16] = {
4093 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4094 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4095 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4096 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4098 const Simd<T, N, 0>
d;
4100 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
4106template <
typename T,
size_t N>
4114template <
typename T,
size_t N>
4124template <
typename T>
4125struct CompressIsPartition {
4126#if HWY_TARGET == HWY_WASM_EMU256
4129 enum {
value = (
sizeof(T) != 1) };
4134template <
typename T>
4140template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4152template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
4158template <
typename T>
4164template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4169 const Vec128<T> maskL =
DupEven(m);
4170 const Vec128<T> maskH =
DupOdd(m);
4171 const Vec128<T> swap =
AndNot(maskH, maskL);
4176template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
4180 if (
N < 16 /
sizeof(T)) {
4188 Mask128<uint64_t> ) {
4193template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
4196 uint64_t mask_bits = 0;
4197 constexpr size_t kNumBytes = (
N + 7) / 8;
4200 mask_bits &= (1ull <<
N) - 1;
4207template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
4217template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
4222 using TU =
TFromD<
decltype(du)>;
4224 const size_t count =
PopCount(mask_bits);
4233template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
4237 uint64_t mask_bits = 0;
4238 constexpr size_t kNumBytes = (
N + 7) / 8;
4241 mask_bits &= (1ull <<
N) - 1;
4257 const Vec128<uint64_t> b) {
4258 alignas(16) uint64_t mul[2];
4260 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(
a.raw, 0)),
4261 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
4266 const Vec128<uint64_t> b) {
4267 alignas(16) uint64_t mul[2];
4269 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(
a.raw, 1)),
4270 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
4278 Vec128<bfloat16_t, 2 * N>
a,
4279 Vec128<bfloat16_t, 2 * N> b,
4280 const Vec128<float, N> sum0,
4281 Vec128<float, N>& sum1) {
4282 const Rebind<uint32_t,
decltype(df32)> du32;
4283 using VU32 =
VFromD<
decltype(du32)>;
4284 const VU32 odd =
Set(du32, 0xFFFF0000u);
4299 Simd<int32_t, N, 0> , Vec128<int16_t, 2 * N>
a,
4300 Vec128<int16_t, 2 * N> b,
const Vec128<int32_t, N> sum0,
4301 Vec128<int32_t, N>& ) {
4302 return sum0 + Vec128<int32_t, N>{wasm_i32x4_dot_i16x8(
a.raw, b.raw)};
4314 const Vec128<float, N> sum1) {
4315 return Add(sum0, sum1);
4323template <
typename T>
4325 const Vec128<T, 1>
v) {
4328template <
typename T>
4330 const Vec128<T, 1>
v) {
4333template <
typename T>
4335 const Vec128<T, 1>
v) {
4342template <
typename T>
4347template <
typename T>
4352template <
typename T>
4354 const Vec128<T, 2> v10) {
4355 return Max(v10, Vec128<T, 2>{
Shuffle2301(Vec128<T>{v10.raw}).raw});
4359template <
typename T>
4363 const Vec128<T> v31_20_31_20 = v3210 + v1032;
4365 return v20_31_20_31 + v31_20_31_20;
4367template <
typename T>
4373 return Min(v20_31_20_31, v31_20_31_20);
4375template <
typename T>
4377 const Vec128<T> v3210) {
4379 const Vec128<T> v31_20_31_20 =
Max(v3210, v1032);
4380 const Vec128<T> v20_31_20_31 =
Shuffle0321(v31_20_31_20);
4381 return Max(v20_31_20_31, v31_20_31_20);
4387template <
typename T>
4393template <
typename T>
4397 return Min(v10, v01);
4399template <
typename T>
4403 return Max(v10, v01);
4406template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
4417template <
size_t N, HWY_IF_GE32(
int16_t, N)>
4419 Vec128<int16_t, N>
v) {
4420 const Simd<int16_t, N, 0>
d;
4430template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
4432 Vec128<uint16_t, N>
v) {
4433 const Simd<uint16_t, N, 0>
d;
4441template <
size_t N, HWY_IF_GE32(
int16_t, N)>
4443 Vec128<int16_t, N>
v) {
4444 const Simd<int16_t, N, 0>
d;
4454template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
4456 Vec128<uint16_t, N>
v) {
4457 const Simd<uint16_t, N, 0>
d;
4465template <
size_t N, HWY_IF_GE32(
int16_t, N)>
4467 Vec128<int16_t, N>
v) {
4468 const Simd<int16_t, N, 0>
d;
4481template <
typename T,
size_t N>
4485template <
typename T,
size_t N>
4489template <
typename T,
size_t N>
4496template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4499 static_assert(!
IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
4513 const Mask128<T, N> eqHL = Eq(
a, b);
4519 const Vec128<T, N> ltLx =
DupEven(ltHL);
4520 const Vec128<T, N> outHx =
IfThenElse(eqHL, ltLx, ltHL);
4524template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4533template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4536 static_assert(!
IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
4541template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4550template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4553 static_assert(!
IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
4558template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
size_t offset
Definition BitIO.h:80
uint32_t x
Definition BlockExec.h:38
uint8_t * bits
Definition TileProcessor.h:59
#define HWY_MAX(a, b)
Definition base.h:135
#define HWY_RESTRICT
Definition base.h:64
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:78
#define HWY_IF_LE64(T, N)
Definition base.h:407
#define HWY_API
Definition base.h:129
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_INLINE
Definition base.h:70
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:79
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_ASSERT(condition)
Definition base.h:192
#define HWY_IF_UNSIGNED(T)
Definition base.h:414
Definition x86_128-inl.h:137
detail::Raw128< T >::type raw
Definition wasm_128-inl.h:117
Raw raw
Definition arm_neon-inl.h:835
Definition x86_128-inl.h:70
T PrivateT
Definition arm_neon-inl.h:782
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition wasm_128-inl.h:83
Raw raw
Definition arm_neon-inl.h:814
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition wasm_128-inl.h:89
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition wasm_128-inl.h:98
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition wasm_128-inl.h:95
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition wasm_128-inl.h:80
typename detail::Raw128< T, N >::type Raw
Definition arm_neon-inl.h:779
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition wasm_128-inl.h:92
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition wasm_128-inl.h:86
uint32_t a
only used by MQ decoder
Definition mqc.h:48
uint32_t c
temporary buffer where bits are coded or decoded
Definition mqc.h:46
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2413
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5447
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition wasm_128-inl.h:2132
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2451
constexpr __i8x16 BytesAbove()
Definition wasm_128-inl.h:3559
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition wasm_128-inl.h:3661
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:535
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:815
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition arm_neon-inl.h:5902
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition wasm_128-inl.h:1688
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:3023
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6153
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition wasm_128-inl.h:1844
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition wasm_128-inl.h:130
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:888
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:663
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5063
HWY_INLINE Vec128< T, 1 > SumOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5058
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6162
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5609
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:861
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2432
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:671
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition arm_neon-inl.h:5750
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5068
constexpr uint64_t OnlyActive(uint64_t bits)
Definition arm_neon-inl.h:5589
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4235
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition arm_neon-inl.h:5364
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:1406
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
Simd< T, 16/sizeof(T), 0 > Full128
Definition emu128-inl.h:31
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
Vec128< T, 4/sizeof(T)> Vec32
Definition arm_neon-inl.h:821
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
Vec128< T, 8/sizeof(T)> Vec64
Definition arm_neon-inl.h:818
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:842
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
Simd< T, 8/sizeof(T), 0 > Full64
Definition ops/shared-inl.h:240
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
typename V::PrivateT TFromV
Definition arm_neon-inl.h:845
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
N
Definition rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
const vfloat64m1_t v
Definition rvv-inl.h:1998
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
Definition aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:806
constexpr MakeSigned< T > MaxExponentTimes2()
Definition base.h:728
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:924
HWY_API constexpr bool IsSigned()
Definition base.h:642
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:607
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 :0x400)>
Definition base.h:627
HWY_API size_t PopCount(uint64_t x)
Definition base.h:865
constexpr int MantissaBits()
Definition base.h:712
constexpr MakeSigned< T > MaxExponentField()
Definition base.h:778
typename EnableIfT< Condition >::type EnableIf
Definition base.h:383
HWY_API constexpr bool IsFloat()
Definition base.h:635
typename detail::Relations< T >::Wide MakeWide
Definition base.h:601
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:595
#define HWY_ALIGN
Definition set_macros-inl.h:83
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
@ value
Definition arm_neon-inl.h:5730
Definition x86_128-inl.h:4130
__v128_u raw
Definition wasm_128-inl.h:2509
Definition ops/shared-inl.h:52
HWY_INLINE __f32x4 operator()(__v128_u v)
Definition wasm_128-inl.h:150
Definition x86_128-inl.h:178
HWY_INLINE __v128_u operator()(__v128_u v)
Definition wasm_128-inl.h:146
__f32x4 type
Definition wasm_128-inl.h:65
Definition x86_128-inl.h:55
__v128_u type
Definition wasm_128-inl.h:61
uint32_t x1
Definition t1_common.h:75