35 return *
this = (*
this * other);
38 return *
this = (*
this / other);
41 return *
this = (*
this + other);
44 return *
this = (*
this - other);
47 return *
this = (*
this & other);
50 return *
this = (*
this | other);
53 return *
this = (*
this ^ other);
68template <
typename T,
typename FromT>
70 const Half<
decltype(
d)> dh;
81 const Half<
decltype(
d)> dh;
83 ret.
v0 = ret.v1 =
Zero(dh);
93template <
typename T,
typename T2>
95 const Half<
decltype(
d)> dh;
97 ret.
v0 = ret.v1 =
Set(dh,
static_cast<T
>(t));
103 const Half<
decltype(
d)> dh;
109template <
typename T,
typename T2>
111 const Half<
decltype(
d)> dh;
116 static_cast<T
>(first),
Lanes(dh)));
174template <
int kBits,
typename T>
181template <
int kBits,
typename T>
189template <
int kBits,
typename T>
191 constexpr size_t kSizeInBits =
sizeof(T) * 8;
192 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
193 if (kBits == 0)
return v;
217 a.v1 =
Min(
a.v1, b.v1);
224 a.v1 =
Max(
a.v1, b.v1);
312 return mul *
x + add;
319 return add - mul *
x;
327 return mul *
x - sub;
334 return Neg(mul) *
x - sub;
350 return one /
Sqrt(
v);
390template <
typename T, HWY_IF_FLOAT(T)>
400template <
typename T, HWY_IF_FLOAT(T)>
409 const VFromD<
decltype(di)> exp =
418template <
typename TFrom,
typename TTo>
420 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
427 return (
v & bit) == bit;
498 a.v1 =
And(
a.v1, b.v1);
504 not_mask.v0 =
AndNot(not_mask.v0, mask.v0);
505 not_mask.v1 =
AndNot(not_mask.v1, mask.v1);
512 a.v1 =
Or(
a.v1, b.v1);
519 a.v1 =
Xor(
a.v1, b.v1);
530 return Or(o1,
Or(o2, o3));
535 return Or(o,
And(a1, a2));
564 static_assert(
IsFloat<T>(),
"Only makes sense for floating-point");
571 static_assert(
IsFloat<T>(),
"Only makes sense for floating-point");
588 const Half<
decltype(
d)> dh;
622template <
typename T, HWY_IF_FLOAT(T)>
682template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
697 const Half<
decltype(
d)> dh;
699 ret.
v0 =
Load(dh, aligned);
718 const Half<
decltype(
d)> dh;
720 ret.
v0 = ret.v1 =
Load(dh, p);
728 const Half<
decltype(
d)> dh;
754template <
typename T,
typename Offset>
757 constexpr size_t N = 32 /
sizeof(T);
758 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
760 alignas(32) T lanes[
N];
763 alignas(32) Offset offset_lanes[
N];
766 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
767 for (
size_t i = 0; i <
N; ++i) {
772template <
typename T,
typename Index>
775 constexpr size_t N = 32 /
sizeof(T);
776 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
778 alignas(32) T lanes[
N];
781 alignas(32) Index index_lanes[
N];
784 for (
size_t i = 0; i <
N; ++i) {
785 base[index_lanes[i]] = lanes[i];
791template <
typename T,
typename Offset>
794 constexpr size_t N = 32 /
sizeof(T);
795 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
797 alignas(32) Offset offset_lanes[
N];
800 alignas(32) T lanes[
N];
801 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
802 for (
size_t i = 0; i <
N; ++i) {
805 return Load(
d, lanes);
808template <
typename T,
typename Index>
811 constexpr size_t N = 32 /
sizeof(T);
812 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
814 alignas(32) Index index_lanes[
N];
817 alignas(32) T lanes[
N];
818 for (
size_t i = 0; i <
N; ++i) {
819 lanes[i] = base[index_lanes[i]];
821 return Load(
d, lanes);
829 alignas(32) T lanes[32 /
sizeof(T)];
838 alignas(32) T lanes[32 /
sizeof(T)];
841 return Load(
d, lanes);
864template <
int kBytes,
typename T>
866 const Half<
decltype(
d)> dh;
872template <
int kBytes,
typename T>
879template <
int kLanes,
typename T>
885template <
int kLanes,
typename T>
891template <
int kBytes,
typename T>
893 const Half<
decltype(
d)> dh;
900template <
int kLanes,
typename T>
915template <
int kBytes,
typename T,
class V = Vec256<T>>
917 const Half<
decltype(
d)> dh;
925template <
int kLane,
typename T>
936template <
typename T,
typename TI>
944template <
typename T,
typename TI,
size_t NI>
955template <
typename T,
size_t N,
typename TI>
965template <
class V,
class VI>
1001template <
typename T>
1008template <
typename T>
1018template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1024template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1030template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1042template <
typename T>
1048template <
typename T,
typename TI>
1050 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
1052 ret.i0 = vec.v0.raw;
1053 ret.i1 = vec.v1.raw;
1057template <
typename T,
typename TI>
1059 const Rebind<TI,
decltype(
d)> di;
1063template <
typename T>
1068 constexpr size_t kLanesPerHalf = 16 /
sizeof(TU);
1072 const Vec128<TU> mask =
Set(duh,
static_cast<TU
>(kLanesPerHalf - 1));
1089template <
typename T>
1096template <
typename T>
1098 const Half<
decltype(
d)> dh;
1106template <
typename T>
1108 const Half<
decltype(
d)> dh;
1117template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1119 const Half<
decltype(
d)> dh;
1126template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
1128 const Half<
decltype(
d)> dh;
1136template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1142template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1144 const Half<
decltype(
d)> dh;
1151template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
1153 const Half<
decltype(
d)> dh;
1161template <
typename T>
1172template <
typename T,
class V = Vec256<T>>
1174 const Half<
decltype(
d)> dh;
1184template <
typename T,
class DW = RepartitionToW
ide<Full256<T>>>
1188template <
typename T,
class D = Full256<T>,
class DW = RepartitionToW
ide<D>>
1193template <
typename T,
class D = Full256<T>,
class DW = RepartitionToW
ide<D>>
1201template <
typename T>
1210template <
typename T>
1212 const Half<
decltype(
d)> dh;
1217template <
typename T>
1227template <
typename T>
1237template <
typename T>
1247template <
typename T>
1257template <
typename T>
1260 const Half<
decltype(
d)> dh;
1268template <
typename T>
1271 const Half<
decltype(
d)> dh;
1279template <
typename T>
1287template <
typename T>
1295template <
typename T>
1303template <
typename T>
1310template <
typename T>
1319template <
typename T>
1338 wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(
v.raw))};
1347 wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(
v.raw))};
1370 wasm_i32x4_extend_high_i16x8(wasm_i16x8_extend_high_i8x16(
v.raw))};
1396 const auto mantissa = bits16 &
Set(du32, 0x3FF);
1397 const auto subnormal =
1399 Set(df32, 1.0f / 16384 / 1024));
1401 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
1402 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
1403 const auto normal =
ShiftLeft<23>(biased_exp32) | mantissa32;
1404 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
1417template <
typename T,
typename TN>
1419 const Half<
decltype(
d)> dh;
1427template <
typename TW,
typename TN>
1429 const Half<
decltype(
d)> dh;
1452 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.v0.raw,
v.v1.raw);
1453 return Vec64<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
1463 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.v0.raw,
v.v1.raw);
1464 return Vec64<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
1480 const Half<
decltype(d16)> d16h;
1488 const Half<
decltype(dbf16)> dbf16h;
1491 return Combine(dbf16, hi, lo);
1505 return Vec32<uint8_t>{wasm_i8x16_shuffle(
v.v0.raw,
v.v1.raw, 0, 8, 16, 24, 0,
1506 8, 16, 24, 0, 8, 16, 24, 0, 8, 16,
1513 17, 24, 25, 0, 1, 8, 9, 16, 17, 24,
1520 9, 10, 11, 16, 17, 18, 19, 24, 25,
1526 return Vec64<uint8_t>{wasm_i8x16_shuffle(
v.v0.raw,
v.v1.raw, 0, 4, 8, 12, 16,
1527 20, 24, 28, 0, 4, 8, 12, 16, 20, 24,
1534 9, 12, 13, 16, 17, 20, 21, 24, 25,
1541 10, 12, 14, 16, 18, 20, 22, 24, 26,
1554 const Half<
decltype(d16)> d16h;
1563template <
typename TTo,
typename TFrom>
1565 const Half<
decltype(
d)> dh;
1581template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)>
1584 const Half<
decltype(
d)> dh;
1589 constexpr size_t kBitsPerHalf = 16 /
sizeof(T);
1590 const uint8_t bits_upper[8] = {
static_cast<uint8_t
>(
bits[0] >> kBitsPerHalf)};
1595template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
1598 const Half<
decltype(
d)> dh;
1601 constexpr size_t kLanesPerHalf = 16 /
sizeof(T);
1602 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
1603 static_assert(kBytesPerHalf != 0,
"Lane size <= 16 bits => at least 8 lanes");
1611template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)>
1614 const Half<
decltype(
d)> dh;
1616 const uint8_t lo =
bits[0];
1620 constexpr size_t kBitsPerHalf = 16 /
sizeof(T);
1621 bits[0] =
static_cast<uint8_t
>(lo | (
bits[0] << kBitsPerHalf));
1622 return (kBitsPerHalf * 2 + 7) / 8;
1625template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
1628 const Half<
decltype(
d)> dh;
1629 constexpr size_t kLanesPerHalf = 16 /
sizeof(T);
1630 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
1631 static_assert(kBytesPerHalf != 0,
"Lane size <= 16 bits => at least 8 lanes");
1634 return kBytesPerHalf * 2;
1637template <
typename T>
1639 const Half<
decltype(
d)> dh;
1643template <
typename T>
1645 const Half<
decltype(
d)> dh;
1649template <
typename T>
1651 const Half<
decltype(
d)> dh;
1655template <
typename T>
1657 const Half<
decltype(
d)> dh;
1659 constexpr size_t kLanesPerHalf = 16 /
sizeof(T);
1660 return lo >= 0 ?
static_cast<size_t>(lo)
1664template <
typename T>
1666 const Half<
decltype(
d)> dh;
1669 if (lo < 0 && hi < 0)
return lo;
1670 constexpr int kLanesPerHalf = 16 /
sizeof(T);
1671 return lo >= 0 ? lo : hi + kLanesPerHalf;
1675template <
typename T>
1678 const Half<
decltype(
d)> dh;
1681 return count + count2;
1685template <
typename T>
1688 const Half<
decltype(
d)> dh;
1691 return count + count2;
1696template <
typename T>
1706template <
typename T>
1709 alignas(32) T lanes[32 /
sizeof(T)] = {};
1711 return Load(
d, lanes);
1715template <
typename T>
1731template <
typename T>
1751template <
typename T>
1755 constexpr size_t N = 32 /
sizeof(T);
1775template <
typename T>
1780 constexpr size_t N = 32 /
sizeof(T);
1806template <
typename T>
1810 constexpr size_t N = 32 /
sizeof(T);
1813 StoreU(out0,
d, unaligned + 0 *
N);
1814 StoreU(out1,
d, unaligned + 1 *
N);
1825template <
typename T>
1829 constexpr size_t N = 32 /
sizeof(T);
1833 StoreU(out0,
d, unaligned + 0 *
N);
1834 StoreU(out1,
d, unaligned + 1 *
N);
1835 StoreU(out2,
d, unaligned + 2 *
N);
1848template <
typename T>
1852 constexpr size_t N = 32 /
sizeof(T);
1856 StoreU(out0,
d, unaligned + 0 *
N);
1857 StoreU(out1,
d, unaligned + 1 *
N);
1860 StoreU(out2,
d, unaligned + 2 *
N);
1861 StoreU(out3,
d, unaligned + 3 *
N);
1867template <
typename TN,
typename TW>
1871 const Half<
decltype(
d)> dh;
1878template <
typename TW>
1887template <
typename T>
1889 const Half<
decltype(
d)> dh;
1894template <
typename T>
1896 const Half<
decltype(
d)> dh;
1901template <
typename T>
1903 const Half<
decltype(
d)> dh;
1910template <
typename T>
1912 const Half<
decltype(
d)> dh;
1915 ret.m1 =
Lt128(dh,
a.v1, b.v1);
1919template <
typename T>
1921 const Half<
decltype(
d)> dh;
1928template <
typename T>
1930 const Half<
decltype(
d)> dh;
1933 ret.m1 =
Eq128(dh,
a.v1, b.v1);
1937template <
typename T>
1939 const Half<
decltype(
d)> dh;
1946template <
typename T>
1948 const Half<
decltype(
d)> dh;
1951 ret.m1 =
Ne128(dh,
a.v1, b.v1);
1955template <
typename T>
1957 const Half<
decltype(
d)> dh;
1964template <
typename T>
1966 const Half<
decltype(
d)> dh;
1969 ret.v1 =
Min128(dh,
a.v1, b.v1);
1973template <
typename T>
1975 const Half<
decltype(
d)> dh;
1978 ret.v1 =
Max128(dh,
a.v1, b.v1);
1982template <
typename T>
1984 const Half<
decltype(
d)> dh;
1991template <
typename T>
1993 const Half<
decltype(
d)> dh;
size_t offset
Definition BitIO.h:80
uint32_t x
Definition BlockExec.h:38
uint8_t * bits
Definition TileProcessor.h:59
#define HWY_RESTRICT
Definition base.h:64
#define HWY_API
Definition base.h:129
#define HWY_INLINE
Definition base.h:70
#define HWY_ASSERT(condition)
Definition base.h:192
Definition x86_128-inl.h:137
Definition x86_128-inl.h:70
Raw raw
Definition arm_neon-inl.h:814
Definition x86_256-inl.h:82
HWY_INLINE Vec256 & operator^=(const Vec256 other)
Definition wasm_256-inl.h:52
HWY_INLINE Vec256 & operator&=(const Vec256 other)
Definition wasm_256-inl.h:46
HWY_INLINE Vec256 & operator-=(const Vec256 other)
Definition wasm_256-inl.h:43
HWY_INLINE Vec256 & operator+=(const Vec256 other)
Definition wasm_256-inl.h:40
Vec128< T > v1
Definition wasm_256-inl.h:57
HWY_INLINE Vec256 & operator|=(const Vec256 other)
Definition wasm_256-inl.h:49
HWY_INLINE Vec256 & operator/=(const Vec256 other)
Definition wasm_256-inl.h:37
static constexpr size_t kPrivateN
Definition wasm_256-inl.h:30
Vec128< T > v0
Definition wasm_256-inl.h:56
HWY_INLINE Vec256 & operator*=(const Vec256 other)
Definition wasm_256-inl.h:34
T PrivateT
Definition wasm_256-inl.h:29
uint32_t a
only used by MQ decoder
Definition mqc.h:48
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2413
HWY_API void LoadTransposedBlocks3(Simd< T, N, 0 > d, const T *HWY_RESTRICT unaligned, V &A, V &B, V &C)
Definition generic_ops-inl.h:159
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2451
HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:470
HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:862
HWY_API svfloat32_t PromoteUpperTo(Simd< float, N, kPow2 > df, svfloat16_t v)
Definition arm_sve-inl.h:1299
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2432
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition x86_128-inl.h:670
HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition generic_ops-inl.h:505
HWY_API Vec128< ToT, N > ConvertTo(hwy::FloatTag, Simd< ToT, N, 0 >, Vec128< FromT, N > from)
Definition emu128-inl.h:1685
HWY_API void LoadTransposedBlocks4(Simd< T, N, 0 > d, const T *HWY_RESTRICT unaligned, V &A, V &B, V &C, V &D)
Definition generic_ops-inl.h:340
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Vec256< T > TableLookupLanesOr0(Vec256< T > v, Indices256< T > idx)
Definition wasm_256-inl.h:1090
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
Simd< T, 32/sizeof(T), 0 > Full256
Definition wasm_128-inl.h:54
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
N
Definition rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
const vfloat64m1_t v
Definition rvv-inl.h:1998
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
Definition aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
constexpr MakeSigned< T > MaxExponentTimes2()
Definition base.h:728
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:593
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 :0x400)>
Definition base.h:627
constexpr int MantissaBits()
Definition base.h:712
constexpr MakeSigned< T > MaxExponentField()
Definition base.h:778
HWY_API constexpr bool IsFloat()
Definition base.h:635
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:595
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
Definition x86_128-inl.h:4130
Definition x86_256-inl.h:3022
__v128_u i0
Definition wasm_256-inl.h:1044
__v128_u i1
Definition wasm_256-inl.h:1045
Definition x86_256-inl.h:143
Mask128< T > m1
Definition wasm_256-inl.h:63
Mask128< T > m0
Definition wasm_256-inl.h:62
Definition ops/shared-inl.h:52
uint32_t x1
Definition t1_common.h:75