x86_256-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when
17// compiling for that target.
18// External include guard in highway.h - see comment there.
19
20// WARNING: most operations do not cross 128-bit block boundaries. In
21// particular, "Broadcast", pack and zip behavior may be surprising.
22
23// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
24#include "hwy/base.h"
25
26// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
27// https://github.com/google/highway/issues/710
28HWY_DIAGNOSTICS(push)
29#if HWY_COMPILER_GCC_ACTUAL
30HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
31HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
32#endif
33
34// Must come before HWY_COMPILER_CLANGCL
35#include <immintrin.h> // AVX2+
36
37#if HWY_COMPILER_CLANGCL
38// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
39// including these headers when _MSC_VER is defined, like when using clang-cl.
40// Include these directly here.
41#include <avxintrin.h>
42// avxintrin defines __m256i and must come before avx2intrin.
43#include <avx2intrin.h>
44#include <bmi2intrin.h> // _pext_u64
45#include <f16cintrin.h>
46#include <fmaintrin.h>
47#include <smmintrin.h>
48#endif // HWY_COMPILER_CLANGCL
49
50#include <stddef.h>
51#include <stdint.h>
52#include <string.h> // memcpy
53
54#if HWY_IS_MSAN
55#include <sanitizer/msan_interface.h>
56#endif
57
58// For half-width vectors. Already includes base.h and shared-inl.h.
59#include "hwy/ops/x86_128-inl.h"
60
61HWY_BEFORE_NAMESPACE();
62namespace hwy {
63namespace HWY_NAMESPACE {
64namespace detail {
65
66template <typename T>
67struct Raw256 {
68 using type = __m256i;
69};
70template <>
71struct Raw256<float> {
72 using type = __m256;
73};
74template <>
75struct Raw256<double> {
76 using type = __m256d;
77};
78
79} // namespace detail
80
81template <typename T>
82class Vec256 {
83 using Raw = typename detail::Raw256<T>::type;
84
85 public:
86 using PrivateT = T; // only for DFromV
87 static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV
88
89 // Compound assignment. Only usable if there is a corresponding non-member
90 // binary operator overload. For example, only f32 and f64 support division.
91 HWY_INLINE Vec256& operator*=(const Vec256 other) {
92 return *this = (*this * other);
93 }
94 HWY_INLINE Vec256& operator/=(const Vec256 other) {
95 return *this = (*this / other);
96 }
97 HWY_INLINE Vec256& operator+=(const Vec256 other) {
98 return *this = (*this + other);
99 }
100 HWY_INLINE Vec256& operator-=(const Vec256 other) {
101 return *this = (*this - other);
102 }
103 HWY_INLINE Vec256& operator&=(const Vec256 other) {
104 return *this = (*this & other);
105 }
106 HWY_INLINE Vec256& operator|=(const Vec256 other) {
107 return *this = (*this | other);
108 }
109 HWY_INLINE Vec256& operator^=(const Vec256 other) {
110 return *this = (*this ^ other);
111 }
112
113 Raw raw;
114};
115
116#if HWY_TARGET <= HWY_AVX3
117
118namespace detail {
119
120// Template arg: sizeof(lane type)
121template <size_t size>
122struct RawMask256 {};
123template <>
124struct RawMask256<1> {
125 using type = __mmask32;
126};
127template <>
128struct RawMask256<2> {
129 using type = __mmask16;
130};
131template <>
132struct RawMask256<4> {
133 using type = __mmask8;
134};
135template <>
136struct RawMask256<8> {
137 using type = __mmask8;
138};
139
140} // namespace detail
141
142template <typename T>
143struct Mask256 {
144 using Raw = typename detail::RawMask256<sizeof(T)>::type;
145
146 static Mask256<T> FromBits(uint64_t mask_bits) {
147 return Mask256<T>{static_cast<Raw>(mask_bits)};
148 }
149
150 Raw raw;
151};
152
153#else // AVX2
154
155// FF..FF or 0.
156template <typename T>
157struct Mask256 {
158 typename detail::Raw256<T>::type raw;
159};
160
161#endif // HWY_TARGET <= HWY_AVX3
162
163template <typename T>
164using Full256 = Simd<T, 32 / sizeof(T), 0>;
165
166// ------------------------------ BitCast
167
168namespace detail {
169
170HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
171HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); }
172HWY_INLINE __m256i BitCastToInteger(__m256d v) {
173 return _mm256_castpd_si256(v);
174}
175
176template <typename T>
177HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
178 return Vec256<uint8_t>{BitCastToInteger(v.raw)};
179}
180
181// Cannot rely on function overloading because return types differ.
182template <typename T>
183struct BitCastFromInteger256 {
184 HWY_INLINE __m256i operator()(__m256i v) { return v; }
185};
186template <>
187struct BitCastFromInteger256<float> {
188 HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); }
189};
190template <>
191struct BitCastFromInteger256<double> {
192 HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); }
193};
194
195template <typename T>
196HWY_INLINE Vec256<T> BitCastFromByte(Full256<T> /* tag */, Vec256<uint8_t> v) {
197 return Vec256<T>{BitCastFromInteger256<T>()(v.raw)};
198}
199
200} // namespace detail
201
202template <typename T, typename FromT>
203HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
204 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
205}
206
207// ------------------------------ Set
208
209// Returns an all-zero vector.
210template <typename T>
211HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
212 return Vec256<T>{_mm256_setzero_si256()};
213}
214HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
215 return Vec256<float>{_mm256_setzero_ps()};
216}
217HWY_API Vec256<double> Zero(Full256<double> /* tag */) {
218 return Vec256<double>{_mm256_setzero_pd()};
219}
220
221// Returns a vector with all lanes set to "t".
222HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
223 return Vec256<uint8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
224}
225HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
226 return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
227}
228HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
229 return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
230}
231HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
232 return Vec256<uint64_t>{
233 _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
234}
235HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
236 return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
237}
238HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
239 return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
240}
241HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
242 return Vec256<int32_t>{_mm256_set1_epi32(t)};
243}
244HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
245 return Vec256<int64_t>{
246 _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
247}
248HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
249 return Vec256<float>{_mm256_set1_ps(t)};
250}
251HWY_API Vec256<double> Set(Full256<double> /* tag */, const double t) {
252 return Vec256<double>{_mm256_set1_pd(t)};
253}
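// Example (editorial addition, not part of the upstream header): a minimal
// sketch of how the Zero/Set/BitCast ops above combine. The function name is
// ours, purely illustrative.
HWY_INLINE Vec256<uint32_t> ExampleSetThenBitCast() {
  const Full256<float> df;
  const Full256<uint32_t> du;
  const Vec256<float> ones = Set(df, 1.0f);  // all 8 lanes = 1.0f
  return BitCast(du, ones);                  // all 8 lanes = 0x3F800000
}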
254
255HWY_DIAGNOSTICS(push)
256HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
257
258// Returns a vector with uninitialized elements.
259template <typename T>
260HWY_API Vec256<T> Undefined(Full256<T> /* tag */) {
261 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
262 // generate an XOR instruction.
263 return Vec256<T>{_mm256_undefined_si256()};
264}
265HWY_API Vec256<float> Undefined(Full256<float> /* tag */) {
266 return Vec256<float>{_mm256_undefined_ps()};
267}
268HWY_API Vec256<double> Undefined(Full256<double> /* tag */) {
269 return Vec256<double>{_mm256_undefined_pd()};
270}
271
272HWY_DIAGNOSTICS(pop)
273
274// ================================================== LOGICAL
275
276// ------------------------------ And
277
278template <typename T>
279HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
280 return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
281}
282
283HWY_API Vec256<float> And(Vec256<float> a, Vec256<float> b) {
284 return Vec256<float>{_mm256_and_ps(a.raw, b.raw)};
285}
286HWY_API Vec256<double> And(Vec256<double> a, Vec256<double> b) {
287 return Vec256<double>{_mm256_and_pd(a.raw, b.raw)};
288}
289
290// ------------------------------ AndNot
291
292// Returns ~not_mask & mask.
293template <typename T>
294HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
295 return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
296}
297HWY_API Vec256<float> AndNot(const Vec256<float> not_mask,
298 const Vec256<float> mask) {
299 return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
300}
301HWY_API Vec256<double> AndNot(const Vec256<double> not_mask,
302 const Vec256<double> mask) {
303 return Vec256<double>{_mm256_andnot_pd(not_mask.raw, mask.raw)};
304}
305
306// ------------------------------ Or
307
308template <typename T>
309HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
310 return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};
311}
312
313HWY_API Vec256<float> Or(Vec256<float> a, Vec256<float> b) {
314 return Vec256<float>{_mm256_or_ps(a.raw, b.raw)};
315}
316HWY_API Vec256<double> Or(Vec256<double> a, Vec256<double> b) {
317 return Vec256<double>{_mm256_or_pd(a.raw, b.raw)};
318}
319
320// ------------------------------ Xor
321
322template <typename T>
323HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
324 return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
325}
326
327HWY_API Vec256<float> Xor(Vec256<float> a, Vec256<float> b) {
328 return Vec256<float>{_mm256_xor_ps(a.raw, b.raw)};
329}
330HWY_API Vec256<double> Xor(Vec256<double> a, Vec256<double> b) {
331 return Vec256<double>{_mm256_xor_pd(a.raw, b.raw)};
332}
333
334// ------------------------------ Not
335template <typename T>
336HWY_API Vec256<T> Not(const Vec256<T> v) {
337 using TU = MakeUnsigned<T>;
338#if HWY_TARGET <= HWY_AVX3
339 const __m256i vu = BitCast(Full256<TU>(), v).raw;
340 return BitCast(Full256<T>(),
341 Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
342#else
343 return Xor(v, BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
344#endif
345}
346
347// ------------------------------ Xor3
348template <typename T>
349HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
350#if HWY_TARGET <= HWY_AVX3
351 const Full256<T> d;
352 const RebindToUnsigned<decltype(d)> du;
353 using VU = VFromD<decltype(du)>;
354 const __m256i ret = _mm256_ternarylogic_epi64(
355 BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
356 return BitCast(d, VU{ret});
357#else
358 return Xor(x1, Xor(x2, x3));
359#endif
360}
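// Editorial note (not in the upstream header): the 0x96 immediate encodes the
// three-input truth table of a ^ b ^ c. Bit i of the immediate is the output
// for inputs (a, b, c) = (bit 2, bit 1, bit 0) of i:
//  a b c | a^b^c
//  0 0 0 |  0
//  0 0 1 |  1
//  0 1 0 |  1
//  0 1 1 |  0
//  1 0 0 |  1
//  1 0 1 |  0
//  1 1 0 |  0
//  1 1 1 |  1
// Reading the output column from i=7 down to i=0 gives 1001'0110 = 0x96; the
// same construction yields 0xFE for Or3 and 0xCA for (a ? b : c) below.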
361
362// ------------------------------ Or3
363template <typename T>
364HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
365#if HWY_TARGET <= HWY_AVX3
366 const Full256<T> d;
367 const RebindToUnsigned<decltype(d)> du;
368 using VU = VFromD<decltype(du)>;
369 const __m256i ret = _mm256_ternarylogic_epi64(
370 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
371 return BitCast(d, VU{ret});
372#else
373 return Or(o1, Or(o2, o3));
374#endif
375}
376
377// ------------------------------ OrAnd
378template <typename T>
379HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
380#if HWY_TARGET <= HWY_AVX3
381 const Full256<T> d;
382 const RebindToUnsigned<decltype(d)> du;
383 using VU = VFromD<decltype(du)>;
384 const __m256i ret = _mm256_ternarylogic_epi64(
385 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
386 return BitCast(d, VU{ret});
387#else
388 return Or(o, And(a1, a2));
389#endif
390}
391
392// ------------------------------ IfVecThenElse
393template <typename T>
394HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
395#if HWY_TARGET <= HWY_AVX3
396 const Full256<T> d;
397 const RebindToUnsigned<decltype(d)> du;
398 using VU = VFromD<decltype(du)>;
399 return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
400 BitCast(du, yes).raw,
401 BitCast(du, no).raw, 0xCA)});
402#else
403 return IfThenElse(MaskFromVec(mask), yes, no);
404#endif
405}
406
407// ------------------------------ Operator overloads (internal-only if float)
408
409template <typename T>
410HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
411 return And(a, b);
412}
413
414template <typename T>
415HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
416 return Or(a, b);
417}
418
419template <typename T>
420HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
421 return Xor(a, b);
422}
423
424// ------------------------------ PopulationCount
425
426// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
427#if HWY_TARGET == HWY_AVX3_DL
428
429#ifdef HWY_NATIVE_POPCNT
430#undef HWY_NATIVE_POPCNT
431#else
432#define HWY_NATIVE_POPCNT
433#endif
434
435namespace detail {
436
437template <typename T>
438HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
439 return Vec256<T>{_mm256_popcnt_epi8(v.raw)};
440}
441template <typename T>
442HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
443 return Vec256<T>{_mm256_popcnt_epi16(v.raw)};
444}
445template <typename T>
446HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
447 return Vec256<T>{_mm256_popcnt_epi32(v.raw)};
448}
449template <typename T>
450HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
451 return Vec256<T>{_mm256_popcnt_epi64(v.raw)};
452}
453
454} // namespace detail
455
456template <typename T>
457HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
458 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
459}
460
461#endif // HWY_TARGET == HWY_AVX3_DL
462
463// ================================================== SIGN
464
465// ------------------------------ CopySign
466
467template <typename T>
468HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
469 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
470
471 const Full256<T> d;
472 const auto msb = SignBit(d);
473
474#if HWY_TARGET <= HWY_AVX3
475 const Rebind<MakeUnsigned<T>, decltype(d)> du;
476 // Truth table for msb, magn, sign | bitwise msb ? sign : mag
477 // 0 0 0 | 0
478 // 0 0 1 | 0
479 // 0 1 0 | 1
480 // 0 1 1 | 1
481 // 1 0 0 | 0
482 // 1 0 1 | 1
483 // 1 1 0 | 0
484 // 1 1 1 | 1
485 // The lane size does not matter because we are not using predication.
486 const __m256i out = _mm256_ternarylogic_epi32(
487 BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
488 return BitCast(d, decltype(Zero(du)){out});
489#else
490 return Or(AndNot(msb, magn), And(msb, sign));
491#endif
492}
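// Example (editorial addition, not in the upstream header): CopySign keeps the
// magnitude of its first argument and the sign of its second. The template and
// its name are ours, purely illustrative.
template <typename T>
HWY_INLINE Vec256<T> ExampleForceNegative(Vec256<T> magn) {
  const Full256<T> d;
  // E.g. lanes holding 1.5 come back as -1.5.
  return CopySign(magn, Set(d, static_cast<T>(-0.0)));
}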
493
494template <typename T>
495HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
496#if HWY_TARGET <= HWY_AVX3
497 // AVX3 can also handle abs < 0, so no extra action needed.
498 return CopySign(abs, sign);
499#else
500 return Or(abs, And(SignBit(Full256<T>()), sign));
501#endif
502}
503
504// ================================================== MASK
505
506#if HWY_TARGET <= HWY_AVX3
507
508// ------------------------------ IfThenElse
509
510// Returns mask ? b : a.
511
512namespace detail {
513
514// Templates for signed/unsigned integer of a particular size.
515template <typename T>
517 Vec256<T> yes, Vec256<T> no) {
518 return Vec256<T>{_mm256_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
519}
520template <typename T>
522 Vec256<T> yes, Vec256<T> no) {
523 return Vec256<T>{_mm256_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
524}
525template <typename T>
527 Vec256<T> yes, Vec256<T> no) {
528 return Vec256<T>{_mm256_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
529}
530template <typename T>
532 Vec256<T> yes, Vec256<T> no) {
533 return Vec256<T>{_mm256_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
534}
535
536} // namespace detail
537
538template <typename T>
539HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
540 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
541}
542HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes,
543 Vec256<float> no) {
544 return Vec256<float>{_mm256_mask_mov_ps(no.raw, mask.raw, yes.raw)};
545}
546HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes,
547 Vec256<double> no) {
548 return Vec256<double>{_mm256_mask_mov_pd(no.raw, mask.raw, yes.raw)};
549}
550
551namespace detail {
552
553template <typename T>
555 Vec256<T> yes) {
556 return Vec256<T>{_mm256_maskz_mov_epi8(mask.raw, yes.raw)};
557}
558template <typename T>
560 Vec256<T> yes) {
561 return Vec256<T>{_mm256_maskz_mov_epi16(mask.raw, yes.raw)};
562}
563template <typename T>
565 Vec256<T> yes) {
566 return Vec256<T>{_mm256_maskz_mov_epi32(mask.raw, yes.raw)};
567}
568template <typename T>
570 Vec256<T> yes) {
571 return Vec256<T>{_mm256_maskz_mov_epi64(mask.raw, yes.raw)};
572}
573
574} // namespace detail
575
576template <typename T>
577HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
578 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
579}
580HWY_API Vec256<float> IfThenElseZero(Mask256<float> mask, Vec256<float> yes) {
581 return Vec256<float>{_mm256_maskz_mov_ps(mask.raw, yes.raw)};
582}
583HWY_API Vec256<double> IfThenElseZero(Mask256<double> mask,
584 Vec256<double> yes) {
585 return Vec256<double>{_mm256_maskz_mov_pd(mask.raw, yes.raw)};
586}
587
588namespace detail {
589
590template <typename T>
592 Vec256<T> no) {
593 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
594 return Vec256<T>{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
595}
596template <typename T>
598 Vec256<T> no) {
599 return Vec256<T>{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
600}
601template <typename T>
603 Vec256<T> no) {
604 return Vec256<T>{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
605}
606template <typename T>
608 Vec256<T> no) {
609 return Vec256<T>{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
610}
611
612} // namespace detail
613
614template <typename T>
615HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
616 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
617}
618HWY_API Vec256<float> IfThenZeroElse(Mask256<float> mask, Vec256<float> no) {
619 return Vec256<float>{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
620}
621HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
622 return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
623}
624
625template <typename T>
626HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
627 static_assert(IsSigned<T>(), "Only for float");
628 // AVX3 MaskFromVec only looks at the MSB
629 return IfThenZeroElse(MaskFromVec(v), v);
630}
631
632// ------------------------------ Mask logical
633
634namespace detail {
635
636template <typename T>
638 const Mask256<T> b) {
639#if HWY_COMPILER_HAS_MASK_INTRINSICS
640 return Mask256<T>{_kand_mask32(a.raw, b.raw)};
641#else
642 return Mask256<T>{static_cast<__mmask32>(a.raw & b.raw)};
643#endif
644}
645template <typename T>
647 const Mask256<T> b) {
648#if HWY_COMPILER_HAS_MASK_INTRINSICS
649 return Mask256<T>{_kand_mask16(a.raw, b.raw)};
650#else
651 return Mask256<T>{static_cast<__mmask16>(a.raw & b.raw)};
652#endif
653}
654template <typename T>
656 const Mask256<T> b) {
657#if HWY_COMPILER_HAS_MASK_INTRINSICS
658 return Mask256<T>{_kand_mask8(a.raw, b.raw)};
659#else
660 return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
661#endif
662}
663template <typename T>
665 const Mask256<T> b) {
666#if HWY_COMPILER_HAS_MASK_INTRINSICS
667 return Mask256<T>{_kand_mask8(a.raw, b.raw)};
668#else
669 return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
670#endif
671}
672
673template <typename T>
675 const Mask256<T> b) {
676#if HWY_COMPILER_HAS_MASK_INTRINSICS
677 return Mask256<T>{_kandn_mask32(a.raw, b.raw)};
678#else
679 return Mask256<T>{static_cast<__mmask32>(~a.raw & b.raw)};
680#endif
681}
682template <typename T>
684 const Mask256<T> b) {
685#if HWY_COMPILER_HAS_MASK_INTRINSICS
686 return Mask256<T>{_kandn_mask16(a.raw, b.raw)};
687#else
688 return Mask256<T>{static_cast<__mmask16>(~a.raw & b.raw)};
689#endif
690}
691template <typename T>
693 const Mask256<T> b) {
694#if HWY_COMPILER_HAS_MASK_INTRINSICS
695 return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
696#else
697 return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
698#endif
699}
700template <typename T>
702 const Mask256<T> b) {
703#if HWY_COMPILER_HAS_MASK_INTRINSICS
704 return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
705#else
706 return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
707#endif
708}
709
710template <typename T>
712 const Mask256<T> b) {
713#if HWY_COMPILER_HAS_MASK_INTRINSICS
714 return Mask256<T>{_kor_mask32(a.raw, b.raw)};
715#else
716 return Mask256<T>{static_cast<__mmask32>(a.raw | b.raw)};
717#endif
718}
719template <typename T>
721 const Mask256<T> b) {
722#if HWY_COMPILER_HAS_MASK_INTRINSICS
723 return Mask256<T>{_kor_mask16(a.raw, b.raw)};
724#else
725 return Mask256<T>{static_cast<__mmask16>(a.raw | b.raw)};
726#endif
727}
728template <typename T>
730 const Mask256<T> b) {
731#if HWY_COMPILER_HAS_MASK_INTRINSICS
732 return Mask256<T>{_kor_mask8(a.raw, b.raw)};
733#else
734 return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
735#endif
736}
737template <typename T>
739 const Mask256<T> b) {
740#if HWY_COMPILER_HAS_MASK_INTRINSICS
741 return Mask256<T>{_kor_mask8(a.raw, b.raw)};
742#else
743 return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
744#endif
745}
746
747template <typename T>
749 const Mask256<T> b) {
750#if HWY_COMPILER_HAS_MASK_INTRINSICS
751 return Mask256<T>{_kxor_mask32(a.raw, b.raw)};
752#else
753 return Mask256<T>{static_cast<__mmask32>(a.raw ^ b.raw)};
754#endif
755}
756template <typename T>
758 const Mask256<T> b) {
759#if HWY_COMPILER_HAS_MASK_INTRINSICS
760 return Mask256<T>{_kxor_mask16(a.raw, b.raw)};
761#else
762 return Mask256<T>{static_cast<__mmask16>(a.raw ^ b.raw)};
763#endif
764}
765template <typename T>
767 const Mask256<T> b) {
768#if HWY_COMPILER_HAS_MASK_INTRINSICS
769 return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
770#else
771 return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
772#endif
773}
774template <typename T>
776 const Mask256<T> b) {
777#if HWY_COMPILER_HAS_MASK_INTRINSICS
778 return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
779#else
780 return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
781#endif
782}
783
784template <typename T>
786 const Mask256<T> a, const Mask256<T> b) {
787#if HWY_COMPILER_HAS_MASK_INTRINSICS
788 return Mask256<T>{_kxnor_mask32(a.raw, b.raw)};
789#else
790 return Mask256<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
791#endif
792}
793template <typename T>
795 const Mask256<T> a, const Mask256<T> b) {
796#if HWY_COMPILER_HAS_MASK_INTRINSICS
797 return Mask256<T>{_kxnor_mask16(a.raw, b.raw)};
798#else
799 return Mask256<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
800#endif
801}
802template <typename T>
804 const Mask256<T> a, const Mask256<T> b) {
805#if HWY_COMPILER_HAS_MASK_INTRINSICS
806 return Mask256<T>{_kxnor_mask8(a.raw, b.raw)};
807#else
808 return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
809#endif
810}
811template <typename T>
813 const Mask256<T> a, const Mask256<T> b) {
814#if HWY_COMPILER_HAS_MASK_INTRINSICS
815 return Mask256<T>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
816#else
817 return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
818#endif
819}
820
821} // namespace detail
822
823template <typename T>
824HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
825 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
826}
827
828template <typename T>
829HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
830 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
831}
832
833template <typename T>
834HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
835 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
836}
837
838template <typename T>
839HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
840 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
841}
842
843template <typename T>
844HWY_API Mask256<T> Not(const Mask256<T> m) {
845 // Flip only the valid bits.
846 constexpr size_t N = 32 / sizeof(T);
847 return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
848}
849
850template <typename T>
851HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
852 return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
853}
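// Example (editorial addition, not in the upstream header): under AVX-512VL a
// Mask256 is just a k-register bitmask (one bit per lane), so the logical ops
// above compile to kand/kor/kxor. The function name is ours.
HWY_INLINE Mask256<float> ExampleMaskLogic() {
  const Mask256<float> lower = Mask256<float>::FromBits(0x0F);  // lanes 0..3
  const Mask256<float> upper = Not(lower);                      // lanes 4..7
  return ExclusiveNeither(lower, upper);  // every lane is in one -> all false
}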
854
855#else // AVX2
856
857// ------------------------------ Mask
858
859// Mask and Vec are the same (true = FF..FF).
860template <typename T>
861HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
862 return Mask256<T>{v.raw};
863}
864
865template <typename T>
866HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
867 return Vec256<T>{v.raw};
868}
869
870template <typename T>
871HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
872 return Vec256<T>{v.raw};
873}
874
875// ------------------------------ IfThenElse
876
877// mask ? yes : no
878template <typename T>
879HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
880 const Vec256<T> no) {
881 return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
882}
883HWY_API Vec256<float> IfThenElse(const Mask256<float> mask,
884 const Vec256<float> yes,
885 const Vec256<float> no) {
886 return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
887}
888HWY_API Vec256<double> IfThenElse(const Mask256<double> mask,
889 const Vec256<double> yes,
890 const Vec256<double> no) {
891 return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
892}
893
894// mask ? yes : 0
895template <typename T>
896HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
897 return yes & VecFromMask(Full256<T>(), mask);
898}
899
900// mask ? 0 : no
901template <typename T>
902HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
903 return AndNot(VecFromMask(Full256<T>(), mask), no);
904}
905
906template <typename T>
907HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
908 static_assert(IsSigned<T>(), "Only for float");
909 const auto zero = Zero(Full256<T>());
910 // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
911 return IfThenElse(MaskFromVec(v), zero, v);
912}
913
914// ------------------------------ Mask logical
915
916template <typename T>
917HWY_API Mask256<T> Not(const Mask256<T> m) {
918 return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
919}
920
921template <typename T>
922HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
923 const Full256<T> d;
924 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
925}
926
927template <typename T>
928HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
929 const Full256<T> d;
930 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
931}
932
933template <typename T>
934HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
935 const Full256<T> d;
936 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
937}
938
939template <typename T>
940HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
941 const Full256<T> d;
942 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
943}
944
945template <typename T>
946HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
947 const Full256<T> d;
948 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
949}
950
951#endif // HWY_TARGET <= HWY_AVX3
952
953// ================================================== COMPARE
954
955#if HWY_TARGET <= HWY_AVX3
956
957// Comparisons set a mask bit to 1 if the condition is true, else 0.
958
959template <typename TFrom, typename TTo>
960HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
961 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
962 return Mask256<TTo>{m.raw};
963}
964
965namespace detail {
966
967template <typename T>
969 const Vec256<T> bit) {
970 return Mask256<T>{_mm256_test_epi8_mask(v.raw, bit.raw)};
971}
972template <typename T>
974 const Vec256<T> bit) {
975 return Mask256<T>{_mm256_test_epi16_mask(v.raw, bit.raw)};
976}
977template <typename T>
979 const Vec256<T> bit) {
980 return Mask256<T>{_mm256_test_epi32_mask(v.raw, bit.raw)};
981}
982template <typename T>
984 const Vec256<T> bit) {
985 return Mask256<T>{_mm256_test_epi64_mask(v.raw, bit.raw)};
986}
987
988} // namespace detail
989
990template <typename T>
991HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
992 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
993 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
994}
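// Example (editorial addition, not in the upstream header): TestBit returns a
// mask of the lanes in which the bit(s) selected by `bit` are set. The
// function name is ours, purely illustrative.
HWY_INLINE Mask256<uint32_t> ExampleHasBit3(Vec256<uint32_t> v) {
  const Full256<uint32_t> d;
  return TestBit(v, Set(d, uint32_t{1} << 3));
}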
995
996// ------------------------------ Equality
997
998template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1000 return Mask256<T>{_mm256_cmpeq_epi8_mask(a.raw, b.raw)};
1001}
1002template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1003HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1004 return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
1005}
1006template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1007HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1008 return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
1009}
1010template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1011HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1012 return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
1013}
1014
1015HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
1016 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1017}
1018
1019HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
1020 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1021}
1022
1023// ------------------------------ Inequality
1024
1025template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1027 return Mask256<T>{_mm256_cmpneq_epi8_mask(a.raw, b.raw)};
1028}
1029template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1030HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1031 return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
1032}
1033template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1034HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1035 return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
1036}
1037template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1038HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1039 return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
1040}
1041
1043 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1044}
1045
1047 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1048}
1049
1050// ------------------------------ Strict inequality
1051
1053 return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
1054}
1056 return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
1057}
1059 return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
1060}
1062 return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
1063}
1064
1066 return Mask256<uint8_t>{_mm256_cmpgt_epu8_mask(a.raw, b.raw)};
1067}
1069 const Vec256<uint16_t> b) {
1070 return Mask256<uint16_t>{_mm256_cmpgt_epu16_mask(a.raw, b.raw)};
1071}
1073 const Vec256<uint32_t> b) {
1074 return Mask256<uint32_t>{_mm256_cmpgt_epu32_mask(a.raw, b.raw)};
1075}
1077 const Vec256<uint64_t> b) {
1078 return Mask256<uint64_t>{_mm256_cmpgt_epu64_mask(a.raw, b.raw)};
1079}
1080
1082 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1083}
1085 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1086}
1087
1088// ------------------------------ Weak inequality
1089
1091 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1092}
1094 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1095}
1096
1097// ------------------------------ Mask
1098
1099namespace detail {
1100
1101template <typename T>
1102HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256<T> v) {
1103 return Mask256<T>{_mm256_movepi8_mask(v.raw)};
1104}
1105template <typename T>
1106HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256<T> v) {
1107 return Mask256<T>{_mm256_movepi16_mask(v.raw)};
1108}
1109template <typename T>
1110HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256<T> v) {
1111 return Mask256<T>{_mm256_movepi32_mask(v.raw)};
1112}
1113template <typename T>
1114HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256<T> v) {
1115 return Mask256<T>{_mm256_movepi64_mask(v.raw)};
1116}
1117
1118} // namespace detail
1119
1120template <typename T>
1121HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
1122 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1123}
1124// There do not seem to be native floating-point versions of these instructions.
1131
1132template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1133HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1134 return Vec256<T>{_mm256_movm_epi8(v.raw)};
1135}
1136
1137template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1138HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1139 return Vec256<T>{_mm256_movm_epi16(v.raw)};
1140}
1141
1142template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1143HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1144 return Vec256<T>{_mm256_movm_epi32(v.raw)};
1145}
1146
1147template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1148HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1149 return Vec256<T>{_mm256_movm_epi64(v.raw)};
1150}
1151
1152HWY_API Vec256<float> VecFromMask(const Mask256<float> v) {
1153 return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};
1154}
1155
1156HWY_API Vec256<double> VecFromMask(const Mask256<double> v) {
1157 return Vec256<double>{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))};
1158}
1159
1160template <typename T>
1161HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
1162 return VecFromMask(v);
1163}
1164
1165#else // AVX2
1166
1167// Comparisons fill a lane with 1-bits if the condition is true, else 0.
1168
1169template <typename TFrom, typename TTo>
1170HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
1171 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1172 return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
1173}
1174
1175template <typename T>
1176HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
1177 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1178 return (v & bit) == bit;
1179}
1180
1181// ------------------------------ Equality
1182
1183template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1184HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1185 return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
1186}
1187
1188template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1189HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1190 return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
1191}
1192
1193template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1194HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1195 return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
1196}
1197
1198template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1199HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1200 return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
1201}
1202
1203HWY_API Mask256<float> operator==(const Vec256<float> a,
1204 const Vec256<float> b) {
1205 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
1206}
1207
1208HWY_API Mask256<double> operator==(const Vec256<double> a,
1209 const Vec256<double> b) {
1210 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
1211}
1212
1213// ------------------------------ Inequality
1214
1215template <typename T>
1216HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1217 return Not(a == b);
1218}
1219HWY_API Mask256<float> operator!=(const Vec256<float> a,
1220 const Vec256<float> b) {
1221 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
1222}
1223HWY_API Mask256<double> operator!=(const Vec256<double> a,
1224 const Vec256<double> b) {
1225 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
1226}
1227
1228// ------------------------------ Strict inequality
1229
1230// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
1231namespace detail {
1232
1233// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
1234// to perform an unsigned comparison instead of the intended signed. Workaround
1235// is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
1236#if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
1237#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
1238#else
1239#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
1240#endif
1241
1242HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a,
1243 Vec256<int8_t> b) {
1244#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
1245 using i8x32 = signed char __attribute__((__vector_size__(32)));
1246 return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
1247 reinterpret_cast<i8x32>(b.raw))};
1248#else
1249 return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
1250#endif
1251}
1252HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a,
1253 Vec256<int16_t> b) {
1254 return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
1255}
1256HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a,
1257 Vec256<int32_t> b) {
1258 return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
1259}
1260HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a,
1261 Vec256<int64_t> b) {
1262 return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
1263}
1264
1265template <typename T>
1266HWY_INLINE Mask256<T> Gt(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
1267 const Full256<T> du;
1268 const RebindToSigned<decltype(du)> di;
1269 const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1270 return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
1271}
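// Editorial note (not in the upstream header): XOR-ing both operands with the
// MSB maps unsigned order onto signed order, for which AVX2 has native
// compares. Worked 8-bit example: a=0xFF (255), b=0x01 (1); a^0x80 = 0x7F
// (+127) and b^0x80 = 0x81 (-127), and +127 > -127 matches 255 > 1.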
1272
1273HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a,
1274 Vec256<float> b) {
1275 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
1276}
1277HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a,
1278 Vec256<double> b) {
1279 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
1280}
1281
1282} // namespace detail
1283
1284template <typename T>
1285HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
1286 return detail::Gt(hwy::TypeTag<T>(), a, b);
1287}
1288
1289// ------------------------------ Weak inequality
1290
1291HWY_API Mask256<float> operator>=(const Vec256<float> a,
1292 const Vec256<float> b) {
1293 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
1294}
1295HWY_API Mask256<double> operator>=(const Vec256<double> a,
1296 const Vec256<double> b) {
1297 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
1298}
1299
1300#endif // HWY_TARGET <= HWY_AVX3
1301
1302// ------------------------------ Reversed comparisons
1303
1304template <typename T>
1305HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
1306 return b > a;
1307}
1308
1309template <typename T>
1310HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
1311 return b >= a;
1312}
1313
1314// ------------------------------ Min (Gt, IfThenElse)
1315
1316// Unsigned
1318 return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
1319}
1321 const Vec256<uint16_t> b) {
1322 return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
1323}
1325 const Vec256<uint32_t> b) {
1326 return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
1327}
1329 const Vec256<uint64_t> b) {
1330#if HWY_TARGET <= HWY_AVX3
1331 return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
1332#else
1333 const Full256<uint64_t> du;
1334 const Full256<int64_t> di;
1335 const auto msb = Set(du, 1ull << 63);
1336 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1337 return IfThenElse(gt, b, a);
1338#endif
1339}
1340
1341// Signed
1343 return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
1344}
1346 return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
1347}
1349 return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
1350}
1352#if HWY_TARGET <= HWY_AVX3
1353 return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
1354#else
1355 return IfThenElse(a < b, a, b);
1356#endif
1357}
1358
1359// Float
1361 return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};
1362}
1364 return Vec256<double>{_mm256_min_pd(a.raw, b.raw)};
1365}
1366
1367// ------------------------------ Max (Gt, IfThenElse)
1368
1369// Unsigned
1371 return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
1372}
1374 const Vec256<uint16_t> b) {
1375 return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
1376}
1378 const Vec256<uint32_t> b) {
1379 return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
1380}
1382 const Vec256<uint64_t> b) {
1383#if HWY_TARGET <= HWY_AVX3
1384 return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
1385#else
1386 const Full256<uint64_t> du;
1387 const Full256<int64_t> di;
1388 const auto msb = Set(du, 1ull << 63);
1389 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1390 return IfThenElse(gt, a, b);
1391#endif
1392}
1393
1394// Signed
1396 return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
1397}
1399 return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
1400}
1402 return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
1403}
1405#if HWY_TARGET <= HWY_AVX3
1406 return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
1407#else
1408 return IfThenElse(a < b, b, a);
1409#endif
1410}
1411
1412// Float
1414 return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};
1415}
1417 return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
1418}
1419
1420// ------------------------------ FirstN (Iota, Lt)
1421
1422template <typename T>
1423HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
1424#if HWY_TARGET <= HWY_AVX3
1425 (void)d;
1426 constexpr size_t N = 32 / sizeof(T);
1427#if HWY_ARCH_X86_64
1428 const uint64_t all = (1ull << N) - 1;
1429 // BZHI only looks at the lower 8 bits of n!
1430 return Mask256<T>::FromBits((n > 255) ? all : _bzhi_u64(all, n));
1431#else
1432 const uint32_t all = static_cast<uint32_t>((1ull << N) - 1);
1433 // BZHI only looks at the lower 8 bits of n!
1434 return Mask256<T>::FromBits(
1435 (n > 255) ? all : _bzhi_u32(all, static_cast<uint32_t>(n)));
1436#endif // HWY_ARCH_X86_64
1437#else
1438 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1439 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
1440#endif
1441}
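// Example (editorial addition, not in the upstream header): FirstN is the
// usual tool for the final, partial loop iteration. The template and its name
// are ours, purely illustrative.
template <typename T>
HWY_INLINE Vec256<T> ExampleZeroPadRemainder(Vec256<T> v, size_t remaining) {
  const Full256<T> d;
  // Keep the first `remaining` lanes of v, zero the rest.
  return IfThenElseZero(FirstN(d, remaining), v);
}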
1442
1443// ================================================== ARITHMETIC
1444
1445// ------------------------------ Addition
1446
1447// Unsigned
1449 const Vec256<uint8_t> b) {
1450 return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
1451}
1453 const Vec256<uint16_t> b) {
1454 return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)};
1455}
1457 const Vec256<uint32_t> b) {
1458 return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)};
1459}
1461 const Vec256<uint64_t> b) {
1462 return Vec256<uint64_t>{_mm256_add_epi64(a.raw, b.raw)};
1463}
1464
1465// Signed
1467 const Vec256<int8_t> b) {
1468 return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
1469}
1471 const Vec256<int16_t> b) {
1472 return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
1473}
1475 const Vec256<int32_t> b) {
1476 return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
1477}
1479 const Vec256<int64_t> b) {
1480 return Vec256<int64_t>{_mm256_add_epi64(a.raw, b.raw)};
1481}
1482
1483// Float
1485 return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};
1486}
1488 const Vec256<double> b) {
1489 return Vec256<double>{_mm256_add_pd(a.raw, b.raw)};
1490}
1491
1492// ------------------------------ Subtraction
1493
1494// Unsigned
1496 const Vec256<uint8_t> b) {
1497 return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1498}
1500 const Vec256<uint16_t> b) {
1501 return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1502}
1504 const Vec256<uint32_t> b) {
1505 return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1506}
1508 const Vec256<uint64_t> b) {
1509 return Vec256<uint64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1510}
1511
1512// Signed
1514 const Vec256<int8_t> b) {
1515 return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1516}
1518 const Vec256<int16_t> b) {
1519 return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1520}
1522 const Vec256<int32_t> b) {
1523 return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1524}
1526 const Vec256<int64_t> b) {
1527 return Vec256<int64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1528}
1529
1530// Float
1532 return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};
1533}
1535 const Vec256<double> b) {
1536 return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
1537}
1538
1539// ------------------------------ SumsOf8
1540HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
1541 return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
1542}
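// Editorial note (not in the upstream header): _mm256_sad_epu8 against zero
// sums each group of 8 consecutive bytes into the corresponding u64 lane, e.g.
// SumsOf8(Set(Full256<uint8_t>(), 1)) yields 8 in each of the four u64 lanes.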
1543
1544// ------------------------------ SaturatedAdd
1545
1546// Returns a + b clamped to the destination range.
1547
1548// Unsigned
1550 const Vec256<uint8_t> b) {
1551 return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)};
1552}
1554 const Vec256<uint16_t> b) {
1555 return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)};
1556}
1557
1558// Signed
1560 const Vec256<int8_t> b) {
1561 return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
1562}
1564 const Vec256<int16_t> b) {
1565 return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
1566}
1567
1568// ------------------------------ SaturatedSub
1569
1570// Returns a - b clamped to the destination range.
1571
1572// Unsigned
1574 const Vec256<uint8_t> b) {
1575 return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)};
1576}
1578 const Vec256<uint16_t> b) {
1579 return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)};
1580}
1581
1582// Signed
1584 const Vec256<int8_t> b) {
1585 return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
1586}
1588 const Vec256<int16_t> b) {
1589 return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
1590}
1591
1592// ------------------------------ Average
1593
1594// Returns (a + b + 1) / 2
1595
1596// Unsigned
1598 const Vec256<uint8_t> b) {
1599 return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)};
1600}
1602 const Vec256<uint16_t> b) {
1603 return Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)};
1604}
1605
1606// ------------------------------ Abs (Sub)
1607
1608// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1610#if HWY_COMPILER_MSVC
1611 // Workaround for incorrect codegen? (wrong result)
1612 const auto zero = Zero(Full256<int8_t>());
1613 return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
1614#else
1615 return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
1616#endif
1617}
1619 return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
1620}
1622 return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
1623}
1624// i64 is implemented after BroadcastSignBit.
1625
1627 const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
1628 return v & BitCast(Full256<float>(), mask);
1629}
1631 const Vec256<int64_t> mask{_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
1632 return v & BitCast(Full256<double>(), mask);
1633}
1634
1635// ------------------------------ Integer multiplication
1636
1637// Unsigned
1639 return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1640}
1642 return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1643}
1644
1645// Signed
1647 return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1648}
1650 return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1651}
1652
1653// Returns the upper 16 bits of a * b in each lane.
1655 return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
1656}
1658 return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
1659}
1660
1662 return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)};
1663}
1664
1665// Multiplies even lanes (0, 2 ..) and places the double-wide result into
1666// even and the upper half into its odd neighbor lane.
1667HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
1668 return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
1669}
1670HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1671 return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
1672}
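// Editorial note (not in the upstream header): only the even 32-bit lanes
// (0, 2, 4, 6) participate; the full 64-bit product of 32-bit lanes 2*i of a
// and b is written to 64-bit lane i. E.g. with a = b = Set(du32, 0x10000),
// every u64 output lane is 0x1'0000'0000.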
1673
1674// ------------------------------ ShiftLeft
1675
1676template <int kBits>
1678 return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)};
1679}
1680
1681template <int kBits>
1683 return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)};
1684}
1685
1686template <int kBits>
1688 return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, kBits)};
1689}
1690
1691template <int kBits>
1693 return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)};
1694}
1695
1696template <int kBits>
1698 return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)};
1699}
1700
1701template <int kBits>
1703 return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
1704}
1705
1706template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
1708 const Full256<T> d8;
1709 const RepartitionToWide<decltype(d8)> d16;
1710 const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
1711 return kBits == 1
1712 ? (v + v)
1713 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1714}
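// Editorial note (not in the upstream header): x86 has no 8-bit shift, so the
// bytes are shifted as 16-bit lanes and the bits that crossed in from the
// lower byte are masked off. E.g. for kBits=3 the mask is
// (0xFF << 3) & 0xFF = 0xF8, clearing the 3 low bits of each byte.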
1715
1716// ------------------------------ ShiftRight
1717
1718template <int kBits>
1720 return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)};
1721}
1722
1723template <int kBits>
1725 return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)};
1726}
1727
1728template <int kBits>
1730 return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, kBits)};
1731}
1732
1733template <int kBits>
1735 const Full256<uint8_t> d8;
1736 // Use raw instead of BitCast to support N=1.
1737 const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
1738 return shifted & Set(d8, 0xFF >> kBits);
1739}
1740
1741template <int kBits>
1743 return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
1744}
1745
1746template <int kBits>
1748 return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
1749}
1750
1751template <int kBits>
1753 const Full256<int8_t> di;
1754 const Full256<uint8_t> du;
1755 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1756 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1757 return (shifted ^ shifted_sign) - shifted_sign;
1758}
1759
1760// i64 is implemented after BroadcastSignBit.
1761
1762// ------------------------------ RotateRight
1763
1764template <int kBits>
1766 static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
1767#if HWY_TARGET <= HWY_AVX3
1768 return Vec256<uint32_t>{_mm256_ror_epi32(v.raw, kBits)};
1769#else
1770 if (kBits == 0) return v;
1771 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
1772#endif
1773}
1774
1775template <int kBits>
1777 static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
1778#if HWY_TARGET <= HWY_AVX3
1779 return Vec256<uint64_t>{_mm256_ror_epi64(v.raw, kBits)};
1780#else
1781 if (kBits == 0) return v;
1782 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
1783#endif
1784}
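// Editorial note (not in the upstream header): e.g. RotateRight<8> applied to
// lanes holding 0x12345678 yields 0x78123456. In the fallback, HWY_MIN merely
// keeps the left-shift count in range when kBits == 0, a case that already
// returned early above.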
1785
1786// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1787
1788HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
1789 return VecFromMask(v < Zero(Full256<int8_t>()));
1790}
1791
1792HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
1793 return ShiftRight<15>(v);
1794}
1795
1796HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
1797 return ShiftRight<31>(v);
1798}
1799
1800HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
1801#if HWY_TARGET == HWY_AVX2
1802 return VecFromMask(v < Zero(Full256<int64_t>()));
1803#else
1804 return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
1805#endif
1806}
1807
1808template <int kBits>
1809HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
1810#if HWY_TARGET <= HWY_AVX3
1811 return Vec256<int64_t>{_mm256_srai_epi64(v.raw, kBits)};
1812#else
1813 const Full256<int64_t> di;
1814 const Full256<uint64_t> du;
1815 const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1816 const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
1817 return right | sign;
1818#endif
1819}
1820
1821HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
1822#if HWY_TARGET <= HWY_AVX3
1823 return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
1824#else
1825 const auto zero = Zero(Full256<int64_t>());
1826 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1827#endif
1828}
1829
1830// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
1831HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
1832 Vec256<int8_t> no) {
1833 // int8: AVX2 IfThenElse only looks at the MSB.
1834 return IfThenElse(MaskFromVec(v), yes, no);
1835}
1836
1837template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1838HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1839 static_assert(IsSigned<T>(), "Only works for signed/float");
1840 const Full256<T> d;
1841 const RebindToSigned<decltype(d)> di;
1842
1843 // 16-bit: no native blendv, so copy sign to lower byte's MSB.
1844 v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
1845 return IfThenElse(MaskFromVec(v), yes, no);
1846}
1847
1848template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
1849HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1850 static_assert(IsSigned<T>(), "Only works for signed/float");
1851 const Full256<T> d;
1852 const RebindToFloat<decltype(d)> df;
1853
1854 // 32/64-bit: use float IfThenElse, which only looks at the MSB.
1855 const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
1856 return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
1857}
1858
1859// ------------------------------ ShiftLeftSame
1860
1862 const int bits) {
1863 return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1864}
1866 const int bits) {
1867 return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1868}
1870 const int bits) {
1871 return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1872}
1873
1875 return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1876}
1877
1879 return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1880}
1881
1883 return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1884}
1885
1886template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1888 const Full256<T> d8;
1889 const RepartitionToWide<decltype(d8)> d16;
1890 const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
1891 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
1892}
1893
1894// ------------------------------ ShiftRightSame (BroadcastSignBit)
1895
1897 const int bits) {
1898 return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1899}
1901 const int bits) {
1902 return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1903}
1905 const int bits) {
1906 return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1907}
1908
1910 const Full256<uint8_t> d8;
1911 const RepartitionToWide<decltype(d8)> d16;
1912 const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
1913 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
1914}
1915
1917 const int bits) {
1918 return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1919}
1920
1922 const int bits) {
1923 return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1924}
1926 const int bits) {
1927#if HWY_TARGET <= HWY_AVX3
1928 return Vec256<int64_t>{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1929#else
1930 const Full256<int64_t> di;
1931 const Full256<uint64_t> du;
1932 const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1933 const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
1934 return right | sign;
1935#endif
1936}
1937
1939 const Full256<int8_t> di;
1940 const Full256<uint8_t> du;
1941 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1942 const auto shifted_sign =
1943 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1944 return (shifted ^ shifted_sign) - shifted_sign;
1945}
1946
1947// ------------------------------ Neg (Xor, Sub)
1948
1949// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
1950namespace detail {
1951
1952template <typename T>
1954 return Xor(v, SignBit(Full256<T>()));
1955}
1956
1957// Not floating-point
1958template <typename T>
1960 return Zero(Full256<T>()) - v;
1961}
1962
1963} // namespace detail
1964
1965template <typename T>
1966HWY_API Vec256<T> Neg(const Vec256<T> v) {
1967 return detail::Neg(hwy::IsFloatTag<T>(), v);
1968}
1969
1970// ------------------------------ Floating-point mul / div
1971
1973 return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
1974}
1976 const Vec256<double> b) {
1977 return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
1978}
1979
1981 return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
1982}
1984 const Vec256<double> b) {
1985 return Vec256<double>{_mm256_div_pd(a.raw, b.raw)};
1986}
1987
1988// Approximate reciprocal
1989HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
1990 return Vec256<float>{_mm256_rcp_ps(v.raw)};
1991}
1992
1993// Absolute value of difference.
1994HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
1995 return Abs(a - b);
1996}
1997
1998// ------------------------------ Floating-point multiply-add variants
1999
2000// Returns mul * x + add
2001HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
2002 const Vec256<float> add) {
2003#ifdef HWY_DISABLE_BMI2_FMA
2004 return mul * x + add;
2005#else
2006 return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
2007#endif
2008}
2010 const Vec256<double> add) {
2011#ifdef HWY_DISABLE_BMI2_FMA
2012 return mul * x + add;
2013#else
2014 return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
2015#endif
2016}
2017
2018// Returns add - mul * x
2019HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
2020 const Vec256<float> add) {
2021#ifdef HWY_DISABLE_BMI2_FMA
2022 return add - mul * x;
2023#else
2024 return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
2025#endif
2026}
2028 const Vec256<double> x,
2029 const Vec256<double> add) {
2030#ifdef HWY_DISABLE_BMI2_FMA
2031 return add - mul * x;
2032#else
2033 return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
2034#endif
2035}
2036
2037// Returns mul * x - sub
2038HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
2039 const Vec256<float> sub) {
2040#ifdef HWY_DISABLE_BMI2_FMA
2041 return mul * x - sub;
2042#else
2043 return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
2044#endif
2045}
2047 const Vec256<double> sub) {
2048#ifdef HWY_DISABLE_BMI2_FMA
2049 return mul * x - sub;
2050#else
2051 return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
2052#endif
2053}
2054
2055// Returns -mul * x - sub
2056HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
2057 const Vec256<float> sub) {
2058#ifdef HWY_DISABLE_BMI2_FMA
2059 return Neg(mul * x) - sub;
2060#else
2061 return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2062#endif
2063}
2065 const Vec256<double> x,
2066 const Vec256<double> sub) {
2067#ifdef HWY_DISABLE_BMI2_FMA
2068 return Neg(mul * x) - sub;
2069#else
2070 return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
2071#endif
2072}
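// Example (minimal usage sketch, not part of this file; assumes the usual
// hwy/highway.h setup with these ops in scope and a placeholder function
// name): evaluating c2*x^2 + c1*x + c0 with two fused multiply-adds per lane.
//   HWY_ATTR void EvalPoly(const float* HWY_RESTRICT in,
//                          float* HWY_RESTRICT out, size_t n) {
//     const Full256<float> d;
//     const auto c2 = Set(d, 0.5f), c1 = Set(d, -1.0f), c0 = Set(d, 2.0f);
//     for (size_t i = 0; i + Lanes(d) <= n; i += Lanes(d)) {
//       const auto x = LoadU(d, in + i);
//       StoreU(MulAdd(MulAdd(c2, x, c1), x, c0), d, out + i);  // (c2*x+c1)*x+c0
//     }
//   }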
2073
2074// ------------------------------ Floating-point square root
2075
2076// Full precision square root
2077HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
2078 return Vec256<float>{_mm256_sqrt_ps(v.raw)};
2079}
2080HWY_API Vec256<double> Sqrt(const Vec256<double> v) {
2081 return Vec256<double>{_mm256_sqrt_pd(v.raw)};
2082}
2083
2084// Approximate reciprocal square root
2085HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
2086 return Vec256<float>{_mm256_rsqrt_ps(v.raw)};
2087}
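// Example (minimal usage sketch, not part of this file; helper name is a
// placeholder): one Newton-Raphson step refines the low-precision estimate
// returned by ApproximateReciprocalSqrt.
//   HWY_ATTR Vec256<float> RefinedRSqrt(Vec256<float> v) {
//     const Full256<float> d;
//     const auto y = ApproximateReciprocalSqrt(v);  // y ~= 1/sqrt(v)
//     // y * (1.5 - 0.5*v*y*y), i.e. one Newton-Raphson iteration.
//     return y * MulAdd(Set(d, -0.5f) * v, y * y, Set(d, 1.5f));
//   }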
2088
2089// ------------------------------ Floating-point rounding
2090
2091// Toward nearest integer, tie to even
2092HWY_API Vec256<float> Round(const Vec256<float> v) {
2093 return Vec256<float>{
2094 _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2095}
2096HWY_API Vec256<double> Round(const Vec256<double> v) {
2097 return Vec256<double>{
2098 _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
2099}
2100
2101// Toward zero, aka truncate
2102HWY_API Vec256<float> Trunc(const Vec256<float> v) {
2103 return Vec256<float>{
2104 _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2105}
2106HWY_API Vec256<double> Trunc(const Vec256<double> v) {
2107 return Vec256<double>{
2108 _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2109}
2110
2111// Toward +infinity, aka ceiling
2112HWY_API Vec256<float> Ceil(const Vec256<float> v) {
2113 return Vec256<float>{
2114 _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2115}
2116HWY_API Vec256<double> Ceil(const Vec256<double> v) {
2117 return Vec256<double>{
2118 _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2119}
2120
2121// Toward -infinity, aka floor
2122HWY_API Vec256<float> Floor(const Vec256<float> v) {
2123 return Vec256<float>{
2124 _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2125}
2126HWY_API Vec256<double> Floor(const Vec256<double> v) {
2127 return Vec256<double>{
2128 _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2129}
2130
2131// ------------------------------ Floating-point classification
2132
2133HWY_API Mask256<float> IsNaN(const Vec256<float> v) {
2134#if HWY_TARGET <= HWY_AVX3
2135 return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x81)};
2136#else
2137 return Mask256<float>{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)};
2138#endif
2139}
2140HWY_API Mask256<double> IsNaN(const Vec256<double> v) {
2141#if HWY_TARGET <= HWY_AVX3
2142 return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x81)};
2143#else
2144 return Mask256<double>{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)};
2145#endif
2146}
2147
2148#if HWY_TARGET <= HWY_AVX3
2149
2150HWY_API Mask256<float> IsInf(const Vec256<float> v) {
2151 return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x18)};
2152}
2153HWY_API Mask256<double> IsInf(const Vec256<double> v) {
2154 return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x18)};
2155}
2156
2157HWY_API Mask256<float> IsFinite(const Vec256<float> v) {
2158 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
2159 // and negate the mask.
2160 return Not(Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x99)});
2161}
2162HWY_API Mask256<double> IsFinite(const Vec256<double> v) {
2163 return Not(Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x99)});
2164}
2165
2166#else
2167
2168template <typename T>
2169HWY_API Mask256<T> IsInf(const Vec256<T> v) {
2170 static_assert(IsFloat<T>(), "Only for float");
2171 const Full256<T> d;
2172 const RebindToSigned<decltype(d)> di;
2173 const VFromD<decltype(di)> vi = BitCast(di, v);
2174 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
2175 return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
2176}
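// Worked example of the check above (illustrative, not from the original):
// for float, +inf is 0x7F800000; Add(vi, vi) doubles it to 0xFF000000, which
// equals MaxExponentTimes2<float>(). A NaN such as 0x7FC00000 doubles to
// 0xFF800000, and any finite value has a smaller exponent field, so neither
// compares equal. Doubling also discards the sign bit, which is why -inf is
// detected as well.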
2177
2178// Returns whether normal/subnormal/zero.
2179template <typename T>
2180HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
2181 static_assert(IsFloat<T>(), "Only for float");
2182 const Full256<T> d;
2183 const RebindToUnsigned<decltype(d)> du;
2184 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
2185 const VFromD<decltype(du)> vu = BitCast(du, v);
2186 // Shift left to clear the sign bit, then right so we can compare with the
2187 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
2188 // negative and non-negative floats would be greater). MSVC seems to generate
2189 // incorrect code if we instead add vu + vu.
2190 const VFromD<decltype(di)> exp =
2191 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
2192 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
2193}
2194
2195#endif // HWY_TARGET <= HWY_AVX3
2196
2197// ================================================== MEMORY
2198
2199// ------------------------------ Load
2200
2201template <typename T>
2202HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
2203 return Vec256<T>{
2204 _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
2205}
2206HWY_API Vec256<float> Load(Full256<float> /* tag */,
2207 const float* HWY_RESTRICT aligned) {
2208 return Vec256<float>{_mm256_load_ps(aligned)};
2209}
2210HWY_API Vec256<double> Load(Full256<double> /* tag */,
2211 const double* HWY_RESTRICT aligned) {
2212 return Vec256<double>{_mm256_load_pd(aligned)};
2213}
2214
2215template <typename T>
2216HWY_API Vec256<T> LoadU(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
2217 return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
2218}
2219HWY_API Vec256<float> LoadU(Full256<float> /* tag */,
2220 const float* HWY_RESTRICT p) {
2221 return Vec256<float>{_mm256_loadu_ps(p)};
2222}
2223HWY_API Vec256<double> LoadU(Full256<double> /* tag */,
2224 const double* HWY_RESTRICT p) {
2225 return Vec256<double>{_mm256_loadu_pd(p)};
2226}
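// Example (minimal usage sketch, not part of this file): Load/Store require a
// 32-byte aligned address, e.g. from hwy::AllocateAligned declared in
// hwy/aligned_allocator.h; LoadU/StoreU accept any address.
//   const Full256<float> d;
//   auto buf = hwy::AllocateAligned<float>(2 * Lanes(d));  // 32-byte aligned
//   // ... fill buf ...
//   const auto v0 = Load(d, buf.get());       // aligned load
//   const auto v1 = LoadU(d, buf.get() + 1);  // misaligned address is fine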
2227
2228// ------------------------------ MaskedLoad
2229
2230#if HWY_TARGET <= HWY_AVX3
2231
2232template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2233HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2234 const T* HWY_RESTRICT p) {
2235 return Vec256<T>{_mm256_maskz_loadu_epi8(m.raw, p)};
2236}
2237
2238template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2239HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2240 const T* HWY_RESTRICT p) {
2241 return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};
2242}
2243
2244template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2245HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2246 const T* HWY_RESTRICT p) {
2247 return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};
2248}
2249
2250template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2251HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2252 const T* HWY_RESTRICT p) {
2253 return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};
2254}
2255
2256HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> /* tag */,
2257 const float* HWY_RESTRICT p) {
2258 return Vec256<float>{_mm256_maskz_loadu_ps(m.raw, p)};
2259}
2260
2261HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> /* tag */,
2262 const double* HWY_RESTRICT p) {
2263 return Vec256<double>{_mm256_maskz_loadu_pd(m.raw, p)};
2264}
2265
2266#else // AVX2
2267
2268// There is no maskload_epi8/16, so blend instead.
2269template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2270HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
2271 const T* HWY_RESTRICT p) {
2272 return IfThenElseZero(m, LoadU(d, p));
2273}
2274
2275template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2276HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2277 const T* HWY_RESTRICT p) {
2278 auto pi = reinterpret_cast<const int*>(p); // NOLINT
2279 return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};
2280}
2281
2282template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2283HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2284 const T* HWY_RESTRICT p) {
2285 auto pi = reinterpret_cast<const long long*>(p); // NOLINT
2286 return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};
2287}
2288
2289HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> d,
2290 const float* HWY_RESTRICT p) {
2291 const Vec256<int32_t> mi =
2292 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2293 return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
2294}
2295
2296HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> d,
2297 const double* HWY_RESTRICT p) {
2298 const Vec256<int64_t> mi =
2299 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2300 return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
2301}
2302
2303#endif
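// Example (minimal usage sketch, not part of this file; helper name is a
// placeholder): MaskedLoad paired with FirstN handles the remainder of a loop
// without reading past the end of the array (4/8-byte lanes use maskload on
// AVX2, which does not fault on masked-off lanes).
//   HWY_ATTR float SumArray(const float* HWY_RESTRICT p, size_t n) {
//     const Full256<float> d;
//     auto sum = Zero(d);
//     size_t i = 0;
//     for (; i + Lanes(d) <= n; i += Lanes(d)) sum = Add(sum, LoadU(d, p + i));
//     if (i < n) sum = Add(sum, MaskedLoad(FirstN(d, n - i), d, p + i));
//     return GetLane(SumOfLanes(d, sum));
//   }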
2304
2305// ------------------------------ LoadDup128
2306
2307// Loads 128 bits and duplicates them into both 128-bit halves. This avoids the
2308// 3-cycle cost of moving data between 128-bit halves and avoids port 5.
2309template <typename T>
2310HWY_API Vec256<T> LoadDup128(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
2311#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
2312 // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
2313 // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
2314 // upper half undefined) is fine because we're overwriting that anyway.
2315 // This workaround seems in turn to generate incorrect code in MSVC 2022
2316 // (19.31), so use broadcastsi128 there.
2317 const __m128i v128 = LoadU(Full128<T>(), p).raw;
2318 return Vec256<T>{
2319 _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
2320#else
2321 return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(Full128<T>(), p).raw)};
2322#endif
2323}
2324HWY_API Vec256<float> LoadDup128(Full256<float> /* tag */,
2325 const float* const HWY_RESTRICT p) {
2326#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
2327 const __m128 v128 = LoadU(Full128<float>(), p).raw;
2328 return Vec256<float>{
2329 _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
2330#else
2331 return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
2332#endif
2333}
2334HWY_API Vec256<double> LoadDup128(Full256<double> /* tag */,
2335 const double* const HWY_RESTRICT p) {
2336#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
2337 const __m128d v128 = LoadU(Full128<double>(), p).raw;
2338 return Vec256<double>{
2339 _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
2340#else
2341 return Vec256<double>{
2342 _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
2343#endif
2344}
2345
2346// ------------------------------ Store
2347
2348template <typename T>
2349HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
2350 _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2351}
2353 float* HWY_RESTRICT aligned) {
2354 _mm256_store_ps(aligned, v.raw);
2355}
2357 double* HWY_RESTRICT aligned) {
2358 _mm256_store_pd(aligned, v.raw);
2359}
2360
2361template <typename T>
2362HWY_API void StoreU(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT p) {
2363 _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
2364}
2366 float* HWY_RESTRICT p) {
2367 _mm256_storeu_ps(p, v.raw);
2368}
2370 double* HWY_RESTRICT p) {
2371 _mm256_storeu_pd(p, v.raw);
2372}
2373
2374// ------------------------------ BlendedStore
2375
2376#if HWY_TARGET <= HWY_AVX3
2377
2378template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2380 T* HWY_RESTRICT p) {
2381 _mm256_mask_storeu_epi8(p, m.raw, v.raw);
2382}
2383
2384template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2385HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2386 T* HWY_RESTRICT p) {
2387 _mm256_mask_storeu_epi16(p, m.raw, v.raw);
2388}
2389
2390template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2391HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2392 T* HWY_RESTRICT p) {
2393 _mm256_mask_storeu_epi32(p, m.raw, v.raw);
2394}
2395
2396template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2397HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2398 T* HWY_RESTRICT p) {
2399 _mm256_mask_storeu_epi64(p, m.raw, v.raw);
2400}
2401
2403 Full256<float> /* tag */, float* HWY_RESTRICT p) {
2404 _mm256_mask_storeu_ps(p, m.raw, v.raw);
2405}
2406
2408 Full256<double> /* tag */, double* HWY_RESTRICT p) {
2409 _mm256_mask_storeu_pd(p, m.raw, v.raw);
2410}
2411
2412#else // AVX2
2413
2414// Intel SDM says "No AC# reported for any mask bit combinations". However, AMD
2415// allows AC# if "Alignment checking enabled and: 256-bit memory operand not
2416// 32-byte aligned". Fortunately AC# is not enabled by default and requires both
2417// OS support (CR0) and the application to set rflags.AC. We assume these remain
2418// disabled because x86/x64 code and compiler output often contain misaligned
2419// scalar accesses, which would also fault.
2420//
2421// Caveat: these are slow on AMD Jaguar/Bulldozer.
2422
2423template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2424HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
2425 T* HWY_RESTRICT p) {
2426 // There is no maskload_epi8/16. Blending is also unsafe because loading a
2427 // full vector that crosses the array end causes asan faults. Resort to scalar
2428 // code; the caller should instead use memcpy, assuming m is FirstN(d, n).
2429 const RebindToUnsigned<decltype(d)> du;
2430 using TU = TFromD<decltype(du)>;
2431 alignas(32) TU buf[32 / sizeof(T)];
2432 alignas(32) TU mask[32 / sizeof(T)];
2433 Store(BitCast(du, v), du, buf);
2434 Store(BitCast(du, VecFromMask(d, m)), du, mask);
2435 for (size_t i = 0; i < 32 / sizeof(T); ++i) {
2436 if (mask[i]) {
2437 CopySameSize(buf + i, p + i);
2438 }
2439 }
2440}
2441
2442template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2443HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2444 T* HWY_RESTRICT p) {
2445 auto pi = reinterpret_cast<int*>(p); // NOLINT
2446 _mm256_maskstore_epi32(pi, m.raw, v.raw);
2447}
2448
2449template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2450HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2451 T* HWY_RESTRICT p) {
2452 auto pi = reinterpret_cast<long long*>(p); // NOLINT
2453 _mm256_maskstore_epi64(pi, m.raw, v.raw);
2454}
2455
2456HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, Full256<float> d,
2457 float* HWY_RESTRICT p) {
2458 const Vec256<int32_t> mi =
2459 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2460 _mm256_maskstore_ps(p, mi.raw, v.raw);
2461}
2462
2463HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
2464 Full256<double> d, double* HWY_RESTRICT p) {
2465 const Vec256<int64_t> mi =
2466 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2467 _mm256_maskstore_pd(p, mi.raw, v.raw);
2468}
2469
2470#endif
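// Example (minimal usage sketch, not part of this file; helper name is a
// placeholder): BlendedStore with FirstN writes only the remaining lanes at
// the end of an array, complementing MaskedLoad above.
//   HWY_ATTR void Fill(float* HWY_RESTRICT p, size_t n, float value) {
//     const Full256<float> d;
//     const auto v = Set(d, value);
//     size_t i = 0;
//     for (; i + Lanes(d) <= n; i += Lanes(d)) StoreU(v, d, p + i);
//     if (i < n) BlendedStore(v, FirstN(d, n - i), d, p + i);
//   }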
2471
2472// ------------------------------ Non-temporal stores
2473
2474template <typename T>
2475HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
2476 T* HWY_RESTRICT aligned) {
2477 _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2478}
2480 float* HWY_RESTRICT aligned) {
2481 _mm256_stream_ps(aligned, v.raw);
2482}
2484 double* HWY_RESTRICT aligned) {
2485 _mm256_stream_pd(aligned, v.raw);
2486}
2487
2488// ------------------------------ Scatter
2489
2490// Work around warnings in the intrinsic definitions (passing -1 as a mask).
2491HWY_DIAGNOSTICS(push)
2492HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2493
2494#if HWY_TARGET <= HWY_AVX3
2495namespace detail {
2496
2497template <typename T>
2499 Full256<T> /* tag */, T* HWY_RESTRICT base,
2500 const Vec256<int32_t> offset) {
2501 _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
2502}
2503template <typename T>
2505 Full256<T> /* tag */, T* HWY_RESTRICT base,
2506 const Vec256<int32_t> index) {
2507 _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
2508}
2509
2510template <typename T>
2512 Full256<T> /* tag */, T* HWY_RESTRICT base,
2513 const Vec256<int64_t> offset) {
2514 _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
2515}
2516template <typename T>
2518 Full256<T> /* tag */, T* HWY_RESTRICT base,
2519 const Vec256<int64_t> index) {
2520 _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
2521}
2522
2523} // namespace detail
2524
2525template <typename T, typename Offset>
2526HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2527 const Vec256<Offset> offset) {
2528 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2529 return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2530}
2531template <typename T, typename Index>
2532HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2533 const Vec256<Index> index) {
2534 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2535 return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2536}
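// Example (minimal usage sketch, not part of this file; helper name is a
// placeholder): ScatterIndex writes lane i of v to base[index[i]]; the index
// lane type must have the same size as T (int32_t for float). Here base must
// hold at least 15 elements.
//   HWY_ATTR void SpreadEveryOther(float* HWY_RESTRICT base) {
//     const Full256<float> d;
//     const Full256<int32_t> di;
//     alignas(32) const int32_t idx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
//     ScatterIndex(Iota(d, 0.0f), d, base, Load(di, idx));  // base[idx[i]] = i
//   }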
2537
2539 float* HWY_RESTRICT base,
2540 const Vec256<int32_t> offset) {
2541 _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
2542}
2544 float* HWY_RESTRICT base,
2545 const Vec256<int32_t> index) {
2546 _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
2547}
2548
2550 double* HWY_RESTRICT base,
2551 const Vec256<int64_t> offset) {
2552 _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
2553}
2555 double* HWY_RESTRICT base,
2556 const Vec256<int64_t> index) {
2557 _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
2558}
2559
2560#else
2561
2562template <typename T, typename Offset>
2563HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2564 const Vec256<Offset> offset) {
2565 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2566
2567 constexpr size_t N = 32 / sizeof(T);
2568 alignas(32) T lanes[N];
2569 Store(v, d, lanes);
2570
2571 alignas(32) Offset offset_lanes[N];
2572 Store(offset, Full256<Offset>(), offset_lanes);
2573
2574 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
2575 for (size_t i = 0; i < N; ++i) {
2576 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2577 }
2578}
2579
2580template <typename T, typename Index>
2581HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2582 const Vec256<Index> index) {
2583 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2584
2585 constexpr size_t N = 32 / sizeof(T);
2586 alignas(32) T lanes[N];
2587 Store(v, d, lanes);
2588
2589 alignas(32) Index index_lanes[N];
2590 Store(index, Full256<Index>(), index_lanes);
2591
2592 for (size_t i = 0; i < N; ++i) {
2593 base[index_lanes[i]] = lanes[i];
2594 }
2595}
2596
2597#endif
2598
2599// ------------------------------ Gather
2600
2601namespace detail {
2602
2603template <typename T>
2605 Full256<T> /* tag */,
2606 const T* HWY_RESTRICT base,
2607 const Vec256<int32_t> offset) {
2608 return Vec256<T>{_mm256_i32gather_epi32(
2609 reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
2610}
2611template <typename T>
2613 Full256<T> /* tag */,
2614 const T* HWY_RESTRICT base,
2615 const Vec256<int32_t> index) {
2616 return Vec256<T>{_mm256_i32gather_epi32(
2617 reinterpret_cast<const int32_t*>(base), index.raw, 4)};
2618}
2619
2620template <typename T>
2622 Full256<T> /* tag */,
2623 const T* HWY_RESTRICT base,
2624 const Vec256<int64_t> offset) {
2625 return Vec256<T>{_mm256_i64gather_epi64(
2626 reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
2627}
2628template <typename T>
2630 Full256<T> /* tag */,
2631 const T* HWY_RESTRICT base,
2632 const Vec256<int64_t> index) {
2633 return Vec256<T>{_mm256_i64gather_epi64(
2634 reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
2635}
2636
2637} // namespace detail
2638
2639template <typename T, typename Offset>
2640HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
2641 const Vec256<Offset> offset) {
2642 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2643 return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2644}
2645template <typename T, typename Index>
2646HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
2647 const Vec256<Index> index) {
2648 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2649 return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2650}
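// Example (minimal usage sketch, not part of this file; helper name is a
// placeholder): GatherIndex loads base[index[i]] into lane i; indices are
// signed and must match the lane size (int32_t for float, int64_t for double).
//   HWY_ATTR Vec256<float> GatherEveryOther(const float* HWY_RESTRICT base) {
//     const Full256<float> d;
//     const Full256<int32_t> di;
//     alignas(32) const int32_t idx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
//     return GatherIndex(d, base, Load(di, idx));  // lanes = base[0],base[2],...
//   }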
2651
2653 const float* HWY_RESTRICT base,
2654 const Vec256<int32_t> offset) {
2655 return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)};
2656}
2658 const float* HWY_RESTRICT base,
2659 const Vec256<int32_t> index) {
2660 return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)};
2661}
2662
2664 const double* HWY_RESTRICT base,
2665 const Vec256<int64_t> offset) {
2666 return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)};
2667}
2669 const double* HWY_RESTRICT base,
2670 const Vec256<int64_t> index) {
2671 return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
2672}
2673
2674HWY_DIAGNOSTICS(pop)
2675
2676// ================================================== SWIZZLE
2677
2678// ------------------------------ LowerHalf
2679
2680template <typename T>
2681HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, Vec256<T> v) {
2682 return Vec128<T>{_mm256_castsi256_si128(v.raw)};
2683}
2685 return Vec128<float>{_mm256_castps256_ps128(v.raw)};
2686}
2688 return Vec128<double>{_mm256_castpd256_pd128(v.raw)};
2689}
2690
2691template <typename T>
2692HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
2693 return LowerHalf(Full128<T>(), v);
2694}
2695
2696// ------------------------------ UpperHalf
2697
2698template <typename T>
2699HWY_API Vec128<T> UpperHalf(Full128<T> /* tag */, Vec256<T> v) {
2700 return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)};
2701}
2703 return Vec128<float>{_mm256_extractf128_ps(v.raw, 1)};
2704}
2706 return Vec128<double>{_mm256_extractf128_pd(v.raw, 1)};
2707}
2708
2709// ------------------------------ ExtractLane (Store)
2710template <typename T>
2711HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
2712 const Full256<T> d;
2713 HWY_DASSERT(i < Lanes(d));
2714 alignas(32) T lanes[32 / sizeof(T)];
2715 Store(v, d, lanes);
2716 return lanes[i];
2717}
2718
2719// ------------------------------ InsertLane (Store)
2720template <typename T>
2721HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
2722 const Full256<T> d;
2723 HWY_DASSERT(i < Lanes(d));
2724 alignas(64) T lanes[64 / sizeof(T)];
2725 Store(v, d, lanes);
2726 lanes[i] = t;
2727 return Load(d, lanes);
2728}
2729
2730// ------------------------------ GetLane (LowerHalf)
2731template <typename T>
2732HWY_API T GetLane(const Vec256<T> v) {
2733 return GetLane(LowerHalf(v));
2734}
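// Example (minimal usage sketch, not part of this file): these lane accessors
// round-trip through a stack buffer (except GetLane), so prefer them outside
// of hot loops.
//   const Full256<int32_t> d;
//   const auto v = Iota(d, 0);                      // lanes 0,1,...,7
//   const int32_t first = GetLane(v);               // 0
//   const int32_t third = ExtractLane(v, 2);        // 2
//   const auto v2 = InsertLane(v, 2, int32_t{99});  // lanes 0,1,99,3,...,7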
2735
2736// ------------------------------ ZeroExtendVector
2737
2738// Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper
2739// bits undefined. Although it makes sense for them to be zero (VEX encoded
2740// 128-bit instructions zero the upper lanes to avoid large penalties), a
2741// compiler could decide to optimize out code that relies on this.
2742//
2743// The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
2744// zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. For
2745// older GCC, we can still obtain the desired code thanks to pattern
2746// recognition; note that the expensive insert instruction is not actually
2747// generated, see https://gcc.godbolt.org/z/1MKGaP.
2748
2749#if !defined(HWY_HAVE_ZEXT)
2750#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \
2751 (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
2752 (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000)
2753#define HWY_HAVE_ZEXT 1
2754#else
2755#define HWY_HAVE_ZEXT 0
2756#endif
2757#endif // defined(HWY_HAVE_ZEXT)
2758
2759template <typename T>
2760HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
2761#if HWY_HAVE_ZEXT
2762 return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
2763#else
2764 return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
2765#endif
2766}
2768 Vec128<float> lo) {
2769#if HWY_HAVE_ZEXT
2770 return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
2771#else
2772 return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
2773#endif
2774}
2776 Vec128<double> lo) {
2777#if HWY_HAVE_ZEXT
2778 return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
2779#else
2780 return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
2781#endif
2782}
2783
2784// ------------------------------ Combine
2785
2786template <typename T>
2787HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi, Vec128<T> lo) {
2788 const auto lo256 = ZeroExtendVector(d, lo);
2789 return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
2790}
2792 Vec128<float> lo) {
2793 const auto lo256 = ZeroExtendVector(d, lo);
2794 return Vec256<float>{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)};
2795}
2797 Vec128<double> lo) {
2798 const auto lo256 = ZeroExtendVector(d, lo);
2799 return Vec256<double>{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)};
2800}
2801
2802// ------------------------------ ShiftLeftBytes
2803
2804template <int kBytes, typename T>
2805HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, const Vec256<T> v) {
2806 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2807 // This is the same operation as _mm256_bslli_epi128.
2808 return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)};
2809}
2810
2811template <int kBytes, typename T>
2812HWY_API Vec256<T> ShiftLeftBytes(const Vec256<T> v) {
2813 return ShiftLeftBytes<kBytes>(Full256<T>(), v);
2814}
2815
2816// ------------------------------ ShiftLeftLanes
2817
2818template <int kLanes, typename T>
2819HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
2820 const Repartition<uint8_t, decltype(d)> d8;
2821 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2822}
2823
2824template <int kLanes, typename T>
2825HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
2826 return ShiftLeftLanes<kLanes>(Full256<T>(), v);
2827}
2828
2829// ------------------------------ ShiftRightBytes
2830
2831template <int kBytes, typename T>
2832HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
2833 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2834 // This is the same operation as _mm256_bsrli_epi128.
2835 return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)};
2836}
2837
2838// ------------------------------ ShiftRightLanes
2839template <int kLanes, typename T>
2840HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
2841 const Repartition<uint8_t, decltype(d)> d8;
2842 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2843}
2844
2845// ------------------------------ CombineShiftRightBytes
2846
2847// Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes.
2848template <int kBytes, typename T, class V = Vec256<T>>
2849HWY_API V CombineShiftRightBytes(Full256<T> d, V hi, V lo) {
2850 const Repartition<uint8_t, decltype(d)> d8;
2851 return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(
2852 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2853}
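// Worked example (illustrative, not from the original): within each 128-bit
// block, the result consists of bytes [kBytes, kBytes + 15] of the
// concatenation hi:lo; blocks do not mix.
//   const Full256<uint32_t> d;
//   const auto lo = Iota(d, 0);  // 0..7
//   const auto hi = Iota(d, 8);  // 8..15
//   const auto r = CombineShiftRightBytes<4>(d, hi, lo);
//   // r = 1,2,3,8 in the lower block and 5,6,7,12 in the upper block.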
2854
2855// ------------------------------ Broadcast/splat any lane
2856
2857// Unsigned
2858template <int kLane>
2860 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2861 if (kLane < 4) {
2862 const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2863 return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)};
2864 } else {
2865 const __m256i hi =
2866 _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2867 return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)};
2868 }
2869}
2870template <int kLane>
2872 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2873 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2874}
2875template <int kLane>
2877 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2878 return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2879}
2880
2881// Signed
2882template <int kLane>
2884 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2885 if (kLane < 4) {
2886 const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2887 return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)};
2888 } else {
2889 const __m256i hi =
2890 _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2891 return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)};
2892 }
2893}
2894template <int kLane>
2896 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2897 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2898}
2899template <int kLane>
2901 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2902 return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2903}
2904
2905// Float
2906template <int kLane>
2908 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2909 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
2910}
2911template <int kLane>
2913 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2914 return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
2915}
2916
2917// ------------------------------ Hard-coded shuffles
2918
2919// Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2920// least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2921// right (the previous least-significant lane is now most-significant =>
2922// 47650321). These could also be implemented via CombineShiftRightBytes but
2923// the shuffle_abcd notation is more convenient.
2924
2925// Swap 32-bit halves in 64-bit halves.
2926template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2928 return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0xB1)};
2929}
2931 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
2932}
2933
2934// Used by generic_ops-inl.h
2935namespace detail {
2936
2937template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2938HWY_API Vec256<T> Shuffle2301(const Vec256<T> a, const Vec256<T> b) {
2939 const Full256<T> d;
2940 const RebindToFloat<decltype(d)> df;
2941 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
2942 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2943 BitCast(df, b).raw, m)});
2944}
2945template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2946HWY_API Vec256<T> Shuffle1230(const Vec256<T> a, const Vec256<T> b) {
2947 const Full256<T> d;
2948 const RebindToFloat<decltype(d)> df;
2949 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
2950 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2951 BitCast(df, b).raw, m)});
2952}
2953template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2954HWY_API Vec256<T> Shuffle3012(const Vec256<T> a, const Vec256<T> b) {
2955 const Full256<T> d;
2956 const RebindToFloat<decltype(d)> df;
2957 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
2958 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2959 BitCast(df, b).raw, m)});
2960}
2961
2962} // namespace detail
2963
2964// Swap 64-bit halves
2966 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2967}
2969 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2970}
2972 // Shorter encoding than _mm256_permute_ps.
2973 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)};
2974}
2976 return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2977}
2979 return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2980}
2982 // Shorter encoding than _mm256_permute_pd.
2983 return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 5)};
2984}
2985
2986// Rotate right 32 bits
2988 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2989}
2991 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2992}
2994 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x39)};
2995}
2996// Rotate left 32 bits
2998 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2999}
3001 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
3002}
3004 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x93)};
3005}
3006
3007// Reverse
3009 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
3010}
3012 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
3013}
3015 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)};
3016}
3017
3018// ------------------------------ TableLookupLanes
3019
3020// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
3021template <typename T>
3023 __m256i raw;
3024};
3025
3026// Native 8x32 instruction: indices remain unchanged
3027template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 4)>
3029 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3030#if HWY_IS_DEBUG_BUILD
3031 const Full256<TI> di;
3032 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3033 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(32 / sizeof(T))))));
3034#endif
3035 return Indices256<T>{vec.raw};
3036}
3037
3038// 64-bit lanes: convert indices to 8x32 unless AVX3 is available
3039template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 8)>
3040HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> idx64) {
3041 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3042 const Rebind<TI, decltype(d)> di;
3043 (void)di; // potentially unused
3044#if HWY_IS_DEBUG_BUILD
3045 HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) &&
3046 AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(32 / sizeof(T))))));
3047#endif
3048
3049#if HWY_TARGET <= HWY_AVX3
3050 (void)d;
3051 return Indices256<T>{idx64.raw};
3052#else
3053 const Repartition<float, decltype(d)> df; // 32-bit!
3054 // Replicate 64-bit index into upper 32 bits
3055 const Vec256<TI> dup =
3056 BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
3057 // For each idx64 i, idx32 are 2*i and 2*i+1.
3058 const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
3059 return Indices256<T>{idx32.raw};
3060#endif
3061}
3062
3063template <typename T, typename TI>
3064HWY_API Indices256<T> SetTableIndices(const Full256<T> d, const TI* idx) {
3065 const Rebind<TI, decltype(d)> di;
3066 return IndicesFromVec(d, LoadU(di, idx));
3067}
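// Example (minimal usage sketch, not part of this file): unlike most 256-bit
// ops, TableLookupLanes indexes across the whole vector, so any permutation
// of the 8 (or 4) lanes is possible.
//   const Full256<uint32_t> d;
//   alignas(32) const uint32_t idx[8] = {7, 6, 5, 4, 3, 2, 1, 0};
//   const auto rev = TableLookupLanes(Iota(d, 0), SetTableIndices(d, idx));
//   // rev = 7,6,5,4,3,2,1,0; Reverse below is implemented this way.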
3068
3069template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3071 return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
3072}
3073
3074template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3075HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
3076#if HWY_TARGET <= HWY_AVX3
3077 return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
3078#else
3079 return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
3080#endif
3081}
3082
3084 const Indices256<float> idx) {
3085 return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
3086}
3087
3089 const Indices256<double> idx) {
3090#if HWY_TARGET <= HWY_AVX3
3091 return Vec256<double>{_mm256_permutexvar_pd(idx.raw, v.raw)};
3092#else
3093 const Full256<double> df;
3094 const Full256<uint64_t> du;
3095 return BitCast(df, Vec256<uint64_t>{_mm256_permutevar8x32_epi32(
3096 BitCast(du, v).raw, idx.raw)});
3097#endif
3098}
3099
3100// ------------------------------ SwapAdjacentBlocks
3101
3102template <typename T>
3103HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
3104 return Vec256<T>{_mm256_permute2x128_si256(v.raw, v.raw, 0x01)};
3105}
3106
3108 return Vec256<float>{_mm256_permute2f128_ps(v.raw, v.raw, 0x01)};
3109}
3110
3112 return Vec256<double>{_mm256_permute2f128_pd(v.raw, v.raw, 0x01)};
3113}
3114
3115// ------------------------------ Reverse (RotateRight)
3116
3117template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3119 alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
3120 return TableLookupLanes(v, SetTableIndices(d, kReverse));
3121}
3122
3123template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3124HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
3125 alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0};
3126 return TableLookupLanes(v, SetTableIndices(d, kReverse));
3127}
3128
3129template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3130HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
3131#if HWY_TARGET <= HWY_AVX3
3132 const RebindToSigned<decltype(d)> di;
3133 alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
3134 7, 6, 5, 4, 3, 2, 1, 0};
3135 const Vec256<int16_t> idx = Load(di, kReverse);
3136 return BitCast(d, Vec256<int16_t>{
3137 _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3138#else
3139 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3140 const Vec256<uint32_t> rev32 = Reverse(du32, BitCast(du32, v));
3141 return BitCast(d, RotateRight<16>(rev32));
3142#endif
3143}
3144
3145// ------------------------------ Reverse2
3146
3147template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3149 const Full256<uint32_t> du32;
3150 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
3151}
3152
3153template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3154HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
3155 return Shuffle2301(v);
3156}
3157
3158template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3159HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
3160 return Shuffle01(v);
3161}
3162
3163// ------------------------------ Reverse4 (SwapAdjacentBlocks)
3164
3165template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3166HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
3167#if HWY_TARGET <= HWY_AVX3
3168 const RebindToSigned<decltype(d)> di;
3169 alignas(32) constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4,
3170 11, 10, 9, 8, 15, 14, 13, 12};
3171 const Vec256<int16_t> idx = Load(di, kReverse4);
3172 return BitCast(d, Vec256<int16_t>{
3173 _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3174#else
3175 const RepartitionToWide<decltype(d)> dw;
3176 return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
3177#endif
3178}
3179
3180template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3181HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
3182 return Shuffle0123(v);
3183}
3184
3185template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3186HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
3187 // Could also use _mm256_permute4x64_epi64.
3188 return SwapAdjacentBlocks(Shuffle01(v));
3189}
3190
3191// ------------------------------ Reverse8
3192
3193template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3194HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
3195#if HWY_TARGET <= HWY_AVX3
3196 const RebindToSigned<decltype(d)> di;
3197 alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
3198 15, 14, 13, 12, 11, 10, 9, 8};
3199 const Vec256<int16_t> idx = Load(di, kReverse8);
3200 return BitCast(d, Vec256<int16_t>{
3201 _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3202#else
3203 const RepartitionToWide<decltype(d)> dw;
3204 return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
3205#endif
3206}
3207
3208template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3209HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
3210 return Reverse(d, v);
3211}
3212
3213template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3214HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) {
3215 HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes
3216}
3217
3218// ------------------------------ InterleaveLower
3219
3220// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3221// the least-significant lane) and "b". To concatenate two half-width integers
3222// into one, use ZipLower/Upper instead (also works with scalar).
3223
3225 const Vec256<uint8_t> b) {
3226 return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
3227}
3229 const Vec256<uint16_t> b) {
3230 return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
3231}
3233 const Vec256<uint32_t> b) {
3234 return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
3235}
3237 const Vec256<uint64_t> b) {
3238 return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
3239}
3240
3242 const Vec256<int8_t> b) {
3243 return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
3244}
3246 const Vec256<int16_t> b) {
3247 return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
3248}
3250 const Vec256<int32_t> b) {
3251 return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
3252}
3254 const Vec256<int64_t> b) {
3255 return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
3256}
3257
3259 const Vec256<float> b) {
3260 return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};
3261}
3263 const Vec256<double> b) {
3264 return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
3265}
3266
3267// ------------------------------ InterleaveUpper
3268
3269// All functions inside detail lack the required D parameter.
3270namespace detail {
3271
3273 const Vec256<uint8_t> b) {
3274 return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3275}
3277 const Vec256<uint16_t> b) {
3278 return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3279}
3281 const Vec256<uint32_t> b) {
3282 return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3283}
3285 const Vec256<uint64_t> b) {
3286 return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3287}
3288
3290 const Vec256<int8_t> b) {
3291 return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3292}
3294 const Vec256<int16_t> b) {
3295 return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3296}
3298 const Vec256<int32_t> b) {
3299 return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3300}
3302 const Vec256<int64_t> b) {
3303 return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3304}
3305
3307 const Vec256<float> b) {
3308 return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)};
3309}
3311 const Vec256<double> b) {
3312 return Vec256<double>{_mm256_unpackhi_pd(a.raw, b.raw)};
3313}
3314
3315} // namespace detail
3316
3317template <typename T, class V = Vec256<T>>
3318HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
3319 return detail::InterleaveUpper(a, b);
3320}
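// Worked example (illustrative, not from the original): interleaving operates
// within each 128-bit block, so the upper block behaves like an independent
// 128-bit vector.
//   const Full256<uint32_t> d;
//   const auto a = Iota(d, 0);                 // 0..7
//   const auto b = Iota(d, 8);                 // 8..15
//   const auto lo = InterleaveLower(a, b);     // 0,8,1,9 | 4,12,5,13
//   const auto hi = InterleaveUpper(d, a, b);  // 2,10,3,11 | 6,14,7,15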
3321
3322// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3323
3324// Same as Interleave*, except that the return lanes are double-width integers;
3325// this is necessary because the single-lane scalar cannot return two values.
3326template <typename T, typename TW = MakeWide<T>>
3327HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
3328 return BitCast(Full256<TW>(), InterleaveLower(a, b));
3329}
3330template <typename T, typename TW = MakeWide<T>>
3331HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
3332 return BitCast(dw, InterleaveLower(a, b));
3333}
3334
3335template <typename T, typename TW = MakeWide<T>>
3336HWY_API Vec256<TW> ZipUpper(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
3337 return BitCast(dw, InterleaveUpper(dw, a, b));
3338}
3339
3340// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
3341
3342// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
3343// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
3344// extra cost) for LowerLower and UpperLower.
3345
3346// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
3347template <typename T>
3348HWY_API Vec256<T> ConcatLowerLower(Full256<T> d, const Vec256<T> hi,
3349 const Vec256<T> lo) {
3350 const Half<decltype(d)> d2;
3351 return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
3352}
3354 const Vec256<float> lo) {
3355 const Half<decltype(d)> d2;
3356 return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
3357}
3359 const Vec256<double> hi,
3360 const Vec256<double> lo) {
3361 const Half<decltype(d)> d2;
3362 return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
3363}
3364
3365// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
3366template <typename T>
3367HWY_API Vec256<T> ConcatLowerUpper(Full256<T> /* tag */, const Vec256<T> hi,
3368 const Vec256<T> lo) {
3369 return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
3370}
3372 const Vec256<float> hi,
3373 const Vec256<float> lo) {
3374 return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
3375}
3377 const Vec256<double> hi,
3378 const Vec256<double> lo) {
3379 return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
3380}
3381
3382// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
3383template <typename T>
3384HWY_API Vec256<T> ConcatUpperLower(Full256<T> /* tag */, const Vec256<T> hi,
3385 const Vec256<T> lo) {
3386 return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
3387}
3389 const Vec256<float> hi,
3390 const Vec256<float> lo) {
3391 return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
3392}
3394 const Vec256<double> hi,
3395 const Vec256<double> lo) {
3396 return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
3397}
3398
3399// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3400template <typename T>
3401HWY_API Vec256<T> ConcatUpperUpper(Full256<T> /* tag */, const Vec256<T> hi,
3402 const Vec256<T> lo) {
3403 return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
3404}
3406 const Vec256<float> hi,
3407 const Vec256<float> lo) {
3408 return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
3409}
3411 const Vec256<double> hi,
3412 const Vec256<double> lo) {
3413 return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
3414}
3415
3416// ------------------------------ ConcatOdd
3417
3418template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3420 const RebindToUnsigned<decltype(d)> du;
3421#if HWY_TARGET == HWY_AVX3_DL
3422 alignas(32) constexpr uint8_t kIdx[32] = {
3423 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
3424 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
3425 return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi8(
3426 BitCast(du, lo).raw, Load(du, kIdx).raw,
3427 __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
3428#else
3429 const RepartitionToWide<decltype(du)> dw;
3430 // Unsigned 8-bit shift so we can pack.
3431 const Vec256<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
3432 const Vec256<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
3433 const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
3434 return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
3435#endif
3436}
3437
3438template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3439HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3440 const RebindToUnsigned<decltype(d)> du;
3441#if HWY_TARGET <= HWY_AVX3
3442 alignas(32) constexpr uint16_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
3443 17, 19, 21, 23, 25, 27, 29, 31};
3444 return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi16(
3445 BitCast(du, lo).raw, Load(du, kIdx).raw,
3446 __mmask16{0xFFFF}, BitCast(du, hi).raw)});
3447#else
3448 const RepartitionToWide<decltype(du)> dw;
3449 // Unsigned 16-bit shift so we can pack.
3450 const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
3451 const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
3452 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
3453 return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
3454#endif
3455}
3456
3457template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3458HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3459 const RebindToUnsigned<decltype(d)> du;
3460#if HWY_TARGET <= HWY_AVX3
3461 alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3462 return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3463 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3464 BitCast(du, hi).raw)});
3465#else
3466 const RebindToFloat<decltype(d)> df;
3467 const Vec256<float> v3131{_mm256_shuffle_ps(
3468 BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
3469 return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
3470 _MM_SHUFFLE(3, 1, 2, 0))};
3471#endif
3472}
3473
3475 Vec256<float> lo) {
3476 const RebindToUnsigned<decltype(d)> du;
3477#if HWY_TARGET <= HWY_AVX3
3478 alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3479 return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3480 __mmask8{0xFF}, hi.raw)};
3481#else
3482 const Vec256<float> v3131{
3483 _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
3484 return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
3485 BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3486#endif
3487}
3488
3489template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3490HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3491 const RebindToUnsigned<decltype(d)> du;
3492#if HWY_TARGET <= HWY_AVX3
3493 alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3494 return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3495 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3496 BitCast(du, hi).raw)});
3497#else
3498 const RebindToFloat<decltype(d)> df;
3499 const Vec256<double> v31{
3500 _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
3501 return Vec256<T>{
3502 _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3503#endif
3504}
3505
3507 Vec256<double> lo) {
3508#if HWY_TARGET <= HWY_AVX3
3509 const RebindToUnsigned<decltype(d)> du;
3510 alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3511 return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3512 __mmask8{0xFF}, hi.raw)};
3513#else
3514 (void)d;
3515 const Vec256<double> v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)};
3516 return Vec256<double>{
3517 _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3518#endif
3519}
3520
3521// ------------------------------ ConcatEven
3522
3523template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3525 const RebindToUnsigned<decltype(d)> du;
3526#if HWY_TARGET == HWY_AVX3_DL
3527 alignas(64) constexpr uint8_t kIdx[32] = {
3528 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3529 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
3530 return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi8(
3531 BitCast(du, lo).raw, Load(du, kIdx).raw,
3532 __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
3533#else
3534 const RepartitionToWide<decltype(du)> dw;
3535 // Isolate lower 8 bits per u16 so we can pack.
3536 const Vec256<uint16_t> mask = Set(dw, 0x00FF);
3537 const Vec256<uint16_t> uH = And(BitCast(dw, hi), mask);
3538 const Vec256<uint16_t> uL = And(BitCast(dw, lo), mask);
3539 const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
3540 return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
3541#endif
3542}
3543
3544template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3545HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3546 const RebindToUnsigned<decltype(d)> du;
3547#if HWY_TARGET <= HWY_AVX3
3548 alignas(64) constexpr uint16_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3549 16, 18, 20, 22, 24, 26, 28, 30};
3550 return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi16(
3551 BitCast(du, lo).raw, Load(du, kIdx).raw,
3552 __mmask16{0xFFFF}, BitCast(du, hi).raw)});
3553#else
3554 const RepartitionToWide<decltype(du)> dw;
3555 // Isolate lower 16 bits per u32 so we can pack.
3556 const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
3557 const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
3558 const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
3559 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
3560 return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
3561#endif
3562}
3563
3564template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3565HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3566 const RebindToUnsigned<decltype(d)> du;
3567#if HWY_TARGET <= HWY_AVX3
3568 alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3569 return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3570 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3571 BitCast(du, hi).raw)});
3572#else
3573 const RebindToFloat<decltype(d)> df;
3574 const Vec256<float> v2020{_mm256_shuffle_ps(
3575 BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
3576 return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
3577 _MM_SHUFFLE(3, 1, 2, 0))};
3578
3579#endif
3580}
3581
3583 Vec256<float> lo) {
3584 const RebindToUnsigned<decltype(d)> du;
3585#if HWY_TARGET <= HWY_AVX3
3586 alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3587 return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3588 __mmask8{0xFF}, hi.raw)};
3589#else
3590 const Vec256<float> v2020{
3591 _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
3592 return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
3593 BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3594
3595#endif
3596}
3597
3598template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3599HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3600 const RebindToUnsigned<decltype(d)> du;
3601#if HWY_TARGET <= HWY_AVX3
3602 alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3603 return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3604 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3605 BitCast(du, hi).raw)});
3606#else
3607 const RebindToFloat<decltype(d)> df;
3608 const Vec256<double> v20{
3609 _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
3610 return Vec256<T>{
3611 _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3612
3613#endif
3614}
3615
3617 Vec256<double> lo) {
3618#if HWY_TARGET <= HWY_AVX3
3619 const RebindToUnsigned<decltype(d)> du;
3620 alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3621 return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3622 __mmask8{0xFF}, hi.raw)};
3623#else
3624 (void)d;
3625 const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
3626 return Vec256<double>{
3627 _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3628#endif
3629}
3630
3631// ------------------------------ DupEven (InterleaveLower)
3632
3633template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3635 return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3636}
3638 return Vec256<float>{
3639 _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3640}
3641
3642template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3643HWY_API Vec256<T> DupEven(const Vec256<T> v) {
3644 return InterleaveLower(Full256<T>(), v, v);
3645}
3646
3647// ------------------------------ DupOdd (InterleaveUpper)
3648
3649template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3651 return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3652}
3654 return Vec256<float>{
3655 _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3656}
3657
3658template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3659HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
3660 return InterleaveUpper(Full256<T>(), v, v);
3661}
3662
3663// ------------------------------ OddEven
3664
3665namespace detail {
3666
3667template <typename T>
3669 const Vec256<T> b) {
3670 const Full256<T> d;
3671 const Full256<uint8_t> d8;
3672 alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3673 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3674 return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a);
3675}
3676template <typename T>
3678 const Vec256<T> b) {
3679 return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
3680}
3681template <typename T>
3683 const Vec256<T> b) {
3684 return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
3685}
3686template <typename T>
3688 const Vec256<T> b) {
3689 return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
3690}
3691
3692} // namespace detail
3693
3694template <typename T>
3695HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
3696 return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3697}
3699 return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
3700}
3701
3703 return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
3704}
3705
3706// ------------------------------ OddEvenBlocks
3707
3708template <typename T>
3710 return Vec256<T>{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)};
3711}
3712
3714 return Vec256<float>{_mm256_blend_ps(odd.raw, even.raw, 0xFu)};
3715}
3716
3718 return Vec256<double>{_mm256_blend_pd(odd.raw, even.raw, 0x3u)};
3719}
3720
3721// ------------------------------ ReverseBlocks (ConcatLowerUpper)
3722
3723template <typename T>
3724HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
3725 return ConcatLowerUpper(d, v, v);
3726}
3727
3728// ------------------------------ TableLookupBytes (ZeroExtendVector)
3729
3730// Both full
3731template <typename T, typename TI>
3732HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
3733 const Vec256<TI> from) {
3734 return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
3735}
3736
3737// Partial index vector
3738template <typename T, typename TI, size_t NI>
3739HWY_API Vec128<TI, NI> TableLookupBytes(const Vec256<T> bytes,
3740 const Vec128<TI, NI> from) {
3741 // First expand to full 128, then 256.
3742 const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
3743 const auto tbl_full = TableLookupBytes(bytes, from_256);
3744 // Shrink to 128, then partial.
3745 return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
3746}
3747
3748// Partial table vector
3749template <typename T, size_t N, typename TI>
3750HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
3751 const Vec256<TI> from) {
3752 // First expand to full 128, then 256.
3753 const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
3754 return TableLookupBytes(bytes_256, from);
3755}
3756
3757// Partial both are handled by x86_128.
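// Example (minimal usage sketch, not part of this file): TableLookupBytes has
// pshufb semantics, i.e. an independent 16-byte table lookup in each 128-bit
// block, so an index only selects bytes from its own block.
//   const Full256<uint8_t> d;
//   const auto bytes = Iota(d, 0);  // 0..31
//   alignas(32) uint8_t idx[32];
//   for (size_t i = 0; i < 32; ++i) idx[i] = static_cast<uint8_t>(15 - (i & 15));
//   const auto r = TableLookupBytes(bytes, Load(d, idx));
//   // r = 15,14,...,0 in the lower block and 31,30,...,16 in the upper block.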
3758
3759// ------------------------------ Shl (Mul, ZipLower)
3760
3761namespace detail {
3762
3763#if HWY_TARGET > HWY_AVX3 && !HWY_IDE // AVX2 or older
3764
3765// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
3766template <typename T>
3767HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
3768 static_assert(sizeof(T) == 2, "Only for 16-bit");
3769 const Full256<T> d;
3770 const RepartitionToWide<decltype(d)> dw;
3771 const Rebind<float, decltype(dw)> df;
3772 const auto zero = Zero(d);
3773 // Move into exponent (this u16 will become the upper half of an f32)
3774 const auto exp = ShiftLeft<23 - 16>(v);
3775 const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
3776 // Insert 0 into lower halves for reinterpreting as binary32.
3777 const auto f0 = ZipLower(dw, zero, upper);
3778 const auto f1 = ZipUpper(dw, zero, upper);
3779 // Do not use ConvertTo because it checks for overflow, which is redundant
3780 // because we only care about v in [0, 16).
3781 const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)};
3782 const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)};
3783 return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
3784}
3785
3786#endif // HWY_TARGET > HWY_AVX3
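// --- Editorial sketch (not part of x86_256-inl.h): a scalar model of the
// Pow2 trick above, under the assumption that placing v into the binary32
// exponent field and truncating back to an integer is the intended math.
// The helper name is hypothetical.
#include <cstdint>
#include <cstring>
static inline uint16_t Pow2Scalar16(uint16_t v) {  // valid for v in [0, 16)
  const uint32_t bits = 0x3F800000u + (uint32_t{v} << 23);  // 1.0f * 2^v
  float f;
  std::memcpy(&f, &bits, sizeof(f));   // reinterpret as binary32
  return static_cast<uint16_t>(f);     // truncate: exactly 2^v
}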
3787
3788HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
3789 Vec256<uint16_t> bits) {
3790#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
3791 return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
3792#else
3793 return v * Pow2(bits);
3794#endif
3795}
3796
3797HWY_INLINE Vec256<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> v,
3798 Vec256<uint32_t> bits) {
3799 return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
3800}
3801
3802HWY_INLINE Vec256<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> v,
3803 Vec256<uint64_t> bits) {
3804 return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
3805}
3806
3807template <typename T>
3808HWY_INLINE Vec256<T> Shl(hwy::SignedTag /*tag*/, Vec256<T> v, Vec256<T> bits) {
3809 // Signed left shifts are the same as unsigned.
3810 const Full256<T> di;
3811 const Full256<MakeUnsigned<T>> du;
3812 return BitCast(di,
3813 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
3814}
3815
3816} // namespace detail
3817
3818template <typename T>
3819HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
3820 return detail::Shl(hwy::TypeTag<T>(), v, bits);
3821}
3822
3823// ------------------------------ Shr (MulHigh, IfThenElse, Not)
3824
3825HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
3826#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
3827 return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
3828#else
3829 const Full256<uint16_t> d;
3830 // For bits=0, we cannot mul by 2^16, so fix the result later.
3831 auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
3832 // Replace output with input where bits == 0.
3833 return IfThenElse(bits == Zero(d), v, out);
3834#endif
3835}
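// --- Editorial sketch (not part of x86_256-inl.h): scalar model of the AVX2
// fallback above. A logical right shift by n equals the high half of the
// 32-bit product with 2^(16-n); n = 0 is special-cased because 2^16 does not
// fit in 16 bits. Helper name is hypothetical; valid for n in [0, 16).
#include <cstdint>
static inline uint16_t ShrModel16(uint16_t v, uint16_t n) {
  if (n == 0) return v;                    // IfThenElse(bits == Zero(d), v, out)
  const uint32_t pow2 = 1u << (16 - n);    // detail::Pow2(16 - bits)
  return static_cast<uint16_t>((uint32_t{v} * pow2) >> 16);  // MulHigh
}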
3836
3837HWY_API Vec256<uint32_t> operator>>(Vec256<uint32_t> v, Vec256<uint32_t> bits) {
3838 return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
3839}
3840
3841HWY_API Vec256<uint64_t> operator>>(Vec256<uint64_t> v, Vec256<uint64_t> bits) {
3842 return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
3843}
3844
3845HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) {
3846#if HWY_TARGET <= HWY_AVX3
3847 return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
3848#else
3849 return detail::SignedShr(Full256<int16_t>(), v, bits);
3850#endif
3851}
3852
3853HWY_API Vec256<int32_t> operator>>(Vec256<int32_t> v, Vec256<int32_t> bits) {
3854 return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
3855}
3856
3857HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
3858#if HWY_TARGET <= HWY_AVX3
3859 return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
3860#else
3861 return detail::SignedShr(Full256<int64_t>(), v, bits);
3862#endif
3863}
3864
3865HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
3866 const Vec256<uint64_t> b) {
3867 const Full256<uint64_t> du64;
3868 const RepartitionToNarrow<decltype(du64)> du32;
3869 const auto maskL = Set(du64, 0xFFFFFFFFULL);
3870 const auto a32 = BitCast(du32, a);
3871 const auto b32 = BitCast(du32, b);
3872 // Inputs for MulEven: we only need the lower 32 bits
3873 const auto aH = Shuffle2301(a32);
3874 const auto bH = Shuffle2301(b32);
3875
3876 // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
3877 // the even (lower 64 bits of every 128-bit block) results. See
3878 // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
3879 const auto aLbL = MulEven(a32, b32);
3880 const auto w3 = aLbL & maskL;
3881
3882 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3883 const auto w2 = t2 & maskL;
3884 const auto w1 = ShiftRight<32>(t2);
3885
3886 const auto t = MulEven(a32, bH) + w2;
3887 const auto k = ShiftRight<32>(t);
3888
3889 const auto mulH = MulEven(aH, bH) + w1 + k;
3890 const auto mulL = ShiftLeft<32>(t) + w3;
3891 return InterleaveLower(mulL, mulH);
3892}
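// --- Editorial sketch (not part of x86_256-inl.h): scalar version of the
// Knuth double-word multiply used above, mirroring the w3/w2/w1/t/k terms.
// It returns the high and low 64 bits of a*b using only 32x32->64 products.
#include <cstdint>
static inline void MulWide64(uint64_t a, uint64_t b, uint64_t* hi,
                             uint64_t* lo) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  const uint64_t aLbL = aL * bL;
  const uint64_t w3 = aLbL & 0xFFFFFFFFu;
  const uint64_t t2 = aH * bL + (aLbL >> 32);
  const uint64_t w2 = t2 & 0xFFFFFFFFu;
  const uint64_t w1 = t2 >> 32;
  const uint64_t t = aL * bH + w2;
  const uint64_t k = t >> 32;
  *hi = aH * bH + w1 + k;   // mulH
  *lo = (t << 32) + w3;     // mulL
}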
3893
3894HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
3895 const Vec256<uint64_t> b) {
3896 const Full256<uint64_t> du64;
3897 const RepartitionToNarrow<decltype(du64)> du32;
3898 const auto maskL = Set(du64, 0xFFFFFFFFULL);
3899 const auto a32 = BitCast(du32, a);
3900 const auto b32 = BitCast(du32, b);
3901 // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
3902 const auto aH = Shuffle2301(a32);
3903 const auto bH = Shuffle2301(b32);
3904
3905 // Same as above, but we're using the odd results (upper 64 bits per block).
3906 const auto aLbL = MulEven(a32, b32);
3907 const auto w3 = aLbL & maskL;
3908
3909 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3910 const auto w2 = t2 & maskL;
3911 const auto w1 = ShiftRight<32>(t2);
3912
3913 const auto t = MulEven(a32, bH) + w2;
3914 const auto k = ShiftRight<32>(t);
3915
3916 const auto mulH = MulEven(aH, bH) + w1 + k;
3917 const auto mulL = ShiftLeft<32>(t) + w3;
3918 return InterleaveUpper(du64, mulL, mulH);
3919}
3920
3921// ------------------------------ ReorderWidenMulAccumulate
3922
3923HWY_API Vec256<int32_t> ReorderWidenMulAccumulate(
3924 Full256<int32_t> /*d32*/, Vec256<int16_t> a, Vec256<int16_t> b,
3925 const Vec256<int32_t> sum0,
3926 Vec256<int32_t>& /*sum1*/) {
3927 return sum0 + Vec256<int32_t>{_mm256_madd_epi16(a.raw, b.raw)};
3928}
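// --- Editorial sketch (not part of x86_256-inl.h): scalar model of
// _mm256_madd_epi16 as used above. Each i32 output lane accumulates the dot
// product of one adjacent pair of i16 lanes; helper name is hypothetical.
#include <cstdint>
static inline int32_t MaddPairModel(int16_t a0, int16_t a1, int16_t b0,
                                    int16_t b1, int32_t sum0) {
  return sum0 + int32_t{a0} * b0 + int32_t{a1} * b1;
}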
3929
3930// ------------------------------ RearrangeToOddPlusEven
3931HWY_API Vec256<int32_t> RearrangeToOddPlusEven(const Vec256<int32_t> sum0,
3932 Vec256<int32_t> /*sum1*/) {
3933 return sum0; // invariant already holds
3934}
3935
3936// ================================================== CONVERT
3937
3938// ------------------------------ Promotions (part w/ narrow lanes -> full)
3939
3940HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3941 const Vec128<float, 4> v) {
3942 return Vec256<double>{_mm256_cvtps_pd(v.raw)};
3943}
3944
3945HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3946 const Vec128<int32_t, 4> v) {
3947 return Vec256<double>{_mm256_cvtepi32_pd(v.raw)};
3948}
3949
3950// Unsigned: zero-extend.
3951// Note: these have 3 cycle latency; if inputs are already split across the
3952// 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
3953HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
3954 Vec128<uint8_t> v) {
3955 return Vec256<uint16_t>{_mm256_cvtepu8_epi16(v.raw)};
3956}
3957HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
3958 Vec128<uint8_t, 8> v) {
3959 return Vec256<uint32_t>{_mm256_cvtepu8_epi32(v.raw)};
3960}
3961HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
3962 Vec128<uint8_t> v) {
3963 return Vec256<int16_t>{_mm256_cvtepu8_epi16(v.raw)};
3964}
3965HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3966 Vec128<uint8_t, 8> v) {
3967 return Vec256<int32_t>{_mm256_cvtepu8_epi32(v.raw)};
3968}
3969HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
3970 Vec128<uint16_t> v) {
3971 return Vec256<uint32_t>{_mm256_cvtepu16_epi32(v.raw)};
3972}
3973HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3974 Vec128<uint16_t> v) {
3975 return Vec256<int32_t>{_mm256_cvtepu16_epi32(v.raw)};
3976}
3977HWY_API Vec256<uint64_t> PromoteTo(Full256<uint64_t> /* tag */,
3978 Vec128<uint32_t> v) {
3979 return Vec256<uint64_t>{_mm256_cvtepu32_epi64(v.raw)};
3980}
3981
3982// Signed: replicate sign bit.
3983// Note: these have 3 cycle latency; if inputs are already split across the
3984// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
3985// signed shift would be faster.
3986HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
3987 Vec128<int8_t> v) {
3988 return Vec256<int16_t>{_mm256_cvtepi8_epi16(v.raw)};
3989}
3990HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3991 Vec128<int8_t, 8> v) {
3992 return Vec256<int32_t>{_mm256_cvtepi8_epi32(v.raw)};
3993}
3994HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3995 Vec128<int16_t> v) {
3996 return Vec256<int32_t>{_mm256_cvtepi16_epi32(v.raw)};
3997}
3998HWY_API Vec256<int64_t> PromoteTo(Full256<int64_t> /* tag */,
3999 Vec128<int32_t> v) {
4000 return Vec256<int64_t>{_mm256_cvtepi32_epi64(v.raw)};
4001}
4002
4003// ------------------------------ Demotions (full -> part w/ narrow lanes)
4004
4005HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
4006 const Vec256<int32_t> v) {
4007 const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
4008 // Concatenating lower halves of both 128-bit blocks afterward is more
4009 // efficient than an extra input with low block = high block of v.
4010 return Vec128<uint16_t>{
4011 _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
4012}
4013
4014HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
4015 const Vec256<int32_t> v) {
4016 const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
4017 return Vec128<int16_t>{
4018 _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
4019}
4020
4021HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */,
4022 const Vec256<int32_t> v) {
4023 const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
4024 // Concatenate lower 64 bits of each 128-bit block
4025 const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
4026 const __m128i u16 = _mm256_castsi256_si128(u16_concat);
4027 // packus treats the input as signed; we want unsigned. Clear the MSB to get
4028 // unsigned saturation to u8.
4029 const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
4030 return Vec128<uint8_t, 8>{_mm_packus_epi16(i16, i16)};
4031}
4032
4033HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
4034 const Vec256<int16_t> v) {
4035 const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
4036 return Vec128<uint8_t>{
4037 _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
4038}
4039
4040HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */,
4041 const Vec256<int32_t> v) {
4042 const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
4043 // Concatenate lower 64 bits of each 128-bit block
4044 const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
4045 const __m128i i16 = _mm256_castsi256_si128(i16_concat);
4046 return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)};
4047}
4048
4049HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
4050 const Vec256<int16_t> v) {
4051 const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
4052 return Vec128<int8_t>{
4053 _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
4054}
4055
4056 // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
4057 // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
4058HWY_DIAGNOSTICS(push)
4059HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
4060
4061HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> df16,
4062 const Vec256<float> v) {
4063#ifdef HWY_DISABLE_F16C
4064 const RebindToUnsigned<decltype(df16)> du16;
4065 const Rebind<uint32_t, decltype(df16)> du;
4066 const RebindToSigned<decltype(du)> di;
4067 const auto bits32 = BitCast(du, v);
4068 const auto sign = ShiftRight<31>(bits32);
4069 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
4070 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
4071
4072 const auto k15 = Set(di, 15);
4073 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
4074 const auto is_tiny = exp < Set(di, -24);
4075
4076 const auto is_subnormal = exp < Set(di, -14);
4077 const auto biased_exp16 =
4078 BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
4079 const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
4080 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
4081 (mantissa32 >> (Set(du, 13) + sub_exp));
4082 const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
4083 ShiftRight<13>(mantissa32)); // <1024
4084
4085 const auto sign16 = ShiftLeft<15>(sign);
4086 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
4087 const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
4088 return BitCast(df16, DemoteTo(du16, bits16));
4089#else
4090 (void)df16;
4091 return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
4092#endif
4093}
4094
4095HWY_DIAGNOSTICS(pop)
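// --- Editorial sketch (not part of x86_256-inl.h): scalar model of the
// HWY_DISABLE_F16C path in DemoteTo above. Like the vector code, it truncates
// the low 13 mantissa bits (round toward zero) and zeroes values below the
// smallest f16 subnormal. Helper name is hypothetical.
#include <cstdint>
#include <cstring>
static inline uint16_t F32ToF16Model(float f) {
  uint32_t bits32;
  std::memcpy(&bits32, &f, sizeof(bits32));
  const uint32_t sign = bits32 >> 31;
  const int32_t unbiased = static_cast<int32_t>((bits32 >> 23) & 0xFF) - 127;
  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
  const int32_t exp = unbiased < 15 ? unbiased : 15;  // Min(exp, 15)
  if (exp < -24) return 0;                            // is_tiny
  uint32_t biased_exp16, mantissa16;
  if (exp < -14) {                                    // is_subnormal
    biased_exp16 = 0;
    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);  // [1, 11)
    mantissa16 = (1u << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));
  } else {
    biased_exp16 = static_cast<uint32_t>(exp + 15);
    mantissa16 = mantissa32 >> 13;
  }
  return static_cast<uint16_t>((sign << 15) | (biased_exp16 << 10) | mantissa16);
}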
4096
4097HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
4098 const Vec256<float> v) {
4099 // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
4100 const Rebind<int32_t, decltype(dbf16)> di32;
4101 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4102 const Rebind<uint16_t, decltype(dbf16)> du16;
4103 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4104 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4105}
4106
4107HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
4108 Vec256<float> a, Vec256<float> b) {
4109 // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16.
4110 const RebindToUnsigned<decltype(dbf16)> du16;
4111 const Repartition<uint32_t, decltype(dbf16)> du32;
4112 const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
4113 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4114}
4115
4116HWY_API Vec256<int16_t> ReorderDemote2To(Full256<int16_t> /*d16*/,
4117 Vec256<int32_t> a, Vec256<int32_t> b) {
4118 return Vec256<int16_t>{_mm256_packs_epi32(a.raw, b.raw)};
4119}
4120
4121HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
4122 const Vec256<double> v) {
4123 return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
4124}
4125
4126HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
4127 const Vec256<double> v) {
4128 const auto clamped = detail::ClampF64ToI32Max(Full256<double>(), v);
4129 return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
4130}
4131
4132// For already range-limited input [0, 255].
4133HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
4134 const Full256<uint32_t> d32;
4135 alignas(32) static constexpr uint32_t k8From32[8] = {
4136 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
4137 // Place first four bytes in lo[0], remaining 4 in hi[1].
4138 const auto quad = TableLookupBytes(v, Load(d32, k8From32));
4139 // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
4140 const auto lo = LowerHalf(quad);
4141 const auto hi = UpperHalf(Full128<uint32_t>(), quad);
4142 const auto pair = LowerHalf(lo | hi);
4143 return BitCast(Full64<uint8_t>(), pair);
4144}
4145
4146// ------------------------------ Truncations
4147
4148namespace detail {
4149
4150// LO and HI each hold four indices of bytes within a 128-bit block.
4151template <uint32_t LO, uint32_t HI, typename T>
4153 const Full256<uint32_t> d32;
4154
4155#if HWY_TARGET <= HWY_AVX3_DL
4156 alignas(32) constexpr uint32_t kMap[8] = {
4157 LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
4158 const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
4159#else
4160 alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u,
4161 ~0u, ~0u, LO, HI};
4162 const auto quad = TableLookupBytes(v, Load(d32, kMap));
4163 const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
4164 // Possible alternative:
4165 // const auto lo = LowerHalf(quad);
4166 // const auto hi = UpperHalf(Full128<uint32_t>(), quad);
4167 // const auto result = lo | hi;
4168#endif
4169
4170 return Vec128<uint32_t>{_mm256_castsi256_si128(result)};
4171}
4172
4173// LO and HI each hold two indices of bytes within a 128-bit block.
4174template <uint16_t LO, uint16_t HI, typename T>
4176 const Full256<uint16_t> d16;
4177
4178#if HWY_TARGET <= HWY_AVX3_DL
4179 alignas(32) constexpr uint16_t kMap[16] = {
4180 LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
4181 const auto result = _mm256_permutexvar_epi8(v.raw, Load(d16, kMap).raw);
4182 return LowerHalf(Vec128<uint32_t>{_mm256_castsi256_si128(result)});
4183#else
4184 constexpr uint16_t ff = static_cast<uint16_t>(~0u);
4185 alignas(32) static constexpr uint16_t kMap[16] = {
4186 LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
4187 const auto quad = TableLookupBytes(v, Load(d16, kMap));
4188 const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
4189 const auto half = _mm256_castsi256_si128(mixed);
4190 return LowerHalf(Vec128<uint32_t>{_mm_packus_epi32(half, half)});
4191#endif
4192}
4193
4194} // namespace detail
4195
4197 const Vec256<uint64_t> v) {
4198 const Full256<uint32_t> d32;
4199#if HWY_TARGET <= HWY_AVX3_DL
4200 alignas(32) constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, 0, 0, 0, 0};
4201 const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
4202 return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{result})));
4203#else
4204 alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
4205 0x0800FFFFu, ~0u, ~0u, ~0u};
4206 const auto quad = TableLookupBytes(v, Load(d32, kMap));
4207 const auto lo = LowerHalf(quad);
4208 const auto hi = UpperHalf(Full128<uint32_t>(), quad);
4209 const auto result = lo | hi;
4210 return LowerHalf(LowerHalf(Vec128<uint8_t>{result.raw}));
4211#endif
4212}
4213
4219
4221 const Vec256<uint64_t> v) {
4222 const Full256<uint32_t> d32;
4223 alignas(32) constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
4224 const auto v32 =
4225 TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
4226 return LowerHalf(Vec256<uint32_t>{v32.raw});
4227}
4228
4234
4240
4246
4247// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
4248
4249HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
4250 const Vec256<int32_t> v) {
4251 return Vec256<float>{_mm256_cvtepi32_ps(v.raw)};
4252}
4253
4254HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
4255#if HWY_TARGET <= HWY_AVX3
4256 (void)dd;
4257 return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
4258#else
4259 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4260 const Repartition<uint32_t, decltype(dd)> d32;
4261 const Repartition<uint64_t, decltype(dd)> d64;
4262
4263 // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
4264 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4265 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4266
4267 // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
4268 const auto k52 = Set(d32, 0x43300000);
4269 const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
4270
4271 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
4272 return (v_upper - k84_63_52) + v_lower; // order matters!
4273#endif
4274}
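// --- Editorial sketch (not part of x86_256-inl.h): scalar model of the
// 2^84 / 2^52 magic-number path above. The upper 32 bits land in the mantissa
// of 2^84 + 2^63 (the XOR with 2^63 keeps the stored value non-negative), the
// lower 32 bits in the mantissa of 2^52; subtracting the combined constant
// recovers x. Helper name is hypothetical.
#include <cstdint>
#include <cstring>
static inline double I64ToF64Model(int64_t x) {
  const uint64_t u = static_cast<uint64_t>(x);
  const uint64_t hi_bits = 0x4530000080000000ull ^ (u >> 32);   // k84_63 path
  const uint64_t lo_bits = 0x4330000000000000ull | (u & 0xFFFFFFFFull);  // 2^52
  const uint64_t k_bits = 0x4530000080100000ull;  // 2^84 + 2^63 + 2^52
  double upper, lower, k;
  std::memcpy(&upper, &hi_bits, 8);
  std::memcpy(&lower, &lo_bits, 8);
  std::memcpy(&k, &k_bits, 8);
  return (upper - k) + lower;  // order matters, as noted above
}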
4275
4276HWY_API Vec256<float> ConvertTo(const Full256<float> df,
4277 const Vec256<uint32_t> v) {
4278#if HWY_TARGET <= HWY_AVX3
4279 return Vec256<float>{_mm256_cvtepu32_ps(v.raw)};
4280#else
4281 // Based on wim's approach (https://stackoverflow.com/questions/34066228/)
4282 const RebindToUnsigned<decltype(df)> du32;
4283 const RebindToSigned<decltype(df)> d32;
4284
4285 const auto msk_lo = Set(du32, 0xFFFF);
4286 const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
4287
4288 // Extract the 16 lowest/highest significant bits of v and cast to signed int
4289 const auto v_lo = BitCast(d32, And(v, msk_lo));
4290 const auto v_hi = BitCast(d32, ShiftRight<16>(v));
4291
4292 return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
4293#endif
4294}
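// --- Editorial sketch (not part of x86_256-inl.h): scalar model of the AVX2
// fallback above. Both 16-bit halves convert to f32 exactly, and one
// multiply-add recombines them. Helper name is hypothetical.
#include <cstdint>
static inline float U32ToF32Model(uint32_t v) {
  return 65536.0f * static_cast<float>(v >> 16) +  // cnst2_16_flt * hi
         static_cast<float>(v & 0xFFFFu);          // + lo
}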
4295
4296HWY_API Vec256<double> ConvertTo(Full256<double> dd,
4297 const Vec256<uint64_t> v) {
4298#if HWY_TARGET <= HWY_AVX3
4299 return Vec256<double>{_mm256_cvtepu64_pd(v.raw)};
4300#else
4301 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4302 const RebindToUnsigned<decltype(dd)> d64;
4303 using VU = VFromD<decltype(d64)>;
4304
4305 const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
4306 const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
4307
4308 // Extract the 32 lowest significant bits of v
4309 const VU v_lo = And(v, msk_lo);
4310 const VU v_hi = ShiftRight<32>(v);
4311
4312 auto uint64_to_double256_fast = [&dd](Vec256<uint64_t> w) HWY_ATTR {
4314 detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
4315 return BitCast(dd, w) - Set(dd, 0x0010000000000000);
4316 };
4317
4318 const auto v_lo_dbl = uint64_to_double256_fast(v_lo);
4319 return MulAdd(cnst2_32_dbl, uint64_to_double256_fast(v_hi), v_lo_dbl);
4320#endif
4321}
4322
4323// Truncates (rounds toward zero).
4324HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
4325 return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
4326}
4327
4328HWY_API Vec256<int64_t> ConvertTo(Full256<int64_t> di, const Vec256<double> v) {
4329#if HWY_TARGET <= HWY_AVX3
4330 return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw));
4331#else
4332 using VI = decltype(Zero(di));
4333 const VI k0 = Zero(di);
4334 const VI k1 = Set(di, 1);
4335 const VI k51 = Set(di, 51);
4336
4337 // Exponent indicates whether the number can be represented as int64_t.
4338 const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
4339 const VI exp = biased_exp - Set(di, 0x3FF);
4340 const auto in_range = exp < Set(di, 63);
4341
4342 // If we were to cap the exponent at 51 and add 2^52, the number would be in
4343 // [2^52, 2^53) and mantissa bits could be read out directly. We need to
4344 // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
4345 // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
4346 // manually shift the mantissa into place (we already have many of the
4347 // inputs anyway).
4348 const VI shift_mnt = Max(k51 - exp, k0);
4349 const VI shift_int = Max(exp - k51, k0);
4350 const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
4351 // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
4352 const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4353 // For inputs larger than 2^52, insert zeros at the bottom.
4354 const VI shifted = int52 << shift_int;
4355 // Restore the one bit lost when shifting in the implicit 1-bit.
4356 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4357
4358 // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
4359 const VI sign_mask = BroadcastSignBit(BitCast(di, v));
4360 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4361 const VI magnitude = IfThenElse(in_range, restored, limit);
4362
4363 // If the input was negative, negate the integer (two's complement).
4364 return (magnitude ^ sign_mask) - sign_mask;
4365#endif
4366}
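// --- Editorial sketch (not part of x86_256-inl.h): scalar model of the AVX2
// fallback above, under the assumption that out-of-range variable shifts
// behave like the x86 vector instructions (they produce 0). Helper name is
// hypothetical.
#include <algorithm>
#include <cstdint>
#include <cstring>
static inline int64_t F64ToI64Model(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, 8);
  const auto shl = [](uint64_t v, int64_t n) -> uint64_t {
    return (n < 0 || n > 63) ? 0 : (v << n);  // emulate _mm256_sllv_epi64
  };
  const int64_t exp = static_cast<int64_t>((bits >> 52) & 0x7FF) - 0x3FF;
  const uint64_t mantissa = bits & ((1ull << 52) - 1);
  const int64_t shift_mnt = std::max<int64_t>(51 - exp, 0);
  const int64_t shift_int = std::max<int64_t>(exp - 51, 0);
  // Include the implicit 1-bit, then shift the mantissa into integer position.
  const uint64_t int52 =
      shift_mnt + 1 > 63 ? 0 : (mantissa | (1ull << 52)) >> (shift_mnt + 1);
  const uint64_t restored =
      shl(int52, shift_int) | shl(mantissa & 1, shift_int - 1);
  const uint64_t sign_mask = (bits >> 63) ? ~uint64_t{0} : 0;  // BroadcastSignBit
  const uint64_t limit = static_cast<uint64_t>(INT64_MAX) - sign_mask;
  const uint64_t magnitude = (exp < 63) ? restored : limit;    // saturate
  return static_cast<int64_t>((magnitude ^ sign_mask) - sign_mask);  // negate
}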
4367
4368HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
4369 const Full256<int32_t> di;
4370 return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw));
4371}
4372
4373
4374HWY_API Vec256<float> PromoteTo(Full256<float> df32,
4375 const Vec128<float16_t> v) {
4376#ifdef HWY_DISABLE_F16C
4377 const RebindToSigned<decltype(df32)> di32;
4378 const RebindToUnsigned<decltype(df32)> du32;
4379 // Expand to u32 so we can shift.
4380 const auto bits16 = PromoteTo(du32, Vec128<uint16_t>{v.raw});
4381 const auto sign = ShiftRight<15>(bits16);
4382 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
4383 const auto mantissa = bits16 & Set(du32, 0x3FF);
4384 const auto subnormal =
4385 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
4386 Set(df32, 1.0f / 16384 / 1024));
4387
4388 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
4389 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
4390 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
4391 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
4392 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
4393#else
4394 (void)df32;
4395 return Vec256<float>{_mm256_cvtph_ps(v.raw)};
4396#endif
4397}
4398
4399HWY_API Vec256<float> PromoteTo(Full256<float> df32,
4400 const Vec128<bfloat16_t> v) {
4401 const Rebind<uint16_t, decltype(df32)> du16;
4402 const RebindToSigned<decltype(df32)> di32;
4403 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4404}
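// --- Editorial sketch (not part of x86_256-inl.h): scalar model of the
// bf16 <-> f32 mapping used here and in DemoteTo above; bf16 is simply the
// upper 16 bits of the binary32 encoding (the demote path truncates rather
// than rounds). Helper names are hypothetical.
#include <cstdint>
#include <cstring>
static inline float BF16ToF32Model(uint16_t bf16_bits) {
  const uint32_t bits32 = uint32_t{bf16_bits} << 16;  // ShiftLeft<16>
  float f;
  std::memcpy(&f, &bits32, sizeof(f));
  return f;
}
static inline uint16_t F32ToBF16Model(float f) {
  uint32_t bits32;
  std::memcpy(&bits32, &f, sizeof(bits32));
  return static_cast<uint16_t>(bits32 >> 16);  // ShiftRight<16>, demote to u16
}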
4405
4406// ================================================== CRYPTO
4407
4408#if !defined(HWY_DISABLE_PCLMUL_AES)
4409
4410// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4411#ifdef HWY_NATIVE_AES
4412#undef HWY_NATIVE_AES
4413#else
4414#define HWY_NATIVE_AES
4415#endif
4416
4417HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
4418 Vec256<uint8_t> round_key) {
4419#if HWY_TARGET == HWY_AVX3_DL
4420 return Vec256<uint8_t>{_mm256_aesenc_epi128(state.raw, round_key.raw)};
4421#else
4422 const Full256<uint8_t> d;
4423 const Half<decltype(d)> d2;
4424 return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
4425 AESRound(LowerHalf(state), LowerHalf(round_key)));
4426#endif
4427}
4428
4429HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
4430 Vec256<uint8_t> round_key) {
4431#if HWY_TARGET == HWY_AVX3_DL
4432 return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
4433#else
4434 const Full256<uint8_t> d;
4435 const Half<decltype(d)> d2;
4436 return Combine(d,
4437 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
4438 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
4439#endif
4440}
4441
4442HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
4443#if HWY_TARGET == HWY_AVX3_DL
4444 return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};
4445#else
4446 const Full256<uint64_t> d;
4447 const Half<decltype(d)> d2;
4448 return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)),
4449 CLMulLower(LowerHalf(a), LowerHalf(b)));
4450#endif
4451}
4452
4453HWY_API Vec256<uint64_t> CLMulUpper(Vec256<uint64_t> a, Vec256<uint64_t> b) {
4454#if HWY_TARGET == HWY_AVX3_DL
4455 return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)};
4456#else
4457 const Full256<uint64_t> d;
4458 const Half<decltype(d)> d2;
4459 return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)),
4460 CLMulUpper(LowerHalf(a), LowerHalf(b)));
4461#endif
4462}
4463
4464#endif // HWY_DISABLE_PCLMUL_AES
4465
4466// ================================================== MISC
4467
4468// Returns a vector with lane i=[0, N) set to "first" + i.
4469template <typename T, typename T2>
4470HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
4471 HWY_ALIGN T lanes[32 / sizeof(T)];
4472 for (size_t i = 0; i < 32 / sizeof(T); ++i) {
4473 lanes[i] =
4474 AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
4475 }
4476 return Load(d, lanes);
4477}
4478
4479#if HWY_TARGET <= HWY_AVX3
4480
4481// ------------------------------ LoadMaskBits
4482
4483// `p` points to at least 8 readable bytes, not all of which need be valid.
4484template <typename T>
4485HWY_API Mask256<T> LoadMaskBits(const Full256<T> /* tag */,
4486 const uint8_t* HWY_RESTRICT bits) {
4487 constexpr size_t N = 32 / sizeof(T);
4488 constexpr size_t kNumBytes = (N + 7) / 8;
4489
4490 uint64_t mask_bits = 0;
4491 CopyBytes<kNumBytes>(bits, &mask_bits);
4492
4493 if (N < 8) {
4494 mask_bits &= (1ull << N) - 1;
4495 }
4496
4497 return Mask256<T>::FromBits(mask_bits);
4498}
4499
4500// ------------------------------ StoreMaskBits
4501
4502// `p` points to at least 8 writable bytes.
4503template <typename T>
4504HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
4505 uint8_t* bits) {
4506 constexpr size_t N = 32 / sizeof(T);
4507 constexpr size_t kNumBytes = (N + 7) / 8;
4508
4509 CopyBytes<kNumBytes>(&mask.raw, bits);
4510
4511 // Non-full byte, need to clear the undefined upper bits.
4512 if (N < 8) {
4513 const int mask_bits = static_cast<int>((1ull << N) - 1);
4514 bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
4515 }
4516 return kNumBytes;
4517}
4518
4519// ------------------------------ Mask testing
4520
4521template <typename T>
4522HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4523 return PopCount(static_cast<uint64_t>(mask.raw));
4524}
4525
4526template <typename T>
4527HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
4528 const Mask256<T> mask) {
4529 return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
4530}
4531
4532template <typename T>
4533HWY_API intptr_t FindFirstTrue(const Full256<T> d, const Mask256<T> mask) {
4534 return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
4535 : intptr_t{-1};
4536}
4537
4538// Beware: the suffix indicates the number of mask bits, not lane size!
4539
4540namespace detail {
4541
4542template <typename T>
4544#if HWY_COMPILER_HAS_MASK_INTRINSICS
4545 return _kortestz_mask32_u8(mask.raw, mask.raw);
4546#else
4547 return mask.raw == 0;
4548#endif
4549}
4550template <typename T>
4552#if HWY_COMPILER_HAS_MASK_INTRINSICS
4553 return _kortestz_mask16_u8(mask.raw, mask.raw);
4554#else
4555 return mask.raw == 0;
4556#endif
4557}
4558template <typename T>
4560#if HWY_COMPILER_HAS_MASK_INTRINSICS
4561 return _kortestz_mask8_u8(mask.raw, mask.raw);
4562#else
4563 return mask.raw == 0;
4564#endif
4565}
4566template <typename T>
4568 return (uint64_t{mask.raw} & 0xF) == 0;
4569}
4570
4571} // namespace detail
4572
4573template <typename T>
4574HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
4575 return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
4576}
4577
4578namespace detail {
4579
4580template <typename T>
4582#if HWY_COMPILER_HAS_MASK_INTRINSICS
4583 return _kortestc_mask32_u8(mask.raw, mask.raw);
4584#else
4585 return mask.raw == 0xFFFFFFFFu;
4586#endif
4587}
4588template <typename T>
4590#if HWY_COMPILER_HAS_MASK_INTRINSICS
4591 return _kortestc_mask16_u8(mask.raw, mask.raw);
4592#else
4593 return mask.raw == 0xFFFFu;
4594#endif
4595}
4596template <typename T>
4598#if HWY_COMPILER_HAS_MASK_INTRINSICS
4599 return _kortestc_mask8_u8(mask.raw, mask.raw);
4600#else
4601 return mask.raw == 0xFFu;
4602#endif
4603}
4604template <typename T>
4606 // Cannot use _kortestc because we have less than 8 mask bits.
4607 return mask.raw == 0xFu;
4608}
4609
4610} // namespace detail
4611
4612template <typename T>
4613HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4614 return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
4615}
4616
4617// ------------------------------ Compress
4618
4619// 16-bit is defined in x86_512 so we can use 512-bit vectors.
4620
4621template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4622HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
4623 return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
4624}
4625
4626HWY_API Vec256<float> Compress(Vec256<float> v, Mask256<float> mask) {
4627 return Vec256<float>{_mm256_maskz_compress_ps(mask.raw, v.raw)};
4628}
4629
4630template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4631HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
4632 // See CompressIsPartition.
4633 alignas(16) constexpr uint64_t packed_array[16] = {
4634 // PrintCompress64x4NibbleTables
4635 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
4636 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
4637 0x00001032, 0x00001320, 0x00000321, 0x00003210};
4638
4639 // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
4640 // _mm256_permutexvar_epi64 will ignore the upper bits.
4641 const Full256<T> d;
4642 const RebindToUnsigned<decltype(d)> du64;
4643 const auto packed = Set(du64, packed_array[mask.raw]);
4644 alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4645 const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
4646 return TableLookupLanes(v, indices);
4647}
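// --- Editorial sketch (not part of x86_256-inl.h): how the packed_array
// encoding above works. Each 4-bit nibble of an entry holds one 64-bit lane
// index; the per-lane right shift by {0, 4, 8, 12} moves nibble i into the low
// bits of lane i, and the permute then reads only the low bits it needs.
// Helper name is hypothetical.
#include <cstdint>
static inline void DecodeNibbleIndices(uint64_t packed, int indices[4]) {
  for (int i = 0; i < 4; ++i) {
    indices[i] = static_cast<int>((packed >> (4 * i)) & 0xF);
  }
}
// Example: mask bits 0b0110 keep lanes 1 and 2; packed_array[6] = 0x00003021
// decodes to {1, 2, 0, 3}, i.e. the kept lanes first (CompressIsPartition).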
4648
4649// ------------------------------ CompressNot (Compress)
4650
4651// Implemented in x86_512 for lane size != 8.
4652
4653template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4654HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
4655 // See CompressIsPartition.
4656 alignas(16) constexpr uint64_t packed_array[16] = {
4657 // PrintCompressNot64x4NibbleTables
4658 0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
4659 0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
4660 0x00003210, 0x00003201, 0x00003210, 0x00003210};
4661
4662 // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
4663 // _mm256_permutexvar_epi64 will ignore the upper bits.
4664 const Full256<T> d;
4665 const RebindToUnsigned<decltype(d)> du64;
4666 const auto packed = Set(du64, packed_array[mask.raw]);
4667 alignas(32) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4668 const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
4669 return TableLookupLanes(v, indices);
4670}
4671
4672// ------------------------------ CompressStore
4673
4674// 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512.
4675
4676template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4677HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
4678 T* HWY_RESTRICT unaligned) {
4679 _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
4680 const size_t count = PopCount(uint64_t{mask.raw});
4681 detail::MaybeUnpoison(unaligned, count);
4682 return count;
4683}
4684
4685template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4686HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
4687 T* HWY_RESTRICT unaligned) {
4688 _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
4689 const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
4690 detail::MaybeUnpoison(unaligned, count);
4691 return count;
4692}
4693
4694HWY_API size_t CompressStore(Vec256<float> v, Mask256<float> mask,
4695 Full256<float> /* tag */,
4696 float* HWY_RESTRICT unaligned) {
4697 _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
4698 const size_t count = PopCount(uint64_t{mask.raw});
4699 detail::MaybeUnpoison(unaligned, count);
4700 return count;
4701}
4702
4703HWY_API size_t CompressStore(Vec256<double> v, Mask256<double> mask,
4704 Full256<double> /* tag */,
4705 double* HWY_RESTRICT unaligned) {
4706 _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
4707 const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
4708 detail::MaybeUnpoison(unaligned, count);
4709 return count;
4710}
4711
4712// ------------------------------ CompressBlendedStore (CompressStore)
4713
4714template <typename T>
4715HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4716 T* HWY_RESTRICT unaligned) {
4717 if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) > 2) {
4718 // Native (32 or 64-bit) AVX-512 instruction already does the blending at no
4719 // extra cost (latency 11, rthroughput 2 - same as compress plus store).
4720 return CompressStore(v, m, d, unaligned);
4721 } else {
4722 const size_t count = CountTrue(d, m);
4723 BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
4724 detail::MaybeUnpoison(unaligned, count);
4725 return count;
4726 }
4727}
4728
4729// ------------------------------ CompressBitsStore (LoadMaskBits)
4730
4731template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
4732HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
4733 Full256<T> d, T* HWY_RESTRICT unaligned) {
4734 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4735}
4736
4737#else // AVX2
4738
4739// ------------------------------ LoadMaskBits (TestBit)
4740
4741namespace detail {
4742
4743// 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_LE128 there.
4744template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4745HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4746 const RebindToUnsigned<decltype(d)> du;
4747 const Repartition<uint32_t, decltype(d)> du32;
4748 const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
4749
4750 // Replicate bytes 8x such that each byte contains the bit that governs it.
4751 const Repartition<uint64_t, decltype(d)> du64;
4752 alignas(32) constexpr uint64_t kRep8[4] = {
4753 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
4754 0x0303030303030303ull};
4755 const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
4756
4757 alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4758 1, 2, 4, 8, 16, 32, 64, 128};
4759 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4760}
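// --- Editorial sketch (not part of x86_256-inl.h): scalar model of the bit
// replication above. Lane i of the result is true iff bit i of mask_bits is
// set: kRep8 copies byte i/8 into lane i, and TestBit checks bit i%8 within
// that byte. Helper name is hypothetical; i must be < 32.
#include <cstddef>
#include <cstdint>
static inline bool MaskLaneModel(uint64_t mask_bits, size_t i) {
  const uint8_t byte = static_cast<uint8_t>(mask_bits >> (8 * (i / 8)));
  return (byte & (1u << (i % 8))) != 0;
}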
4761
4762template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4763HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4764 const RebindToUnsigned<decltype(d)> du;
4765 alignas(32) constexpr uint16_t kBit[16] = {
4766 1, 2, 4, 8, 16, 32, 64, 128,
4767 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
4768 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4769 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4770}
4771
4772template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4773HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4774 const RebindToUnsigned<decltype(d)> du;
4775 alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4776 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4777 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4778}
4779
4780template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4781HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4782 const RebindToUnsigned<decltype(d)> du;
4783 alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8};
4784 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4785}
4786
4787} // namespace detail
4788
4789// `p` points to at least 8 readable bytes, not all of which need be valid.
4790template <typename T>
4791HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
4792 const uint8_t* HWY_RESTRICT bits) {
4793 constexpr size_t N = 32 / sizeof(T);
4794 constexpr size_t kNumBytes = (N + 7) / 8;
4795
4796 uint64_t mask_bits = 0;
4797 CopyBytes<kNumBytes>(bits, &mask_bits);
4798
4799 if (N < 8) {
4800 mask_bits &= (1ull << N) - 1;
4801 }
4802
4803 return detail::LoadMaskBits256(d, mask_bits);
4804}
4805
4806// ------------------------------ StoreMaskBits
4807
4808namespace detail {
4809
4810template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4811HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4812 const Full256<T> d;
4813 const Full256<uint8_t> d8;
4814 const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
4815 // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
4816 return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
4817}
4818
4819template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4820HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4821#if HWY_ARCH_X86_64
4822 const Full256<T> d;
4823 const Full256<uint8_t> d8;
4824 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4825 const uint64_t sign_bits8 = BitsFromMask(mask8);
4826 // Skip the bits from the lower byte of each u16 (better not to use the
4827 // same packs_epi16 as SSE4, because that requires an extra swizzle here).
4828 return _pext_u64(sign_bits8, 0xAAAAAAAAull);
4829#else
4830 // Slow workaround for 32-bit builds, which lack _pext_u64.
4831 // Remove useless lower half of each u16 while preserving the sign bit.
4832 // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes.
4833 const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
4834 // Move odd qwords (value zero) to top so they don't affect the mask value.
4835 const auto compressed =
4836 _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
4837 return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
4838#endif // HWY_ARCH_X86_64
4839}
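// --- Editorial sketch (not part of x86_256-inl.h): portable model of
// _pext_u64 as used above. It gathers the bits of x selected by mask into
// contiguous low bits; with mask 0xAAAAAAAA this keeps every second bit, i.e.
// one sign bit per 16-bit lane. Helper name is hypothetical.
#include <cstdint>
static inline uint64_t PextModel(uint64_t x, uint64_t mask) {
  uint64_t result = 0;
  uint64_t out_bit = 1;
  for (; mask != 0; mask &= mask - 1, out_bit <<= 1) {
    const uint64_t lowest = mask & (~mask + 1);  // lowest set bit of mask
    if (x & lowest) result |= out_bit;
  }
  return result;
}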
4840
4841template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4842HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4843 const Full256<T> d;
4844 const Full256<float> df;
4845 const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
4846 return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
4847}
4848
4849template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4850HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4851 const Full256<T> d;
4852 const Full256<double> df;
4853 const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
4854 return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
4855}
4856
4857} // namespace detail
4858
4859// `p` points to at least 8 writable bytes.
4860template <typename T>
4861HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
4862 uint8_t* bits) {
4863 constexpr size_t N = 32 / sizeof(T);
4864 constexpr size_t kNumBytes = (N + 7) / 8;
4865
4866 const uint64_t mask_bits = detail::BitsFromMask(mask);
4867 CopyBytes<kNumBytes>(&mask_bits, bits);
4868 return kNumBytes;
4869}
4870
4871// ------------------------------ Mask testing
4872
4873// Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask
4874// lane is 0 or ~0.
4875template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4876HWY_API bool AllFalse(const Full256<T> d, const Mask256<T> mask) {
4877 const Repartition<uint8_t, decltype(d)> d8;
4878 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4879 return detail::BitsFromMask(mask8) == 0;
4880}
4881
4882template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4883HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
4884 // Cheaper than PTEST, which is 2 uop / 3L.
4885 return detail::BitsFromMask(mask) == 0;
4886}
4887
4888template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4889HWY_API bool AllTrue(const Full256<T> d, const Mask256<T> mask) {
4890 const Repartition<uint8_t, decltype(d)> d8;
4891 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4892 return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
4893}
4894template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4895HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4896 constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;
4897 return detail::BitsFromMask(mask) == kAllBits;
4898}
4899
4900template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4901HWY_API size_t CountTrue(const Full256<T> d, const Mask256<T> mask) {
4902 const Repartition<uint8_t, decltype(d)> d8;
4903 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4904 return PopCount(detail::BitsFromMask(mask8)) >> 1;
4905}
4906template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4907HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4908 return PopCount(detail::BitsFromMask(mask));
4909}
4910
4911template <typename T>
4912HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
4913 const Mask256<T> mask) {
4914 const uint64_t mask_bits = detail::BitsFromMask(mask);
4915 return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
4916}
4917
4918template <typename T>
4919HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
4920 const Mask256<T> mask) {
4921 const uint64_t mask_bits = detail::BitsFromMask(mask);
4922 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
4923}
4924
4925// ------------------------------ Compress, CompressBits
4926
4927namespace detail {
4928
4929template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4930HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
4931 const RebindToUnsigned<decltype(d)> d32;
4932 // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
4933 // of SetTableIndices would require 8 KiB, a large part of L1D. The other
4934 // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
4935 // and unavailable in 32-bit builds. We instead compress each index into 4
4936 // bits, for a total of 1 KiB.
4937 alignas(16) constexpr uint32_t packed_array[256] = {
4938 // PrintCompress32x8Tables
4939 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
4940 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
4941 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
4942 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
4943 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
4944 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
4945 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
4946 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
4947 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
4948 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
4949 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
4950 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
4951 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
4952 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
4953 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
4954 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
4955 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
4956 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
4957 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
4958 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
4959 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
4960 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
4961 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
4962 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
4963 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
4964 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
4965 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
4966 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
4967 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
4968 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
4969 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
4970 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
4971 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
4972 0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
4973 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
4974 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
4975 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
4976 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
4977 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
4978 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
4979 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
4980 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
4981 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
4982
4983 // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
4984 // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
4985 // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
4986 // latency, it may be faster to use LoadDup128 and PSHUFB.
4987 const auto packed = Set(d32, packed_array[mask_bits]);
4988 alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4989 return packed >> Load(d32, shifts);
4990}
4991
4992template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4993HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
4994 const Repartition<uint32_t, decltype(d)> d32;
4995
4996 // For 64-bit, we still need 32-bit indices because there is no 64-bit
4997 // permutevar, but there are only 4 lanes, so we can afford to skip the
4998 // unpacking and load the entire index vector directly.
4999 alignas(32) constexpr uint32_t u32_indices[128] = {
5000 // PrintCompress64x4PairTables
5001 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
5002 10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
5003 12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
5004 10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
5005 14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
5006 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
5007 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
5008 10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
5009 return Load(d32, u32_indices + 8 * mask_bits);
5010}
5011
5012template <typename T, HWY_IF_LANE_SIZE(T, 4)>
5013HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
5014 uint64_t mask_bits) {
5015 const RebindToUnsigned<decltype(d)> d32;
5016 // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
5017 // of SetTableIndices would require 8 KiB, a large part of L1D. The other
5018 // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
5019 // and unavailable in 32-bit builds. We instead compress each index into 4
5020 // bits, for a total of 1 KiB.
5021 alignas(16) constexpr uint32_t packed_array[256] = {
5022 // PrintCompressNot32x8Tables
5023 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
5024 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
5025 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
5026 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
5027 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
5028 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
5029 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
5030 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
5031 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
5032 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
5033 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
5034 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
5035 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
5036 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
5037 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
5038 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
5039 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
5040 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
5041 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
5042 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
5043 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
5044 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
5045 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
5046 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
5047 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
5048 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
5049 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
5050 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
5051 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
5052 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
5053 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
5054 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
5055 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
5056 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
5057 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
5058 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
5059 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
5060 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
5061 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
5062 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
5063 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
5064 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
5065 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
5066
5067 // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
5068 // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
5069 // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
5070 // latency, it may be faster to use LoadDup128 and PSHUFB.
5071 const auto packed = Set(d32, packed_array[mask_bits]);
5072 alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
5073 return packed >> Load(d32, shifts);
5074}
5075
5076template <typename T, HWY_IF_LANE_SIZE(T, 8)>
5077HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
5078 uint64_t mask_bits) {
5079 const Repartition<uint32_t, decltype(d)> d32;
5080
5081 // For 64-bit, we still need 32-bit indices because there is no 64-bit
5082 // permutevar, but there are only 4 lanes, so we can afford to skip the
5083 // unpacking and load the entire index vector directly.
5084 alignas(32) constexpr uint32_t u32_indices[128] = {
5085 // PrintCompressNot64x4PairTables
5086 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
5087 8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
5088 8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
5089 8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
5090 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
5091 8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
5092 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
5093 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
5094 return Load(d32, u32_indices + 8 * mask_bits);
5095}
5096template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
5097HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
5098 const Full256<T> d;
5099 const Repartition<uint32_t, decltype(d)> du32;
5100
5101 HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
5102 // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
5103 // no instruction for 4x64).
5104 const Indices256<uint32_t> indices{IndicesFromBits(d, mask_bits).raw};
5105 return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
5106}
5107
5108// LUTs are infeasible for 2^16 possible masks, so splice together two
5109// half-vector Compress.
5110template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5111HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
5112 const Full256<T> d;
5113 const RebindToUnsigned<decltype(d)> du;
5114 const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
5115 const Half<decltype(du)> duh;
5116 const auto half0 = LowerHalf(duh, vu16);
5117 const auto half1 = UpperHalf(duh, vu16);
5118
5119 const uint64_t mask_bits0 = mask_bits & 0xFF;
5120 const uint64_t mask_bits1 = mask_bits >> 8;
5121 const auto compressed0 = detail::CompressBits(half0, mask_bits0);
5122 const auto compressed1 = detail::CompressBits(half1, mask_bits1);
5123
5124 alignas(32) uint16_t all_true[16] = {};
5125 // Store mask=true lanes, left to right.
5126 const size_t num_true0 = PopCount(mask_bits0);
5127 Store(compressed0, duh, all_true);
5128 StoreU(compressed1, duh, all_true + num_true0);
5129
5130 if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
5131 // Store mask=false lanes, right to left. The second vector fills the upper
5132 // half with right-aligned false lanes. The first vector is shifted
5133 // rightwards to overwrite the true lanes of the second.
5134 alignas(32) uint16_t all_false[16] = {};
5135 const size_t num_true1 = PopCount(mask_bits1);
5136 Store(compressed1, duh, all_false + 8);
5137 StoreU(compressed0, duh, all_false + num_true1);
5138
5139 const auto mask = FirstN(du, num_true0 + num_true1);
5140 return BitCast(d,
5141 IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
5142 } else {
5143 // Only care about the mask=true lanes.
5144 return BitCast(d, Load(du, all_true));
5145 }
5146}
5147
5148template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)> // 4 or 8 bytes
5149HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
5150 const Full256<T> d;
5151 const Repartition<uint32_t, decltype(d)> du32;
5152
5153 HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
5154 // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
5155 // no instruction for 4x64).
5156 const Indices256<uint32_t> indices{IndicesFromNotBits(d, mask_bits).raw};
5157 return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
5158}
5159
5160// LUTs are infeasible for 2^16 possible masks, so splice together two
5161// half-vector Compress.
5162template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5163HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
5164 // Compress ensures only the lower 16 bits are set, so flip those.
5165 return Compress(v, mask_bits ^ 0xFFFF);
5166}
5167
5168} // namespace detail
5169
5170template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5171HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
5173}
5174
5175template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5176HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
5177 return detail::CompressNot(v, detail::BitsFromMask(m));
5178}
5179
5180HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
5181 Mask256<uint64_t> mask) {
5182 return CompressNot(v, mask);
5183}
5184
5185template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5186HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
5187 constexpr size_t N = 32 / sizeof(T);
5188 constexpr size_t kNumBytes = (N + 7) / 8;
5189
5190 uint64_t mask_bits = 0;
5191 CopyBytes<kNumBytes>(bits, &mask_bits);
5192
5193 if (N < 8) {
5194 mask_bits &= (1ull << N) - 1;
5195 }
5196
5197 return detail::Compress(v, mask_bits);
5198}
5199
5200// ------------------------------ CompressStore, CompressBitsStore
5201
5202template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5203HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
5204 T* HWY_RESTRICT unaligned) {
5205 const uint64_t mask_bits = detail::BitsFromMask(m);
5206 const size_t count = PopCount(mask_bits);
5207 StoreU(detail::Compress(v, mask_bits), d, unaligned);
5208 detail::MaybeUnpoison(unaligned, count);
5209 return count;
5210}
5211
5212template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)> // 4 or 8 bytes
5213HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
5214 T* HWY_RESTRICT unaligned) {
5215 const uint64_t mask_bits = detail::BitsFromMask(m);
5216 const size_t count = PopCount(mask_bits);
5217
5218 const Repartition<uint32_t, decltype(d)> du32;
5219 HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
5220 // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
5221 // no instruction for 4x64). Nibble MSB encodes FirstN.
5222 const Vec256<uint32_t> idx_and_mask = detail::IndicesFromBits(d, mask_bits);
5223 // Shift nibble MSB into MSB
5224 const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_and_mask));
5225 // First cast to unsigned (RebindMask cannot change lane size)
5226 const Mask256<MakeUnsigned<T>> mask_u{mask32.raw};
5227 const Mask256<T> mask = RebindMask(d, mask_u);
5228 const Vec256<T> compressed =
5229 BitCast(d, TableLookupLanes(BitCast(du32, v),
5230 Indices256<uint32_t>{idx_and_mask.raw}));
5231
5232 BlendedStore(compressed, mask, d, unaligned);
5233 detail::MaybeUnpoison(unaligned, count);
5234 return count;
5235}
5236
5237template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5238HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
5239 T* HWY_RESTRICT unaligned) {
5240 const uint64_t mask_bits = detail::BitsFromMask(m);
5241 const size_t count = PopCount(mask_bits);
5242 const Vec256<T> compressed = detail::Compress(v, mask_bits);
5243
5244#if HWY_MEM_OPS_MIGHT_FAULT // true if HWY_IS_MSAN
5245 // BlendedStore tests mask for each lane, but we know that the mask is
5246 // FirstN, so we can just copy.
5247 alignas(32) T buf[16];
5248 Store(compressed, d, buf);
5249 memcpy(unaligned, buf, count * sizeof(T));
5250#else
5251 BlendedStore(compressed, FirstN(d, count), d, unaligned);
5252#endif
5253 return count;
5254}
5255
5256template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
5257HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
5258 Full256<T> d, T* HWY_RESTRICT unaligned) {
5259 constexpr size_t N = 32 / sizeof(T);
5260 constexpr size_t kNumBytes = (N + 7) / 8;
5261
5262 uint64_t mask_bits = 0;
5263 CopyBytes<kNumBytes>(bits, &mask_bits);
5264
5265 if (N < 8) {
5266 mask_bits &= (1ull << N) - 1;
5267 }
5268 const size_t count = PopCount(mask_bits);
5269
5270 StoreU(detail::Compress(v, mask_bits), d, unaligned);
5271 detail::MaybeUnpoison(unaligned, count);
5272 return count;
5273}
5274
5275#endif // HWY_TARGET <= HWY_AVX3
5276
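Example (editor's illustration, not part of this file): a minimal sketch of using the CompressStore overload above to keep only the positive elements of an int32_t array. It assumes the translation unit is compiled with AVX2 enabled (otherwise Highway's usual HWY_ATTR/foreach_target dispatch applies); the function name CopyPositive and the omitted tail handling are illustrative.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Writes the positive elements of in[0, num) to out; returns how many were
// written. Per the overload above, CompressStore stores a full vector, so out
// must have room for Lanes(d) elements beyond the compressed results (unlike
// CompressBlendedStore).
size_t CopyPositive(const int32_t* HWY_RESTRICT in, size_t num,
                    int32_t* HWY_RESTRICT out) {
  const hn::Full256<int32_t> d;
  const size_t N = hn::Lanes(d);
  size_t written = 0;
  for (size_t i = 0; i + N <= num; i += N) {
    const auto v = hn::LoadU(d, in + i);
    written += hn::CompressStore(v, v > hn::Zero(d), d, out + written);
  }
  // The remaining num % N elements would be handled by a scalar loop here.
  return written;
}
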
5277// ------------------------------ LoadInterleaved3/4
5278
5279// Implemented in generic_ops; we just overload LoadTransposedBlocks3/4.
5280
5281namespace detail {
5282
5283// Input:
5284// 1 0 (<- first block of unaligned)
5285// 3 2
5286// 5 4
5287// Output:
5288// 3 0
5289// 4 1
5290// 5 2
5291template <typename T>
5292HWY_API void LoadTransposedBlocks3(Full256<T> d,
5293 const T* HWY_RESTRICT unaligned,
5294 Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
5295 constexpr size_t N = 32 / sizeof(T);
5296 const Vec256<T> v10 = LoadU(d, unaligned + 0 * N); // 1 0
5297 const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
5298 const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
5299
5300 A = ConcatUpperLower(d, v32, v10);
5301 B = ConcatLowerUpper(d, v54, v10);
5302 C = ConcatUpperLower(d, v54, v32);
5303}
5304
5305// Input (128-bit blocks):
5306// 1 0 (first block of unaligned)
5307// 3 2
5308// 5 4
5309// 7 6
5310// Output:
5311// 4 0 (LSB of A)
5312// 5 1
5313// 6 2
5314// 7 3
5315template <typename T>
5316HWY_API void LoadTransposedBlocks4(Full256<T> d,
5317 const T* HWY_RESTRICT unaligned,
5318 Vec256<T>& A, Vec256<T>& B, Vec256<T>& C,
5319 Vec256<T>& D) {
5320 constexpr size_t N = 32 / sizeof(T);
5321 const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);
5322 const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
5323 const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
5324 const Vec256<T> v76 = LoadU(d, unaligned + 3 * N);
5325
5326 A = ConcatLowerLower(d, v54, v10);
5327 B = ConcatUpperUpper(d, v54, v10);
5328 C = ConcatLowerLower(d, v76, v32);
5329 D = ConcatUpperUpper(d, v76, v32);
5330}
5331
5332} // namespace detail
5333
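The two overloads above only rearrange 128-bit blocks; the per-lane interleaving is done by LoadInterleaved3/4 in generic_ops-inl.h. A hedged usage sketch (SplitRGB and the exact-multiple assumption are illustrative; same compilation assumptions as the earlier sketch):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Deinterleaves packed RGB bytes into three planes. Assumes num_pixels is a
// multiple of Lanes(d); a real implementation would handle the remainder.
void SplitRGB(const uint8_t* HWY_RESTRICT rgb, size_t num_pixels,
              uint8_t* HWY_RESTRICT r, uint8_t* HWY_RESTRICT g,
              uint8_t* HWY_RESTRICT b) {
  const hn::Full256<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    hn::Vec256<uint8_t> vr, vg, vb;
    hn::LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
    hn::StoreU(vr, d, r + i);
    hn::StoreU(vg, d, g + i);
    hn::StoreU(vb, d, b + i);
  }
}
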
5334// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
5335
5336// Implemented in generic_ops; we just overload StoreTransposedBlocks2/3/4.
5337
5338namespace detail {
5339
5340// Input (128-bit blocks):
5341// 2 0 (LSB of i)
5342// 3 1
5343// Output:
5344// 1 0
5345// 3 2
5346template <typename T>
5347HWY_API void StoreTransposedBlocks2(const Vec256<T> i, const Vec256<T> j,
5348 const Full256<T> d,
5349 T* HWY_RESTRICT unaligned) {
5350 constexpr size_t N = 32 / sizeof(T);
5351 const auto out0 = ConcatLowerLower(d, j, i);
5352 const auto out1 = ConcatUpperUpper(d, j, i);
5353 StoreU(out0, d, unaligned + 0 * N);
5354 StoreU(out1, d, unaligned + 1 * N);
5355}
5356
5357// Input (128-bit blocks):
5358// 3 0 (LSB of i)
5359// 4 1
5360// 5 2
5361// Output:
5362// 1 0
5363// 3 2
5364// 5 4
5365template <typename T>
5366HWY_API void StoreTransposedBlocks3(const Vec256<T> i, const Vec256<T> j,
5367 const Vec256<T> k, Full256<T> d,
5368 T* HWY_RESTRICT unaligned) {
5369 constexpr size_t N = 32 / sizeof(T);
5370 const auto out0 = ConcatLowerLower(d, j, i);
5371 const auto out1 = ConcatUpperLower(d, i, k);
5372 const auto out2 = ConcatUpperUpper(d, k, j);
5373 StoreU(out0, d, unaligned + 0 * N);
5374 StoreU(out1, d, unaligned + 1 * N);
5375 StoreU(out2, d, unaligned + 2 * N);
5376}
5377
5378// Input (128-bit blocks):
5379// 4 0 (LSB of i)
5380// 5 1
5381// 6 2
5382// 7 3
5383// Output:
5384// 1 0
5385// 3 2
5386// 5 4
5387// 7 6
5388template <typename T>
5389HWY_API void StoreTransposedBlocks4(const Vec256<T> i, const Vec256<T> j,
5390 const Vec256<T> k, const Vec256<T> l,
5391 Full256<T> d, T* HWY_RESTRICT unaligned) {
5392 constexpr size_t N = 32 / sizeof(T);
5393 // Write lower halves, then upper.
5394 const auto out0 = ConcatLowerLower(d, j, i);
5395 const auto out1 = ConcatLowerLower(d, l, k);
5396 StoreU(out0, d, unaligned + 0 * N);
5397 StoreU(out1, d, unaligned + 1 * N);
5398 const auto out2 = ConcatUpperUpper(d, j, i);
5399 const auto out3 = ConcatUpperUpper(d, l, k);
5400 StoreU(out2, d, unaligned + 2 * N);
5401 StoreU(out3, d, unaligned + 3 * N);
5402}
5403
5404} // namespace detail
5405
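As with the loads, these store overloads handle only the block transposition; StoreInterleaved2/3/4 in generic_ops-inl.h do the lane-level interleaving. A sketch of the inverse of the previous example (PackRGB is illustrative):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Packs three byte planes into interleaved RGB. Assumes num_pixels is a
// multiple of Lanes(d).
void PackRGB(const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
             const uint8_t* HWY_RESTRICT b, size_t num_pixels,
             uint8_t* HWY_RESTRICT rgb) {
  const hn::Full256<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    const auto vr = hn::LoadU(d, r + i);
    const auto vg = hn::LoadU(d, g + i);
    const auto vb = hn::LoadU(d, b + i);
    hn::StoreInterleaved3(vr, vg, vb, d, rgb + 3 * i);
  }
}
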
5406// ------------------------------ Reductions
5407
5408namespace detail {
5409
5410// Returns sum{lane[i]} in each lane. "v3210" is a replicated 128-bit block.
5411// Same logic as x86/128.h, but with Vec256 arguments.
5412template <typename T>
5413HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
5414 const Vec256<T> v3210) {
5415 const auto v1032 = Shuffle1032(v3210);
5416 const auto v31_20_31_20 = v3210 + v1032;
5417 const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5418 return v20_31_20_31 + v31_20_31_20;
5419}
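
A scalar model of the shuffle-based reduction above (editor's illustration, not part of Highway): with one 128-bit block holding lanes {a, b, c, d} (lane 0 first), Shuffle1032 swaps the two 64-bit halves and Shuffle0321 rotates by one lane, so two shuffle+add rounds leave a+b+c+d in every lane.

#include <array>

// Scalar model of SumOfLanes(SizeTag<4>, v3210) for a single block.
std::array<int, 4> SumAcross4(const std::array<int, 4>& v) {
  std::array<int, 4> v1032{}, v31_20{}, v20_31{}, out{};
  for (int i = 0; i < 4; ++i) v1032[i] = v[i ^ 2];             // Shuffle1032
  for (int i = 0; i < 4; ++i) v31_20[i] = v[i] + v1032[i];     // partial sums
  for (int i = 0; i < 4; ++i) v20_31[i] = v31_20[(i + 1) % 4]; // Shuffle0321
  for (int i = 0; i < 4; ++i) out[i] = v20_31[i] + v31_20[i];  // all = a+b+c+d
  return out;
}
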
5420template <typename T>
5421HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
5422 const Vec256<T> v3210) {
5423 const auto v1032 = Shuffle1032(v3210);
5424 const auto v31_20_31_20 = Min(v3210, v1032);
5425 const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5426 return Min(v20_31_20_31, v31_20_31_20);
5427}
5428template <typename T>
5429HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
5430 const Vec256<T> v3210) {
5431 const auto v1032 = Shuffle1032(v3210);
5432 const auto v31_20_31_20 = Max(v3210, v1032);
5433 const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5434 return Max(v20_31_20_31, v31_20_31_20);
5435}
5436
5437template <typename T>
5438HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
5439 const Vec256<T> v10) {
5440 const auto v01 = Shuffle01(v10);
5441 return v10 + v01;
5442}
5443template <typename T>
5444HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
5445 const Vec256<T> v10) {
5446 const auto v01 = Shuffle01(v10);
5447 return Min(v10, v01);
5448}
5449template <typename T>
5450HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
5451 const Vec256<T> v10) {
5452 const auto v01 = Shuffle01(v10);
5453 return Max(v10, v01);
5454}
5455
5456HWY_API Vec256<uint16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
5457 Vec256<uint16_t> v) {
5458 const Full256<uint16_t> d;
5459 const RepartitionToWide<decltype(d)> d32;
5460 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5461 const auto odd = ShiftRight<16>(BitCast(d32, v));
5462 const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
5463 // Also broadcast into odd lanes.
5464 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
5465}
5466HWY_API Vec256<int16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
5467 Vec256<int16_t> v) {
5468 const Full256<int16_t> d;
5469 const RepartitionToWide<decltype(d)> d32;
5470 // Sign-extend
5471 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5472 const auto odd = ShiftRight<16>(BitCast(d32, v));
5473 const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
5474 // Also broadcast into odd lanes.
5475 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
5476}
5477
5478HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
5479 Vec256<uint16_t> v) {
5480 const Full256<uint16_t> d;
5481 const RepartitionToWide<decltype(d)> d32;
5482 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5483 const auto odd = ShiftRight<16>(BitCast(d32, v));
5484 const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
5485 // Also broadcast into odd lanes.
5486 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
5487}
5488HWY_API Vec256<int16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
5489 Vec256<int16_t> v) {
5490 const Full256<int16_t> d;
5491 const RepartitionToWide<decltype(d)> d32;
5492 // Sign-extend
5493 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5494 const auto odd = ShiftRight<16>(BitCast(d32, v));
5495 const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
5496 // Also broadcast into odd lanes.
5497 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
5498}
5499
5500HWY_API Vec256<uint16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
5501 Vec256<uint16_t> v) {
5502 const Full256<uint16_t> d;
5503 const RepartitionToWide<decltype(d)> d32;
5504 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5505 const auto odd = ShiftRight<16>(BitCast(d32, v));
5506 const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
5507 // Also broadcast into odd lanes.
5508 return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
5509}
5510HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
5511 Vec256<int16_t> v) {
5512 const Full256<int16_t> d;
5513 const RepartitionToWide<decltype(d)> d32;
5514 // Sign-extend
5515 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5516 const auto odd = ShiftRight<16>(BitCast(d32, v));
5517 const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
5518 // Also broadcast into odd lanes.
5519 return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
5520}
5521
5522} // namespace detail
5523
5524// Supported for {uif}{32,64},{ui}16. Returns the broadcasted result.
5525template <typename T>
5526HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
5527 const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5528 return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
5529}
5530template <typename T>
5531HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
5532 const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5533 return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
5534}
5535template <typename T>
5536HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
5537 const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5538 return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
5539}
5540
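A usage sketch for these reduction wrappers (SumArray is illustrative; same compilation assumptions as the earlier sketches): SumOfLanes broadcasts the total into every lane, so GetLane extracts the scalar result.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns the sum of p[0, num).
float SumArray(const float* HWY_RESTRICT p, size_t num) {
  const hn::Full256<float> d;
  const size_t N = hn::Lanes(d);
  auto acc = hn::Zero(d);
  size_t i = 0;
  for (; i + N <= num; i += N) {
    acc = acc + hn::LoadU(d, p + i);
  }
  float total = hn::GetLane(hn::SumOfLanes(d, acc));
  for (; i < num; ++i) total += p[i];  // scalar tail
  return total;
}
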
5541// NOLINTNEXTLINE(google-readability-namespace-comments)
5542} // namespace HWY_NAMESPACE
5543} // namespace hwy
5544HWY_AFTER_NAMESPACE();
5545
5546// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
5547// the warning seems to be issued at the call site of intrinsics, i.e. our code.
5548HWY_DIAGNOSTICS(pop)