1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 512-bit AVX512 vectors and operations.
17// External include guard in highway.h - see comment there.
18
19// WARNING: most operations do not cross 128-bit block boundaries. In
20// particular, "Broadcast", pack and zip behavior may be surprising.
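// For example, with 32-bit lanes, Broadcast<1> replicates lane 1 of *each*
// 128-bit block, i.e. lane indices {1,1,1,1, 5,5,5,5, 9,9,9,9, 13,13,13,13},
// rather than sixteen copies of lane 1 (illustrative note).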
21
22// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
23#include "hwy/base.h"
24
25// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
26// https://github.com/google/highway/issues/710)
28#if HWY_COMPILER_GCC_ACTUAL
29HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
30HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
31#endif
32
33#include <immintrin.h> // AVX2+
34
35#if HWY_COMPILER_CLANGCL
36// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
37// including these headers when _MSC_VER is defined, like when using clang-cl.
38// Include these directly here.
39// clang-format off
40#include <smmintrin.h>
41
42#include <avxintrin.h>
43#include <avx2intrin.h>
44#include <f16cintrin.h>
45#include <fmaintrin.h>
46
47#include <avx512fintrin.h>
48#include <avx512vlintrin.h>
49#include <avx512bwintrin.h>
50#include <avx512dqintrin.h>
51#include <avx512vlbwintrin.h>
52#include <avx512vldqintrin.h>
53#include <avx512bitalgintrin.h>
54#include <avx512vlbitalgintrin.h>
55#include <avx512vpopcntdqintrin.h>
56#include <avx512vpopcntdqvlintrin.h>
57// clang-format on
58#endif // HWY_COMPILER_CLANGCL
59
60#include <stddef.h>
61#include <stdint.h>
62
63#if HWY_IS_MSAN
64#include <sanitizer/msan_interface.h>
65#endif
66
67// For half-width vectors. Already includes base.h and shared-inl.h.
68#include "hwy/ops/x86_256-inl.h"
69
70HWY_BEFORE_NAMESPACE();
71namespace hwy {
72namespace HWY_NAMESPACE {
73
74namespace detail {
75
76template <typename T>
77struct Raw512 {
78 using type = __m512i;
79};
80template <>
81struct Raw512<float> {
82 using type = __m512;
83};
84template <>
85struct Raw512<double> {
86 using type = __m512d;
87};
88
89// Template arg: sizeof(lane type)
90template <size_t size>
91struct RawMask512 {};
92template <>
93struct RawMask512<1> {
94 using type = __mmask64;
95};
96template <>
97struct RawMask512<2> {
98 using type = __mmask32;
99};
100template <>
101struct RawMask512<4> {
102 using type = __mmask16;
103};
104template <>
105struct RawMask512<8> {
106 using type = __mmask8;
107};
108
109} // namespace detail
110
111template <typename T>
112class Vec512 {
113 using Raw = typename detail::Raw512<T>::type;
114
115 public:
116 using PrivateT = T; // only for DFromV
117 static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromV
118
119 // Compound assignment. Only usable if there is a corresponding non-member
120 // binary operator overload. For example, only f32 and f64 support division.
121 HWY_INLINE Vec512& operator*=(const Vec512 other) {
122 return *this = (*this * other);
123 }
124 HWY_INLINE Vec512& operator/=(const Vec512 other) {
125 return *this = (*this / other);
126 }
127 HWY_INLINE Vec512& operator+=(const Vec512 other) {
128 return *this = (*this + other);
129 }
130 HWY_INLINE Vec512& operator-=(const Vec512 other) {
131 return *this = (*this - other);
132 }
133 HWY_INLINE Vec512& operator&=(const Vec512 other) {
134 return *this = (*this & other);
135 }
136 HWY_INLINE Vec512& operator|=(const Vec512 other) {
137 return *this = (*this | other);
138 }
139 HWY_INLINE Vec512& operator^=(const Vec512 other) {
140 return *this = (*this ^ other);
141 }
142
143 Raw raw;
144};
145
146// Mask register: one bit per lane.
147template <typename T>
148struct Mask512 {
149 using Raw = typename detail::RawMask512<sizeof(T)>::type;
150 Raw raw;
151};
152
153template <typename T>
154using Full512 = Simd<T, 64 / sizeof(T), 0>;
155
156// ------------------------------ BitCast
157
158namespace detail {
159
160HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; }
161HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); }
162HWY_INLINE __m512i BitCastToInteger(__m512d v) {
163 return _mm512_castpd_si512(v);
164}
165
166template <typename T>
167HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
168 return Vec512<uint8_t>{BitCastToInteger(v.raw)};
169}
170
171// Cannot rely on function overloading because return types differ.
172template <typename T>
173struct BitCastFromInteger512 {
174 HWY_INLINE __m512i operator()(__m512i v) { return v; }
175};
176template <>
177struct BitCastFromInteger512<float> {
178 HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); }
179};
180template <>
181struct BitCastFromInteger512<double> {
182 HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); }
183};
184
185template <typename T>
186HWY_INLINE Vec512<T> BitCastFromByte(Full512<T> /* tag */, Vec512<uint8_t> v) {
187 return Vec512<T>{BitCastFromInteger512<T>()(v.raw)};
188}
189
190} // namespace detail
191
192template <typename T, typename FromT>
193HWY_API Vec512<T> BitCast(Full512<T> d, Vec512<FromT> v) {
194 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
195}
196
197// ------------------------------ Set
198
199// Returns an all-zero vector.
200template <typename T>
201HWY_API Vec512<T> Zero(Full512<T> /* tag */) {
202 return Vec512<T>{_mm512_setzero_si512()};
203}
204HWY_API Vec512<float> Zero(Full512<float> /* tag */) {
205 return Vec512<float>{_mm512_setzero_ps()};
206}
207HWY_API Vec512<double> Zero(Full512<double> /* tag */) {
208 return Vec512<double>{_mm512_setzero_pd()};
209}
210
211// Returns a vector with all lanes set to "t".
212HWY_API Vec512<uint8_t> Set(Full512<uint8_t> /* tag */, const uint8_t t) {
213 return Vec512<uint8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
214}
215HWY_API Vec512<uint16_t> Set(Full512<uint16_t> /* tag */, const uint16_t t) {
216 return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
217}
218HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
219 return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
220}
221HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
222 return Vec512<uint64_t>{
223 _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
224}
225HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
226 return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
227}
228HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
229 return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
230}
231HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
232 return Vec512<int32_t>{_mm512_set1_epi32(t)};
233}
234HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
235 return Vec512<int64_t>{
236 _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
237}
238HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
239 return Vec512<float>{_mm512_set1_ps(t)};
240}
241HWY_API Vec512<double> Set(Full512<double> /* tag */, const double t) {
242 return Vec512<double>{_mm512_set1_pd(t)};
243}
244
245HWY_DIAGNOSTICS(push)
246HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
247
248// Returns a vector with uninitialized elements.
249template <typename T>
250HWY_API Vec512<T> Undefined(Full512<T> /* tag */) {
251 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
252 // generate an XOR instruction.
253 return Vec512<T>{_mm512_undefined_epi32()};
254}
255HWY_API Vec512<float> Undefined(Full512<float> /* tag */) {
256 return Vec512<float>{_mm512_undefined_ps()};
257}
258HWY_API Vec512<double> Undefined(Full512<double> /* tag */) {
259 return Vec512<double>{_mm512_undefined_pd()};
260}
261
262HWY_DIAGNOSTICS(pop)
263
264// ================================================== LOGICAL
265
266// ------------------------------ Not
267
268template <typename T>
269HWY_API Vec512<T> Not(const Vec512<T> v) {
270 using TU = MakeUnsigned<T>;
271 const __m512i vu = BitCast(Full512<TU>(), v).raw;
272 return BitCast(Full512<T>(),
273 Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
274}
275
276// ------------------------------ And
277
278template <typename T>
279HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
280 return Vec512<T>{_mm512_and_si512(a.raw, b.raw)};
281}
282
283HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
284 return Vec512<float>{_mm512_and_ps(a.raw, b.raw)};
285}
286HWY_API Vec512<double> And(const Vec512<double> a, const Vec512<double> b) {
287 return Vec512<double>{_mm512_and_pd(a.raw, b.raw)};
288}
289
290// ------------------------------ AndNot
291
292// Returns ~not_mask & mask.
293template <typename T>
294HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
295 return Vec512<T>{_mm512_andnot_si512(not_mask.raw, mask.raw)};
296}
297HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
298 const Vec512<float> mask) {
299 return Vec512<float>{_mm512_andnot_ps(not_mask.raw, mask.raw)};
300}
301HWY_API Vec512<double> AndNot(const Vec512<double> not_mask,
302 const Vec512<double> mask) {
303 return Vec512<double>{_mm512_andnot_pd(not_mask.raw, mask.raw)};
304}
305
306// ------------------------------ Or
307
308template <typename T>
309HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
310 return Vec512<T>{_mm512_or_si512(a.raw, b.raw)};
311}
312
313HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
314 return Vec512<float>{_mm512_or_ps(a.raw, b.raw)};
315}
316HWY_API Vec512<double> Or(const Vec512<double> a, const Vec512<double> b) {
317 return Vec512<double>{_mm512_or_pd(a.raw, b.raw)};
318}
319
320// ------------------------------ Xor
321
322template <typename T>
323HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
324 return Vec512<T>{_mm512_xor_si512(a.raw, b.raw)};
325}
326
327HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
328 return Vec512<float>{_mm512_xor_ps(a.raw, b.raw)};
329}
330HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
331 return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
332}
333
334// ------------------------------ Xor3
335template <typename T>
336HWY_API Vec512<T> Xor3(Vec512<T> x1, Vec512<T> x2, Vec512<T> x3) {
337 const Full512<T> d;
338 const RebindToUnsigned<decltype(d)> du;
339 using VU = VFromD<decltype(du)>;
340 const __m512i ret = _mm512_ternarylogic_epi64(
341 BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
342 return BitCast(d, VU{ret});
343}
344
345// ------------------------------ Or3
346template <typename T>
347HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
348 const Full512<T> d;
349 const RebindToUnsigned<decltype(d)> du;
350 using VU = VFromD<decltype(du)>;
351 const __m512i ret = _mm512_ternarylogic_epi64(
352 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
353 return BitCast(d, VU{ret});
354}
355
356// ------------------------------ OrAnd
357template <typename T>
358HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
359 const Full512<T> d;
360 const RebindToUnsigned<decltype(d)> du;
361 using VU = VFromD<decltype(du)>;
362 const __m512i ret = _mm512_ternarylogic_epi64(
363 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
364 return BitCast(d, VU{ret});
365}
366
367// ------------------------------ IfVecThenElse
368template <typename T>
369HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
370 const Full512<T> d;
371 const RebindToUnsigned<decltype(d)> du;
372 using VU = VFromD<decltype(du)>;
373 return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
374 BitCast(du, yes).raw,
375 BitCast(du, no).raw, 0xCA)});
376}
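// Worked example (illustrative, not from the original source): how the
// ternary-logic immediates above can be derived. Bit i of the immediate is
// the desired output for the input combination a=bit2(i), b=bit1(i),
// c=bit0(i), where a/b/c are the first/second/third vector arguments.
// Xor3: output is 1 for an odd number of set inputs, i.e. i = 1, 2, 4, 7:
static_assert(0x96 == ((1 << 1) | (1 << 2) | (1 << 4) | (1 << 7)), "Xor3");
// Or3: output is 1 unless all inputs are 0, so only bit 0 is clear:
static_assert(0xFE == 0xFF - 1, "Or3");
// OrAnd: o | (a1 & a2) is 1 for i = 3 and for every i >= 4:
static_assert(0xF8 == ((1 << 3) | (1 << 4) | (1 << 5) | (1 << 6) | (1 << 7)),
              "OrAnd");
// IfVecThenElse: mask ? yes : no is 1 for i = 1, 3, 6, 7:
static_assert(0xCA == ((1 << 1) | (1 << 3) | (1 << 6) | (1 << 7)),
              "IfVecThenElse");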
377
378// ------------------------------ Operator overloads (internal-only if float)
379
380template <typename T>
381HWY_API Vec512<T> operator&(const Vec512<T> a, const Vec512<T> b) {
382 return And(a, b);
383}
384
385template <typename T>
386HWY_API Vec512<T> operator|(const Vec512<T> a, const Vec512<T> b) {
387 return Or(a, b);
388}
389
390template <typename T>
391HWY_API Vec512<T> operator^(const Vec512<T> a, const Vec512<T> b) {
392 return Xor(a, b);
393}
394
395// ------------------------------ PopulationCount
396
397// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
398#if HWY_TARGET == HWY_AVX3_DL
399
400#ifdef HWY_NATIVE_POPCNT
401#undef HWY_NATIVE_POPCNT
402#else
403#define HWY_NATIVE_POPCNT
404#endif
405
406namespace detail {
407
408template <typename T>
409HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec512<T> v) {
410 return Vec512<T>{_mm512_popcnt_epi8(v.raw)};
411}
412template <typename T>
413HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec512<T> v) {
414 return Vec512<T>{_mm512_popcnt_epi16(v.raw)};
415}
416template <typename T>
417HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec512<T> v) {
418 return Vec512<T>{_mm512_popcnt_epi32(v.raw)};
419}
420template <typename T>
421HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec512<T> v) {
422 return Vec512<T>{_mm512_popcnt_epi64(v.raw)};
423}
424
425} // namespace detail
426
427template <typename T>
428HWY_API Vec512<T> PopulationCount(Vec512<T> v) {
429 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
430}
431
432#endif // HWY_TARGET == HWY_AVX3_DL
433
434// ================================================== SIGN
435
436// ------------------------------ CopySign
437
438template <typename T>
439HWY_API Vec512<T> CopySign(const Vec512<T> magn, const Vec512<T> sign) {
440 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
441
442 const Full512<T> d;
443 const auto msb = SignBit(d);
444
445 const Rebind<MakeUnsigned<T>, decltype(d)> du;
446 // Truth table for msb, magn, sign | bitwise msb ? sign : mag
447 // 0 0 0 | 0
448 // 0 0 1 | 0
449 // 0 1 0 | 1
450 // 0 1 1 | 1
451 // 1 0 0 | 0
452 // 1 0 1 | 1
453 // 1 1 0 | 0
454 // 1 1 1 | 1
455 // The lane size does not matter because we are not using predication.
456 const __m512i out = _mm512_ternarylogic_epi32(
457 BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
458 return BitCast(d, decltype(Zero(du)){out});
459}
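// (Illustrative check: reading the truth table above with bit index
// i = msb*4 + magn*2 + sign gives output bits at i = 2, 3, 5, 7, i.e. 0xAC.)
static_assert(0xAC == ((1 << 2) | (1 << 3) | (1 << 5) | (1 << 7)), "CopySign");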
460
461template <typename T>
462HWY_API Vec512<T> CopySignToAbs(const Vec512<T> abs, const Vec512<T> sign) {
463 // AVX3 can also handle abs < 0, so no extra action needed.
464 return CopySign(abs, sign);
465}
466
467// ================================================== MASK
468
469// ------------------------------ FirstN
470
471// Possibilities for constructing a bitmask of N ones:
472// - kshift* only consider the lowest byte of the shift count, so they would
473// not correctly handle large n.
474// - Scalar shifts >= 64 are UB.
475// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
476// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
477
478#if HWY_ARCH_X86_32
479namespace detail {
480
481// 32 bit mask is sufficient for lane size >= 2.
482template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
483HWY_INLINE Mask512<T> FirstN(size_t n) {
484 Mask512<T> m;
485 const uint32_t all = ~uint32_t{0};
486 // BZHI only looks at the lower 8 bits of n!
487 m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));
488 return m;
489}
490
491template <typename T, HWY_IF_LANE_SIZE(T, 1)>
492HWY_INLINE Mask512<T> FirstN(size_t n) {
493 const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0};
494 return Mask512<T>{static_cast<__mmask64>(bits)};
495}
496
497} // namespace detail
498#endif // HWY_ARCH_X86_32
499
500template <typename T>
501HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
502#if HWY_ARCH_X86_64
503 Mask512<T> m;
504 const uint64_t all = ~uint64_t{0};
505 // BZHI only looks at the lower 8 bits of n!
506 m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
507 return m;
508#else
509 return detail::FirstN<T>(n);
510#endif // HWY_ARCH_X86_64
511}
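// Typical use (sketch, not from the original source): masking the tail of a
// loop over `count` floats; MaskedLoad and BlendedStore are defined below.
//   const Full512<float> d;
//   for (size_t i = 0; i < count; i += Lanes(d)) {
//     const Mask512<float> m = FirstN(d, count - i);
//     const Vec512<float> v = MaskedLoad(m, d, in + i);
//     BlendedStore(v * v, m, d, out + i);
//   }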
512
513// ------------------------------ IfThenElse
514
515// Returns mask ? yes : no.
516
517namespace detail {
518
519// Templates for signed/unsigned integer of a particular size.
520template <typename T>
522 const Mask512<T> mask, const Vec512<T> yes,
523 const Vec512<T> no) {
524 return Vec512<T>{_mm512_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
525}
526template <typename T>
528 const Mask512<T> mask, const Vec512<T> yes,
529 const Vec512<T> no) {
530 return Vec512<T>{_mm512_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
531}
532template <typename T>
534 const Mask512<T> mask, const Vec512<T> yes,
535 const Vec512<T> no) {
536 return Vec512<T>{_mm512_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
537}
538template <typename T>
540 const Mask512<T> mask, const Vec512<T> yes,
541 const Vec512<T> no) {
542 return Vec512<T>{_mm512_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
543}
544
545} // namespace detail
546
547template <typename T>
548HWY_API Vec512<T> IfThenElse(const Mask512<T> mask, const Vec512<T> yes,
549 const Vec512<T> no) {
550 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
551}
552HWY_API Vec512<float> IfThenElse(const Mask512<float> mask,
553 const Vec512<float> yes,
554 const Vec512<float> no) {
555 return Vec512<float>{_mm512_mask_mov_ps(no.raw, mask.raw, yes.raw)};
556}
557HWY_API Vec512<double> IfThenElse(const Mask512<double> mask,
558 const Vec512<double> yes,
559 const Vec512<double> no) {
560 return Vec512<double>{_mm512_mask_mov_pd(no.raw, mask.raw, yes.raw)};
561}
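// Usage sketch (illustrative): select between two vectors with a comparison
// mask, e.g. clamping negative lanes to zero (operator< is defined in the
// COMPARE section further below):
//   const Full512<float> d;
//   const Vec512<float> clamped = IfThenElse(v < Zero(d), Zero(d), v);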
562
563namespace detail {
564
565template <typename T>
567 const Mask512<T> mask,
568 const Vec512<T> yes) {
569 return Vec512<T>{_mm512_maskz_mov_epi8(mask.raw, yes.raw)};
570}
571template <typename T>
573 const Mask512<T> mask,
574 const Vec512<T> yes) {
575 return Vec512<T>{_mm512_maskz_mov_epi16(mask.raw, yes.raw)};
576}
577template <typename T>
579 const Mask512<T> mask,
580 const Vec512<T> yes) {
581 return Vec512<T>{_mm512_maskz_mov_epi32(mask.raw, yes.raw)};
582}
583template <typename T>
585 const Mask512<T> mask,
586 const Vec512<T> yes) {
587 return Vec512<T>{_mm512_maskz_mov_epi64(mask.raw, yes.raw)};
588}
589
590} // namespace detail
591
592template <typename T>
593HWY_API Vec512<T> IfThenElseZero(const Mask512<T> mask, const Vec512<T> yes) {
594 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
595}
597 const Vec512<float> yes) {
598 return Vec512<float>{_mm512_maskz_mov_ps(mask.raw, yes.raw)};
599}
601 const Vec512<double> yes) {
602 return Vec512<double>{_mm512_maskz_mov_pd(mask.raw, yes.raw)};
603}
604
605namespace detail {
606
607template <typename T>
609 const Mask512<T> mask, const Vec512<T> no) {
610 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
611 return Vec512<T>{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
612}
613template <typename T>
615 const Mask512<T> mask, const Vec512<T> no) {
616 return Vec512<T>{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
617}
618template <typename T>
620 const Mask512<T> mask, const Vec512<T> no) {
621 return Vec512<T>{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
622}
623template <typename T>
625 const Mask512<T> mask, const Vec512<T> no) {
626 return Vec512<T>{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
627}
628
629} // namespace detail
630
631template <typename T>
632HWY_API Vec512<T> IfThenZeroElse(const Mask512<T> mask, const Vec512<T> no) {
633 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
634}
636 const Vec512<float> no) {
637 return Vec512<float>{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
638}
640 const Vec512<double> no) {
641 return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
642}
643
644template <typename T>
645HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
646 static_assert(IsSigned<T>(), "Only works for signed/float");
647 // AVX3 MaskFromVec only looks at the MSB
648 return IfThenElse(MaskFromVec(v), yes, no);
649}
650
651template <typename T, HWY_IF_FLOAT(T)>
652HWY_API Vec512<T> ZeroIfNegative(Vec512<T> v) {
653 // AVX3 MaskFromVec only looks at the MSB
654 return IfThenZeroElse(MaskFromVec(v), v);
655}
656
657// ================================================== ARITHMETIC
658
659// ------------------------------ Addition
660
661// Unsigned
663 const Vec512<uint8_t> b) {
664 return Vec512<uint8_t>{_mm512_add_epi8(a.raw, b.raw)};
665}
667 const Vec512<uint16_t> b) {
668 return Vec512<uint16_t>{_mm512_add_epi16(a.raw, b.raw)};
669}
671 const Vec512<uint32_t> b) {
672 return Vec512<uint32_t>{_mm512_add_epi32(a.raw, b.raw)};
673}
675 const Vec512<uint64_t> b) {
676 return Vec512<uint64_t>{_mm512_add_epi64(a.raw, b.raw)};
677}
678
679// Signed
681 const Vec512<int8_t> b) {
682 return Vec512<int8_t>{_mm512_add_epi8(a.raw, b.raw)};
683}
685 const Vec512<int16_t> b) {
686 return Vec512<int16_t>{_mm512_add_epi16(a.raw, b.raw)};
687}
689 const Vec512<int32_t> b) {
690 return Vec512<int32_t>{_mm512_add_epi32(a.raw, b.raw)};
691}
693 const Vec512<int64_t> b) {
694 return Vec512<int64_t>{_mm512_add_epi64(a.raw, b.raw)};
695}
696
697// Float
699 return Vec512<float>{_mm512_add_ps(a.raw, b.raw)};
700}
702 const Vec512<double> b) {
703 return Vec512<double>{_mm512_add_pd(a.raw, b.raw)};
704}
705
706// ------------------------------ Subtraction
707
708// Unsigned
710 const Vec512<uint8_t> b) {
711 return Vec512<uint8_t>{_mm512_sub_epi8(a.raw, b.raw)};
712}
714 const Vec512<uint16_t> b) {
715 return Vec512<uint16_t>{_mm512_sub_epi16(a.raw, b.raw)};
716}
718 const Vec512<uint32_t> b) {
719 return Vec512<uint32_t>{_mm512_sub_epi32(a.raw, b.raw)};
720}
722 const Vec512<uint64_t> b) {
723 return Vec512<uint64_t>{_mm512_sub_epi64(a.raw, b.raw)};
724}
725
726// Signed
728 const Vec512<int8_t> b) {
729 return Vec512<int8_t>{_mm512_sub_epi8(a.raw, b.raw)};
730}
732 const Vec512<int16_t> b) {
733 return Vec512<int16_t>{_mm512_sub_epi16(a.raw, b.raw)};
734}
736 const Vec512<int32_t> b) {
737 return Vec512<int32_t>{_mm512_sub_epi32(a.raw, b.raw)};
738}
740 const Vec512<int64_t> b) {
741 return Vec512<int64_t>{_mm512_sub_epi64(a.raw, b.raw)};
742}
743
744// Float
746 return Vec512<float>{_mm512_sub_ps(a.raw, b.raw)};
747}
749 const Vec512<double> b) {
750 return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
751}
752
753// ------------------------------ SumsOf8
754HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) {
755 return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())};
756}
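// (Each u64 output lane is the sum of the 8 corresponding u8 lanes; SAD
// against zero cannot overflow because 8 * 255 fits easily in 64 bits.)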
757
758// ------------------------------ SaturatedAdd
759
760// Returns a + b clamped to the destination range.
761
762// Unsigned
764 const Vec512<uint8_t> b) {
765 return Vec512<uint8_t>{_mm512_adds_epu8(a.raw, b.raw)};
766}
768 const Vec512<uint16_t> b) {
769 return Vec512<uint16_t>{_mm512_adds_epu16(a.raw, b.raw)};
770}
771
772// Signed
774 const Vec512<int8_t> b) {
775 return Vec512<int8_t>{_mm512_adds_epi8(a.raw, b.raw)};
776}
778 const Vec512<int16_t> b) {
779 return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
780}
781
782// ------------------------------ SaturatedSub
783
784// Returns a - b clamped to the destination range.
785
786// Unsigned
788 const Vec512<uint8_t> b) {
789 return Vec512<uint8_t>{_mm512_subs_epu8(a.raw, b.raw)};
790}
792 const Vec512<uint16_t> b) {
793 return Vec512<uint16_t>{_mm512_subs_epu16(a.raw, b.raw)};
794}
795
796// Signed
798 const Vec512<int8_t> b) {
799 return Vec512<int8_t>{_mm512_subs_epi8(a.raw, b.raw)};
800}
802 const Vec512<int16_t> b) {
803 return Vec512<int16_t>{_mm512_subs_epi16(a.raw, b.raw)};
804}
805
806// ------------------------------ Average
807
808// Returns (a + b + 1) / 2
809
810// Unsigned
812 const Vec512<uint8_t> b) {
813 return Vec512<uint8_t>{_mm512_avg_epu8(a.raw, b.raw)};
814}
816 const Vec512<uint16_t> b) {
817 return Vec512<uint16_t>{_mm512_avg_epu16(a.raw, b.raw)};
818}
819
820// ------------------------------ Abs (Sub)
821
822// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
823HWY_API Vec512<int8_t> Abs(const Vec512<int8_t> v) {
824#if HWY_COMPILER_MSVC
825 // Workaround for incorrect codegen? (untested due to internal compiler error)
826 const auto zero = Zero(Full512<int8_t>());
827 return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
828#else
829 return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
830#endif
831}
833 return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
834}
836 return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
837}
839 return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
840}
841
842// These aren't native instructions; they also involve an AND with a constant.
844 return Vec512<float>{_mm512_abs_ps(v.raw)};
845}
847 return Vec512<double>{_mm512_abs_pd(v.raw)};
848}
849// ------------------------------ ShiftLeft
850
851template <int kBits>
853 return Vec512<uint16_t>{_mm512_slli_epi16(v.raw, kBits)};
854}
855
856template <int kBits>
858 return Vec512<uint32_t>{_mm512_slli_epi32(v.raw, kBits)};
859}
860
861template <int kBits>
863 return Vec512<uint64_t>{_mm512_slli_epi64(v.raw, kBits)};
864}
865
866template <int kBits>
868 return Vec512<int16_t>{_mm512_slli_epi16(v.raw, kBits)};
869}
870
871template <int kBits>
873 return Vec512<int32_t>{_mm512_slli_epi32(v.raw, kBits)};
874}
875
876template <int kBits>
878 return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
879}
880
881template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
883 const Full512<T> d8;
884 const RepartitionToWide<decltype(d8)> d16;
885 const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
886 return kBits == 1
887 ? (v + v)
888 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
889}
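// (x86 has no 8-bit shift instruction, so the overload above shifts 16-bit
// lanes and masks off the bits shifted in from the neighboring byte; kBits ==
// 1 instead uses the cheaper v + v.)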
890
891// ------------------------------ ShiftRight
892
893template <int kBits>
895 return Vec512<uint16_t>{_mm512_srli_epi16(v.raw, kBits)};
896}
897
898template <int kBits>
900 return Vec512<uint32_t>{_mm512_srli_epi32(v.raw, kBits)};
901}
902
903template <int kBits>
905 return Vec512<uint64_t>{_mm512_srli_epi64(v.raw, kBits)};
906}
907
908template <int kBits>
910 const Full512<uint8_t> d8;
911 // Use raw instead of BitCast to support N=1.
912 const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
913 return shifted & Set(d8, 0xFF >> kBits);
914}
915
916template <int kBits>
918 return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
919}
920
921template <int kBits>
923 return Vec512<int32_t>{_mm512_srai_epi32(v.raw, kBits)};
924}
925
926template <int kBits>
928 return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
929}
930
931template <int kBits>
933 const Full512<int8_t> di;
934 const Full512<uint8_t> du;
935 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
936 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
937 return (shifted ^ shifted_sign) - shifted_sign;
938}
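// (The signed 8-bit shift above sign-extends manually: after the unsigned
// shift, XOR with the shifted-down sign bit and subtract it, which replicates
// the sign into the vacated upper bits.)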
939
940// ------------------------------ RotateRight
941
942template <int kBits>
944 static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
945 return Vec512<uint32_t>{_mm512_ror_epi32(v.raw, kBits)};
946}
947
948template <int kBits>
950 static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
951 return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
952}
953
954// ------------------------------ ShiftLeftSame
955
957 const int bits) {
958 return Vec512<uint16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
959}
961 const int bits) {
962 return Vec512<uint32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
963}
965 const int bits) {
966 return Vec512<uint64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
967}
968
970 return Vec512<int16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
971}
972
974 return Vec512<int32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
975}
976
978 return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
979}
980
981template <typename T, HWY_IF_LANE_SIZE(T, 1)>
983 const Full512<T> d8;
984 const RepartitionToWide<decltype(d8)> d16;
985 const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
986 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
987}
988
989// ------------------------------ ShiftRightSame
990
992 const int bits) {
993 return Vec512<uint16_t>{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
994}
996 const int bits) {
997 return Vec512<uint32_t>{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
998}
1000 const int bits) {
1001 return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1002}
1003
1005 const Full512<uint8_t> d8;
1006 const RepartitionToWide<decltype(d8)> d16;
1007 const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
1008 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
1009}
1010
1012 const int bits) {
1013 return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1014}
1015
1017 const int bits) {
1018 return Vec512<int32_t>{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1019}
1021 const int bits) {
1022 return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1023}
1024
1026 const Full512<int8_t> di;
1027 const Full512<uint8_t> du;
1028 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1029 const auto shifted_sign =
1030 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1031 return (shifted ^ shifted_sign) - shifted_sign;
1032}
1033
1034// ------------------------------ Shl
1035
1037 const Vec512<uint16_t> bits) {
1038 return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
1039}
1040
1042 const Vec512<uint32_t> bits) {
1043 return Vec512<uint32_t>{_mm512_sllv_epi32(v.raw, bits.raw)};
1044}
1045
1047 const Vec512<uint64_t> bits) {
1048 return Vec512<uint64_t>{_mm512_sllv_epi64(v.raw, bits.raw)};
1049}
1050
1051// Signed left shift is the same as unsigned.
1052template <typename T, HWY_IF_SIGNED(T)>
1054 const Full512<T> di;
1055 const Full512<MakeUnsigned<T>> du;
1056 return BitCast(di, BitCast(du, v) << BitCast(du, bits));
1057}
1058
1059// ------------------------------ Shr
1060
1062 const Vec512<uint16_t> bits) {
1063 return Vec512<uint16_t>{_mm512_srlv_epi16(v.raw, bits.raw)};
1064}
1065
1067 const Vec512<uint32_t> bits) {
1068 return Vec512<uint32_t>{_mm512_srlv_epi32(v.raw, bits.raw)};
1069}
1070
1072 const Vec512<uint64_t> bits) {
1073 return Vec512<uint64_t>{_mm512_srlv_epi64(v.raw, bits.raw)};
1074}
1075
1077 const Vec512<int16_t> bits) {
1078 return Vec512<int16_t>{_mm512_srav_epi16(v.raw, bits.raw)};
1079}
1080
1082 const Vec512<int32_t> bits) {
1083 return Vec512<int32_t>{_mm512_srav_epi32(v.raw, bits.raw)};
1084}
1085
1087 const Vec512<int64_t> bits) {
1088 return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
1089}
1090
1091// ------------------------------ Minimum
1092
1093// Unsigned
1095 return Vec512<uint8_t>{_mm512_min_epu8(a.raw, b.raw)};
1096}
1098 const Vec512<uint16_t> b) {
1099 return Vec512<uint16_t>{_mm512_min_epu16(a.raw, b.raw)};
1100}
1102 const Vec512<uint32_t> b) {
1103 return Vec512<uint32_t>{_mm512_min_epu32(a.raw, b.raw)};
1104}
1106 const Vec512<uint64_t> b) {
1107 return Vec512<uint64_t>{_mm512_min_epu64(a.raw, b.raw)};
1108}
1109
1110// Signed
1112 return Vec512<int8_t>{_mm512_min_epi8(a.raw, b.raw)};
1113}
1115 return Vec512<int16_t>{_mm512_min_epi16(a.raw, b.raw)};
1116}
1118 return Vec512<int32_t>{_mm512_min_epi32(a.raw, b.raw)};
1119}
1121 return Vec512<int64_t>{_mm512_min_epi64(a.raw, b.raw)};
1122}
1123
1124// Float
1126 return Vec512<float>{_mm512_min_ps(a.raw, b.raw)};
1127}
1129 return Vec512<double>{_mm512_min_pd(a.raw, b.raw)};
1130}
1131
1132// ------------------------------ Maximum
1133
1134// Unsigned
1136 return Vec512<uint8_t>{_mm512_max_epu8(a.raw, b.raw)};
1137}
1139 const Vec512<uint16_t> b) {
1140 return Vec512<uint16_t>{_mm512_max_epu16(a.raw, b.raw)};
1141}
1143 const Vec512<uint32_t> b) {
1144 return Vec512<uint32_t>{_mm512_max_epu32(a.raw, b.raw)};
1145}
1147 const Vec512<uint64_t> b) {
1148 return Vec512<uint64_t>{_mm512_max_epu64(a.raw, b.raw)};
1149}
1150
1151// Signed
1153 return Vec512<int8_t>{_mm512_max_epi8(a.raw, b.raw)};
1154}
1156 return Vec512<int16_t>{_mm512_max_epi16(a.raw, b.raw)};
1157}
1159 return Vec512<int32_t>{_mm512_max_epi32(a.raw, b.raw)};
1160}
1162 return Vec512<int64_t>{_mm512_max_epi64(a.raw, b.raw)};
1163}
1164
1165// Float
1167 return Vec512<float>{_mm512_max_ps(a.raw, b.raw)};
1168}
1170 return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
1171}
1172
1173// ------------------------------ Integer multiplication
1174
1175// Unsigned
1177 return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1178}
1180 return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1181}
1183 return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
1184}
1186 return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
1187}
1191
1192// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
1193#ifdef HWY_NATIVE_I64MULLO
1194#undef HWY_NATIVE_I64MULLO
1195#else
1196#define HWY_NATIVE_I64MULLO
1197#endif
1198
1199// Signed
1201 return Vec512<int16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1202}
1204 return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1205}
1207 return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
1208}
1210 return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
1211}
1213 return Vec128<int64_t>{_mm_mullo_epi64(a.raw, b.raw)};
1214}
1215// Returns the upper 16 bits of a * b in each lane.
1217 return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
1218}
1220 return Vec512<int16_t>{_mm512_mulhi_epi16(a.raw, b.raw)};
1221}
1222
1224 return Vec512<int16_t>{_mm512_mulhrs_epi16(a.raw, b.raw)};
1225}
1226
1227// Multiplies even lanes (0, 2 ..) and places the double-wide result into
1228// even and the upper half into its odd neighbor lane.
1230 return Vec512<int64_t>{_mm512_mul_epi32(a.raw, b.raw)};
1231}
1233 return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)};
1234}
1235
1236// ------------------------------ Neg (Sub)
1237
1238template <typename T, HWY_IF_FLOAT(T)>
1240 return Xor(v, SignBit(Full512<T>()));
1241}
1242
1243template <typename T, HWY_IF_NOT_FLOAT(T)>
1244HWY_API Vec512<T> Neg(const Vec512<T> v) {
1245 return Zero(Full512<T>()) - v;
1246}
1247
1248// ------------------------------ Floating-point mul / div
1249
1251 return Vec512<float>{_mm512_mul_ps(a.raw, b.raw)};
1252}
1254 const Vec512<double> b) {
1255 return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
1256}
1257
1259 return Vec512<float>{_mm512_div_ps(a.raw, b.raw)};
1260}
1262 const Vec512<double> b) {
1263 return Vec512<double>{_mm512_div_pd(a.raw, b.raw)};
1264}
1265
1266// Approximate reciprocal
1268 return Vec512<float>{_mm512_rcp14_ps(v.raw)};
1269}
1270
1271// Absolute value of difference.
1273 return Abs(a - b);
1274}
1275
1276// ------------------------------ Floating-point multiply-add variants
1277
1278// Returns mul * x + add
1280 const Vec512<float> add) {
1281 return Vec512<float>{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)};
1282}
1284 const Vec512<double> add) {
1285 return Vec512<double>{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)};
1286}
1287
1288// Returns add - mul * x
1290 const Vec512<float> add) {
1291 return Vec512<float>{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)};
1292}
1294 const Vec512<double> x,
1295 const Vec512<double> add) {
1296 return Vec512<double>{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)};
1297}
1298
1299// Returns mul * x - sub
1301 const Vec512<float> sub) {
1302 return Vec512<float>{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)};
1303}
1305 const Vec512<double> sub) {
1306 return Vec512<double>{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)};
1307}
1308
1309// Returns -mul * x - sub
1311 const Vec512<float> sub) {
1312 return Vec512<float>{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1313}
1315 const Vec512<double> x,
1316 const Vec512<double> sub) {
1317 return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1318}
1319
1320// ------------------------------ Floating-point square root
1321
1322// Full precision square root
1324 return Vec512<float>{_mm512_sqrt_ps(v.raw)};
1325}
1327 return Vec512<double>{_mm512_sqrt_pd(v.raw)};
1328}
1329
1330// Approximate reciprocal square root
1332 return Vec512<float>{_mm512_rsqrt14_ps(v.raw)};
1333}
1334
1335// ------------------------------ Floating-point rounding
1336
1337// Work around warnings in the intrinsic definitions (passing -1 as a mask).
1338HWY_DIAGNOSTICS(push)
1339HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1340
1341// Toward nearest integer, tie to even
1342HWY_API Vec512<float> Round(const Vec512<float> v) {
1343 return Vec512<float>{_mm512_roundscale_ps(
1344 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1345}
1347 return Vec512<double>{_mm512_roundscale_pd(
1348 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1349}
1350
1351// Toward zero, aka truncate
1353 return Vec512<float>{
1354 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1355}
1357 return Vec512<double>{
1358 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1359}
1360
1361// Toward +infinity, aka ceiling
1363 return Vec512<float>{
1364 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1365}
1367 return Vec512<double>{
1368 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1369}
1370
1371// Toward -infinity, aka floor
1373 return Vec512<float>{
1374 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1375}
1377 return Vec512<double>{
1378 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1379}
1380
1381HWY_DIAGNOSTICS(pop)
1382
1383// ================================================== COMPARE
1384
1385// Comparisons set a mask bit to 1 if the condition is true, else 0.
1386
1387template <typename TFrom, typename TTo>
1388HWY_API Mask512<TTo> RebindMask(Full512<TTo> /*tag*/, Mask512<TFrom> m) {
1389 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1390 return Mask512<TTo>{m.raw};
1391}
1392
1393namespace detail {
1394
1395template <typename T>
1397 const Vec512<T> bit) {
1398 return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)};
1399}
1400template <typename T>
1402 const Vec512<T> bit) {
1403 return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)};
1404}
1405template <typename T>
1407 const Vec512<T> bit) {
1408 return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)};
1409}
1410template <typename T>
1412 const Vec512<T> bit) {
1413 return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)};
1414}
1415
1416} // namespace detail
1417
1418template <typename T>
1420 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1421 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1422}
1423
1424// ------------------------------ Equality
1425
1426template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1428 return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)};
1429}
1430template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1431HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1432 return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
1433}
1434template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1435HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1436 return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
1437}
1438template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1439HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1440 return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
1441}
1442
1444 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1445}
1446
1448 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1449}
1450
1451// ------------------------------ Inequality
1452
1453template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1455 return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)};
1456}
1457template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1458HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1459 return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
1460}
1461template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1462HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1463 return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
1464}
1465template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1466HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1467 return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
1468}
1469
1471 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1472}
1473
1475 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1476}
1477
1478// ------------------------------ Strict inequality
1479
1481 return Mask512<uint8_t>{_mm512_cmpgt_epu8_mask(a.raw, b.raw)};
1482}
1484 return Mask512<uint16_t>{_mm512_cmpgt_epu16_mask(a.raw, b.raw)};
1485}
1487 return Mask512<uint32_t>{_mm512_cmpgt_epu32_mask(a.raw, b.raw)};
1488}
1490 return Mask512<uint64_t>{_mm512_cmpgt_epu64_mask(a.raw, b.raw)};
1491}
1492
1494 return Mask512<int8_t>{_mm512_cmpgt_epi8_mask(a.raw, b.raw)};
1495}
1497 return Mask512<int16_t>{_mm512_cmpgt_epi16_mask(a.raw, b.raw)};
1498}
1500 return Mask512<int32_t>{_mm512_cmpgt_epi32_mask(a.raw, b.raw)};
1501}
1503 return Mask512<int64_t>{_mm512_cmpgt_epi64_mask(a.raw, b.raw)};
1504}
1505
1507 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1508}
1510 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1511}
1512
1513// ------------------------------ Weak inequality
1514
1516 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1517}
1519 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1520}
1521
1522// ------------------------------ Reversed comparisons
1523
1524template <typename T>
1526 return b > a;
1527}
1528
1529template <typename T>
1531 return b >= a;
1532}
1533
1534// ------------------------------ Mask
1535
1536namespace detail {
1537
1538template <typename T>
1540 return Mask512<T>{_mm512_movepi8_mask(v.raw)};
1541}
1542template <typename T>
1544 return Mask512<T>{_mm512_movepi16_mask(v.raw)};
1545}
1546template <typename T>
1548 return Mask512<T>{_mm512_movepi32_mask(v.raw)};
1549}
1550template <typename T>
1552 return Mask512<T>{_mm512_movepi64_mask(v.raw)};
1553}
1554
1555} // namespace detail
1556
1557template <typename T>
1558HWY_API Mask512<T> MaskFromVec(const Vec512<T> v) {
1559 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1560}
1561// There do not seem to be native floating-point versions of these instructions.
1562HWY_API Mask512<float> MaskFromVec(const Vec512<float> v) {
1563 return Mask512<float>{MaskFromVec(BitCast(Full512<int32_t>(), v)).raw};
1564}
1565HWY_API Mask512<double> MaskFromVec(const Vec512<double> v) {
1566 return Mask512<double>{MaskFromVec(BitCast(Full512<int64_t>(), v)).raw};
1567}
1568
1570 return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
1571}
1573 return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
1574}
1575
1577 return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
1578}
1580 return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
1581}
1582
1584 return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
1585}
1587 return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
1588}
1590 return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
1591}
1592
1594 return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
1595}
1597 return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
1598}
1600 return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
1601}
1602
1603template <typename T>
1605 return VecFromMask(v);
1606}
1607
1608// ------------------------------ Mask logical
1609
1610namespace detail {
1611
1612template <typename T>
1614#if HWY_COMPILER_HAS_MASK_INTRINSICS
1615 return Mask512<T>{_knot_mask64(m.raw)};
1616#else
1617 return Mask512<T>{~m.raw};
1618#endif
1619}
1620template <typename T>
1622#if HWY_COMPILER_HAS_MASK_INTRINSICS
1623 return Mask512<T>{_knot_mask32(m.raw)};
1624#else
1625 return Mask512<T>{~m.raw};
1626#endif
1627}
1628template <typename T>
1630#if HWY_COMPILER_HAS_MASK_INTRINSICS
1631 return Mask512<T>{_knot_mask16(m.raw)};
1632#else
1633 return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
1634#endif
1635}
1636template <typename T>
1638#if HWY_COMPILER_HAS_MASK_INTRINSICS
1639 return Mask512<T>{_knot_mask8(m.raw)};
1640#else
1641 return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};
1642#endif
1643}
1644
1645template <typename T>
1647 const Mask512<T> b) {
1648#if HWY_COMPILER_HAS_MASK_INTRINSICS
1649 return Mask512<T>{_kand_mask64(a.raw, b.raw)};
1650#else
1651 return Mask512<T>{a.raw & b.raw};
1652#endif
1653}
1654template <typename T>
1656 const Mask512<T> b) {
1657#if HWY_COMPILER_HAS_MASK_INTRINSICS
1658 return Mask512<T>{_kand_mask32(a.raw, b.raw)};
1659#else
1660 return Mask512<T>{a.raw & b.raw};
1661#endif
1662}
1663template <typename T>
1665 const Mask512<T> b) {
1666#if HWY_COMPILER_HAS_MASK_INTRINSICS
1667 return Mask512<T>{_kand_mask16(a.raw, b.raw)};
1668#else
1669 return Mask512<T>{static_cast<uint16_t>(a.raw & b.raw)};
1670#endif
1671}
1672template <typename T>
1674 const Mask512<T> b) {
1675#if HWY_COMPILER_HAS_MASK_INTRINSICS
1676 return Mask512<T>{_kand_mask8(a.raw, b.raw)};
1677#else
1678 return Mask512<T>{static_cast<uint8_t>(a.raw & b.raw)};
1679#endif
1680}
1681
1682template <typename T>
1684 const Mask512<T> b) {
1685#if HWY_COMPILER_HAS_MASK_INTRINSICS
1686 return Mask512<T>{_kandn_mask64(a.raw, b.raw)};
1687#else
1688 return Mask512<T>{~a.raw & b.raw};
1689#endif
1690}
1691template <typename T>
1693 const Mask512<T> b) {
1694#if HWY_COMPILER_HAS_MASK_INTRINSICS
1695 return Mask512<T>{_kandn_mask32(a.raw, b.raw)};
1696#else
1697 return Mask512<T>{~a.raw & b.raw};
1698#endif
1699}
1700template <typename T>
1702 const Mask512<T> b) {
1703#if HWY_COMPILER_HAS_MASK_INTRINSICS
1704 return Mask512<T>{_kandn_mask16(a.raw, b.raw)};
1705#else
1706 return Mask512<T>{static_cast<uint16_t>(~a.raw & b.raw)};
1707#endif
1708}
1709template <typename T>
1711 const Mask512<T> b) {
1712#if HWY_COMPILER_HAS_MASK_INTRINSICS
1713 return Mask512<T>{_kandn_mask8(a.raw, b.raw)};
1714#else
1715 return Mask512<T>{static_cast<uint8_t>(~a.raw & b.raw)};
1716#endif
1717}
1718
1719template <typename T>
1721 const Mask512<T> b) {
1722#if HWY_COMPILER_HAS_MASK_INTRINSICS
1723 return Mask512<T>{_kor_mask64(a.raw, b.raw)};
1724#else
1725 return Mask512<T>{a.raw | b.raw};
1726#endif
1727}
1728template <typename T>
1730 const Mask512<T> b) {
1731#if HWY_COMPILER_HAS_MASK_INTRINSICS
1732 return Mask512<T>{_kor_mask32(a.raw, b.raw)};
1733#else
1734 return Mask512<T>{a.raw | b.raw};
1735#endif
1736}
1737template <typename T>
1739 const Mask512<T> b) {
1740#if HWY_COMPILER_HAS_MASK_INTRINSICS
1741 return Mask512<T>{_kor_mask16(a.raw, b.raw)};
1742#else
1743 return Mask512<T>{static_cast<uint16_t>(a.raw | b.raw)};
1744#endif
1745}
1746template <typename T>
1748 const Mask512<T> b) {
1749#if HWY_COMPILER_HAS_MASK_INTRINSICS
1750 return Mask512<T>{_kor_mask8(a.raw, b.raw)};
1751#else
1752 return Mask512<T>{static_cast<uint8_t>(a.raw | b.raw)};
1753#endif
1754}
1755
1756template <typename T>
1758 const Mask512<T> b) {
1759#if HWY_COMPILER_HAS_MASK_INTRINSICS
1760 return Mask512<T>{_kxor_mask64(a.raw, b.raw)};
1761#else
1762 return Mask512<T>{a.raw ^ b.raw};
1763#endif
1764}
1765template <typename T>
1767 const Mask512<T> b) {
1768#if HWY_COMPILER_HAS_MASK_INTRINSICS
1769 return Mask512<T>{_kxor_mask32(a.raw, b.raw)};
1770#else
1771 return Mask512<T>{a.raw ^ b.raw};
1772#endif
1773}
1774template <typename T>
1776 const Mask512<T> b) {
1777#if HWY_COMPILER_HAS_MASK_INTRINSICS
1778 return Mask512<T>{_kxor_mask16(a.raw, b.raw)};
1779#else
1780 return Mask512<T>{static_cast<uint16_t>(a.raw ^ b.raw)};
1781#endif
1782}
1783template <typename T>
1785 const Mask512<T> b) {
1786#if HWY_COMPILER_HAS_MASK_INTRINSICS
1787 return Mask512<T>{_kxor_mask8(a.raw, b.raw)};
1788#else
1789 return Mask512<T>{static_cast<uint8_t>(a.raw ^ b.raw)};
1790#endif
1791}
1792
1793template <typename T>
1795 const Mask512<T> a, const Mask512<T> b) {
1796#if HWY_COMPILER_HAS_MASK_INTRINSICS
1797 return Mask512<T>{_kxnor_mask64(a.raw, b.raw)};
1798#else
1799 return Mask512<T>{~(a.raw ^ b.raw)};
1800#endif
1801}
1802template <typename T>
1804 const Mask512<T> a, const Mask512<T> b) {
1805#if HWY_COMPILER_HAS_MASK_INTRINSICS
1806 return Mask512<T>{_kxnor_mask32(a.raw, b.raw)};
1807#else
1808 return Mask512<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
1809#endif
1810}
1811template <typename T>
1813 const Mask512<T> a, const Mask512<T> b) {
1814#if HWY_COMPILER_HAS_MASK_INTRINSICS
1815 return Mask512<T>{_kxnor_mask16(a.raw, b.raw)};
1816#else
1817 return Mask512<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
1818#endif
1819}
1820template <typename T>
1822 const Mask512<T> a, const Mask512<T> b) {
1823#if HWY_COMPILER_HAS_MASK_INTRINSICS
1824 return Mask512<T>{_kxnor_mask8(a.raw, b.raw)};
1825#else
1826 return Mask512<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
1827#endif
1828}
1829
1830} // namespace detail
1831
1832template <typename T>
1834 return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
1835}
1836
1837template <typename T>
1839 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
1840}
1841
1842template <typename T>
1844 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
1845}
1846
1847template <typename T>
1849 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
1850}
1851
1852template <typename T>
1854 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
1855}
1856
1857template <typename T>
1858HWY_API Mask512<T> ExclusiveNeither(const Mask512<T> a, Mask512<T> b) {
1859 return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
1860}
1861
1862// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1863
1864HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) {
1865 return VecFromMask(v < Zero(Full512<int8_t>()));
1866}
1867
1868HWY_API Vec512<int16_t> BroadcastSignBit(const Vec512<int16_t> v) {
1869 return ShiftRight<15>(v);
1870}
1871
1872HWY_API Vec512<int32_t> BroadcastSignBit(const Vec512<int32_t> v) {
1873 return ShiftRight<31>(v);
1874}
1875
1876HWY_API Vec512<int64_t> BroadcastSignBit(const Vec512<int64_t> v) {
1877 return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)};
1878}
1879
1880// ------------------------------ Floating-point classification (Not)
1881
1883 return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x81)};
1884}
1886 return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x81)};
1887}
1888
1890 return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x18)};
1891}
1893 return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x18)};
1894}
1895
1896// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for
1897// positive, so we have to check for inf/NaN and negate.
1899 return Not(Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x99)});
1900}
1902 return Not(Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x99)});
1903}
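// (fpclass category bits used above: 0x01 QNaN, 0x08 +inf, 0x10 -inf,
// 0x80 SNaN; so 0x81 detects NaN, 0x18 detects +/-inf, and 0x99 is NaN or
// inf, which IsFinite negates.)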
1904
1905// ================================================== MEMORY
1906
1907// ------------------------------ Load
1908
1909template <typename T>
1910HWY_API Vec512<T> Load(Full512<T> /* tag */, const T* HWY_RESTRICT aligned) {
1911 return Vec512<T>{_mm512_load_si512(aligned)};
1912}
1914 const float* HWY_RESTRICT aligned) {
1915 return Vec512<float>{_mm512_load_ps(aligned)};
1916}
1918 const double* HWY_RESTRICT aligned) {
1919 return Vec512<double>{_mm512_load_pd(aligned)};
1920}
1921
1922template <typename T>
1924 return Vec512<T>{_mm512_loadu_si512(p)};
1925}
1927 const float* HWY_RESTRICT p) {
1928 return Vec512<float>{_mm512_loadu_ps(p)};
1929}
1931 const double* HWY_RESTRICT p) {
1932 return Vec512<double>{_mm512_loadu_pd(p)};
1933}
1934
1935// ------------------------------ MaskedLoad
1936
1937template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1939 const T* HWY_RESTRICT p) {
1940 return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, p)};
1941}
1942
1943template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1944HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1945 const T* HWY_RESTRICT p) {
1946 return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
1947}
1948
1949template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1950HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1951 const T* HWY_RESTRICT p) {
1952 return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
1953}
1954
1955template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1956HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1957 const T* HWY_RESTRICT p) {
1958 return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
1959}
1960
1962 const float* HWY_RESTRICT p) {
1963 return Vec512<float>{_mm512_maskz_loadu_ps(m.raw, p)};
1964}
1965
1967 const double* HWY_RESTRICT p) {
1968 return Vec512<double>{_mm512_maskz_loadu_pd(m.raw, p)};
1969}
1970
1971// ------------------------------ LoadDup128
1972
1973// Loads 128 bits and duplicates them into all four 128-bit blocks. This
1974// avoids the 3-cycle cost of moving data between 128-bit halves and port 5.
1975template <typename T>
1977 const T* const HWY_RESTRICT p) {
1978 const auto x4 = LoadU(Full128<T>(), p);
1979 return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
1980}
1982 const float* const HWY_RESTRICT p) {
1983 const __m128 x4 = _mm_loadu_ps(p);
1984 return Vec512<float>{_mm512_broadcast_f32x4(x4)};
1985}
1986
1988 const double* const HWY_RESTRICT p) {
1989 const __m128d x2 = _mm_loadu_pd(p);
1990 return Vec512<double>{_mm512_broadcast_f64x2(x2)};
1991}
1992
1993// ------------------------------ Store
1994
1995template <typename T>
1996HWY_API void Store(const Vec512<T> v, Full512<T> /* tag */,
1997 T* HWY_RESTRICT aligned) {
1998 _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1999}
2001 float* HWY_RESTRICT aligned) {
2002 _mm512_store_ps(aligned, v.raw);
2003}
2005 double* HWY_RESTRICT aligned) {
2006 _mm512_store_pd(aligned, v.raw);
2007}
2008
2009template <typename T>
2010HWY_API void StoreU(const Vec512<T> v, Full512<T> /* tag */,
2011 T* HWY_RESTRICT p) {
2012 _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
2013}
2015 float* HWY_RESTRICT p) {
2016 _mm512_storeu_ps(p, v.raw);
2017}
2019 double* HWY_RESTRICT p) {
2020 _mm512_storeu_pd(p, v.raw);
2021}
2022
2023// ------------------------------ BlendedStore
2024
2025template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2027 T* HWY_RESTRICT p) {
2028 _mm512_mask_storeu_epi8(p, m.raw, v.raw);
2029}
2030
2031template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2032HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
2033 T* HWY_RESTRICT p) {
2034 _mm512_mask_storeu_epi16(p, m.raw, v.raw);
2035}
2036
2037template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2038HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
2039 T* HWY_RESTRICT p) {
2040 _mm512_mask_storeu_epi32(p, m.raw, v.raw);
2041}
2042
2043template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2044HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
2045 T* HWY_RESTRICT p) {
2046 _mm512_mask_storeu_epi64(p, m.raw, v.raw);
2047}
2048
2050 Full512<float> /* tag */, float* HWY_RESTRICT p) {
2051 _mm512_mask_storeu_ps(p, m.raw, v.raw);
2052}
2053
2055 Full512<double> /* tag */, double* HWY_RESTRICT p) {
2056 _mm512_mask_storeu_pd(p, m.raw, v.raw);
2057}
2058
2059// ------------------------------ Non-temporal stores
2060
2061template <typename T>
2062HWY_API void Stream(const Vec512<T> v, Full512<T> /* tag */,
2063 T* HWY_RESTRICT aligned) {
2064 _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
2065}
2067 float* HWY_RESTRICT aligned) {
2068 _mm512_stream_ps(aligned, v.raw);
2069}
2071 double* HWY_RESTRICT aligned) {
2072 _mm512_stream_pd(aligned, v.raw);
2073}
2074
2075// ------------------------------ Scatter
2076
2077// Work around warnings in the intrinsic definitions (passing -1 as a mask).
2078HWY_DIAGNOSTICS(push)
2079HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2080
2081namespace detail {
2082
2083template <typename T>
2085 Full512<T> /* tag */, T* HWY_RESTRICT base,
2086 const Vec512<int32_t> offset) {
2087 _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
2088}
2089template <typename T>
2091 Full512<T> /* tag */, T* HWY_RESTRICT base,
2092 const Vec512<int32_t> index) {
2093 _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
2094}
2095
2096template <typename T>
2098 Full512<T> /* tag */, T* HWY_RESTRICT base,
2099 const Vec512<int64_t> offset) {
2100 _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
2101}
2102template <typename T>
2104 Full512<T> /* tag */, T* HWY_RESTRICT base,
2105 const Vec512<int64_t> index) {
2106 _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
2107}
2108
2109} // namespace detail
2110
2111template <typename T, typename Offset>
2113 const Vec512<Offset> offset) {
2114 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2115 return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2116}
2117template <typename T, typename Index>
2119 const Vec512<Index> index) {
2120 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2121 return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2122}
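// (Offsets above are raw byte offsets, hence intrinsic scale 1; indices are
// element indices, hence scale sizeof(T) = 4 or 8.)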
2123
2125 float* HWY_RESTRICT base,
2126 const Vec512<int32_t> offset) {
2127 _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
2128}
2130 float* HWY_RESTRICT base,
2131 const Vec512<int32_t> index) {
2132 _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
2133}
2134
2136 double* HWY_RESTRICT base,
2137 const Vec512<int64_t> offset) {
2138 _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
2139}
2141 double* HWY_RESTRICT base,
2142 const Vec512<int64_t> index) {
2143 _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
2144}
2145
2146// ------------------------------ Gather
2147
2148namespace detail {
2149
2150template <typename T>
2152 Full512<T> /* tag */,
2153 const T* HWY_RESTRICT base,
2154 const Vec512<int32_t> offset) {
2155 return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
2156}
2157template <typename T>
2159 Full512<T> /* tag */,
2160 const T* HWY_RESTRICT base,
2161 const Vec512<int32_t> index) {
2162 return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
2163}
2164
2165template <typename T>
2167 Full512<T> /* tag */,
2168 const T* HWY_RESTRICT base,
2169 const Vec512<int64_t> offset) {
2170 return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
2171}
2172template <typename T>
2174 Full512<T> /* tag */,
2175 const T* HWY_RESTRICT base,
2176 const Vec512<int64_t> index) {
2177 return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
2178}
2179
2180} // namespace detail
2181
2182template <typename T, typename Offset>
2184 const Vec512<Offset> offset) {
2185 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2186 return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2187}
2188template <typename T, typename Index>
2190 const Vec512<Index> index) {
2191 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2192 return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2193}
2194
2196 const float* HWY_RESTRICT base,
2197 const Vec512<int32_t> offset) {
2198 return Vec512<float>{_mm512_i32gather_ps(offset.raw, base, 1)};
2199}
2201 const float* HWY_RESTRICT base,
2202 const Vec512<int32_t> index) {
2203 return Vec512<float>{_mm512_i32gather_ps(index.raw, base, 4)};
2204}
2205
2207 const double* HWY_RESTRICT base,
2208 const Vec512<int64_t> offset) {
2209 return Vec512<double>{_mm512_i64gather_pd(offset.raw, base, 1)};
2210}
2212 const double* HWY_RESTRICT base,
2213 const Vec512<int64_t> index) {
2214 return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
2215}
2216
2217HWY_DIAGNOSTICS(pop)
2218
2219// ================================================== SWIZZLE
2220
2221// ------------------------------ LowerHalf
2222
2223template <typename T>
2225 return Vec256<T>{_mm512_castsi512_si256(v.raw)};
2226}
2228 return Vec256<float>{_mm512_castps512_ps256(v.raw)};
2229}
2231 return Vec256<double>{_mm512_castpd512_pd256(v.raw)};
2232}
2233
2234template <typename T>
2238
2239// ------------------------------ UpperHalf
2240
2241template <typename T>
2243 return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};
2244}
2246 return Vec256<float>{_mm512_extractf32x8_ps(v.raw, 1)};
2247}
2249 return Vec256<double>{_mm512_extractf64x4_pd(v.raw, 1)};
2250}
2251
2252// ------------------------------ ExtractLane (Store)
2253template <typename T>
2254HWY_API T ExtractLane(const Vec512<T> v, size_t i) {
2255 const Full512<T> d;
2256 HWY_DASSERT(i < Lanes(d));
2257 alignas(64) T lanes[64 / sizeof(T)];
2258 Store(v, d, lanes);
2259 return lanes[i];
2260}
2261
2262// ------------------------------ InsertLane (Store)
2263template <typename T>
2264HWY_API Vec512<T> InsertLane(const Vec512<T> v, size_t i, T t) {
2265 const Full512<T> d;
2266 HWY_DASSERT(i < Lanes(d));
2267 alignas(64) T lanes[64 / sizeof(T)];
2268 Store(v, d, lanes);
2269 lanes[i] = t;
2270 return Load(d, lanes);
2271}
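// Both ExtractLane and InsertLane round-trip through memory: the vector is
// stored to an aligned stack buffer, the single lane is read or overwritten,
// and InsertLane reloads the whole vector. Illustrative sketch (hypothetical
// Vec512<int32_t> v):
//   const int32_t lane3 = ExtractLane(v, 3);
//   const Vec512<int32_t> w = InsertLane(v, 3, int32_t{-1});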
2272
2273// ------------------------------ GetLane (LowerHalf)
2274template <typename T>
2276 return GetLane(LowerHalf(v));
2277}
2278
2279// ------------------------------ ZeroExtendVector
2280
2281template <typename T>
2283#if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h.
2284 return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
2285#else
2286 return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
2287#endif
2288}
2290 Vec256<float> lo) {
2291#if HWY_HAVE_ZEXT
2292 return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
2293#else
2294 return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
2295#endif
2296}
2298 Vec256<double> lo) {
2299#if HWY_HAVE_ZEXT
2300 return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
2301#else
2302 return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
2303#endif
2304}
2305
2306// ------------------------------ Combine
2307
2308template <typename T>
2310 const auto lo512 = ZeroExtendVector(d, lo);
2311 return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
2312}
2314 Vec256<float> lo) {
2315 const auto lo512 = ZeroExtendVector(d, lo);
2316 return Vec512<float>{_mm512_insertf32x8(lo512.raw, hi.raw, 1)};
2317}
2319 Vec256<double> lo) {
2320 const auto lo512 = ZeroExtendVector(d, lo);
2321 return Vec512<double>{_mm512_insertf64x4(lo512.raw, hi.raw, 1)};
2322}
2323
2324// ------------------------------ ShiftLeftBytes
2325
2326template <int kBytes, typename T>
2328 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2329 return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};
2330}
2331
2332template <int kBytes, typename T>
2336
2337// ------------------------------ ShiftLeftLanes
2338
2339template <int kLanes, typename T>
2341 const Repartition<uint8_t, decltype(d)> d8;
2342 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2343}
2344
2345template <int kLanes, typename T>
2346HWY_API Vec512<T> ShiftLeftLanes(const Vec512<T> v) {
2347 return ShiftLeftLanes<kLanes>(Full512<T>(), v);
2348}
2349
2350// ------------------------------ ShiftRightBytes
2351template <int kBytes, typename T>
2353 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2354 return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};
2355}
2356
2357// ------------------------------ ShiftRightLanes
2358template <int kLanes, typename T>
2360 const Repartition<uint8_t, decltype(d)> d8;
2361 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2362}
2363
2364// ------------------------------ CombineShiftRightBytes
2365
2366template <int kBytes, typename T, class V = Vec512<T>>
2368 const Repartition<uint8_t, decltype(d)> d8;
2369 return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8(
2370 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2371}
2372
2373// ------------------------------ Broadcast/splat any lane
2374
2375// Unsigned
2376template <int kLane>
2378 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2379 if (kLane < 4) {
2380 const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2381 return Vec512<uint16_t>{_mm512_unpacklo_epi64(lo, lo)};
2382 } else {
2383 const __m512i hi =
2384 _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2385 return Vec512<uint16_t>{_mm512_unpackhi_epi64(hi, hi)};
2386 }
2387}
2388template <int kLane>
2390 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2391 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2392 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2393}
2394template <int kLane>
2396 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2397 constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2398 return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2399}
2400
2401// Signed
2402template <int kLane>
2404 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2405 if (kLane < 4) {
2406 const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2407 return Vec512<int16_t>{_mm512_unpacklo_epi64(lo, lo)};
2408 } else {
2409 const __m512i hi =
2410 _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2411 return Vec512<int16_t>{_mm512_unpackhi_epi64(hi, hi)};
2412 }
2413}
2414template <int kLane>
2416 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2417 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2418 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2419}
2420template <int kLane>
2422 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2423 constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2424 return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2425}
2426
2427// Float
2428template <int kLane>
2430 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2431 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2432 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)};
2433}
2434template <int kLane>
2436 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2437 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
2438 return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)};
2439}
2440
2441// ------------------------------ Hard-coded shuffles
2442
2443// Notation: let Vec512<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2444// least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2445// right (the previous least-significant lane is now most-significant =>
2446// 47650321). These could also be implemented via CombineShiftRightBytes but
2447// the shuffle_abcd notation is more convenient.
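// Worked example (illustrative): within each 128-bit block holding u32 lanes
// 3,2,1,0 (0 = least-significant), Shuffle2301 yields 2,3,0,1, Shuffle0321
// yields 0,3,2,1 (rotate right) and Shuffle0123 reverses the block to
// 0,1,2,3; the same permutation repeats in all four blocks of the vector.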
2448
2449// Swap 32-bit halves in 64-bit halves.
2450template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2452 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
2453}
2455 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)};
2456}
2457
2458namespace detail {
2459
2460template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2462 const Full512<T> d;
2463 const RebindToFloat<decltype(d)> df;
2464 return BitCast(
2465 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2466 _MM_PERM_CDAB)});
2467}
2468template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2470 const Full512<T> d;
2471 const RebindToFloat<decltype(d)> df;
2472 return BitCast(
2473 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2474 _MM_PERM_BCDA)});
2475}
2476template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2478 const Full512<T> d;
2479 const RebindToFloat<decltype(d)> df;
2480 return BitCast(
2481 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2482 _MM_PERM_DABC)});
2483}
2484
2485} // namespace detail
2486
2487// Swap 64-bit halves
2489 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2490}
2492 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2493}
2495 // Shorter encoding than _mm512_permute_ps.
2496 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)};
2497}
2499 return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2500}
2502 return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2503}
2505 // Shorter encoding than _mm512_permute_pd.
2506 return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)};
2507}
2508
2509// Rotate right 32 bits
2511 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2512}
2514 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2515}
2517 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)};
2518}
2519// Rotate left 32 bits
2521 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2522}
2524 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2525}
2527 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)};
2528}
2529
2530// Reverse
2532 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2533}
2535 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2536}
2538 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)};
2539}
2540
2541// ------------------------------ TableLookupLanes
2542
2543// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2544template <typename T>
2545struct Indices512 {
2546 __m512i raw;
2547};
2548
2549template <typename T, typename TI>
2550HWY_API Indices512<T> IndicesFromVec(Full512<T> /* tag */, Vec512<TI> vec) {
2551 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2552#if HWY_IS_DEBUG_BUILD
2553 const Full512<TI> di;
2554 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2555 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T))))));
2556#endif
2557 return Indices512<T>{vec.raw};
2558}
2559
2560template <typename T, typename TI>
2561HWY_API Indices512<T> SetTableIndices(const Full512<T> d, const TI* idx) {
2562 const Rebind<TI, decltype(d)> di;
2563 return IndicesFromVec(d, LoadU(di, idx));
2564}
2565
2566template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2567HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
2568 return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
2569}
2570
2571template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2572HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
2573 return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
2574}
2575
2576HWY_API Vec512<float> TableLookupLanes(Vec512<float> v, Indices512<float> idx) {
2577 return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
2578}
2579
2580HWY_API Vec512<double> TableLookupLanes(Vec512<double> v,
2581 Indices512<double> idx) {
2582 return Vec512<double>{_mm512_permutexvar_pd(idx.raw, v.raw)};
2583}
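// Usage sketch (illustrative, not part of the original header): rotate all 16
// float lanes down by one position via a runtime index vector.
//   const Full512<float> d;
//   alignas(64) const int32_t idx[16] = {1, 2,  3,  4,  5,  6,  7,  8,
//                                        9, 10, 11, 12, 13, 14, 15, 0};
//   const Vec512<float> rotated = TableLookupLanes(v, SetTableIndices(d, idx));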
2584
2585// ------------------------------ Reverse
2586
2587template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2589 const RebindToSigned<decltype(d)> di;
2590 alignas(64) constexpr int16_t kReverse[32] = {
2591 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
2592 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
2593 const Vec512<int16_t> idx = Load(di, kReverse);
2594 return BitCast(d, Vec512<int16_t>{
2595 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2596}
2597
2598template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2599HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2600 alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
2601 7, 6, 5, 4, 3, 2, 1, 0};
2602 return TableLookupLanes(v, SetTableIndices(d, kReverse));
2603}
2604
2605template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2606HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2607 alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2608 return TableLookupLanes(v, SetTableIndices(d, kReverse));
2609}
2610
2611// ------------------------------ Reverse2
2612
2613template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2615 const Full512<uint32_t> du32;
2616 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2617}
2618
2619template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2620HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
2621 return Shuffle2301(v);
2622}
2623
2624template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2625HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
2626 return Shuffle01(v);
2627}
2628
2629// ------------------------------ Reverse4
2630
2631template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2633 const RebindToSigned<decltype(d)> di;
2634 alignas(64) constexpr int16_t kReverse4[32] = {
2635 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
2636 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
2637 const Vec512<int16_t> idx = Load(di, kReverse4);
2638 return BitCast(d, Vec512<int16_t>{
2639 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2640}
2641
2642template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2643HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
2644 return Shuffle0123(v);
2645}
2646
2647template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2648HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
2649 return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2650}
2652 return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2653}
2654
2655// ------------------------------ Reverse8
2656
2657template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2659 const RebindToSigned<decltype(d)> di;
2660 alignas(64) constexpr int16_t kReverse8[32] = {
2661 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
2662 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
2663 const Vec512<int16_t> idx = Load(di, kReverse8);
2664 return BitCast(d, Vec512<int16_t>{
2665 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2666}
2667
2668template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2669HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
2670 const RebindToSigned<decltype(d)> di;
2671 alignas(64) constexpr int32_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
2672 15, 14, 13, 12, 11, 10, 9, 8};
2673 const Vec512<int32_t> idx = Load(di, kReverse8);
2674 return BitCast(d, Vec512<int32_t>{
2675 _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
2676}
2677
2678template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2679HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
2680 return Reverse(d, v);
2681}
2682
2683// ------------------------------ InterleaveLower
2684
2685// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2686// the least-significant lane) and "b". To concatenate two half-width integers
2687// into one, use ZipLower/Upper instead (also works with scalar).
2688
2690 const Vec512<uint8_t> b) {
2691 return Vec512<uint8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2692}
2694 const Vec512<uint16_t> b) {
2695 return Vec512<uint16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2696}
2698 const Vec512<uint32_t> b) {
2699 return Vec512<uint32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2700}
2702 const Vec512<uint64_t> b) {
2703 return Vec512<uint64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2704}
2705
2707 const Vec512<int8_t> b) {
2708 return Vec512<int8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2709}
2711 const Vec512<int16_t> b) {
2712 return Vec512<int16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2713}
2715 const Vec512<int32_t> b) {
2716 return Vec512<int32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2717}
2719 const Vec512<int64_t> b) {
2720 return Vec512<int64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2721}
2722
2724 const Vec512<float> b) {
2725 return Vec512<float>{_mm512_unpacklo_ps(a.raw, b.raw)};
2726}
2728 const Vec512<double> b) {
2729 return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
2730}
2731
2732// ------------------------------ InterleaveUpper
2733
2734// All functions inside detail lack the required D parameter.
2735namespace detail {
2736
2738 const Vec512<uint8_t> b) {
2739 return Vec512<uint8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2740}
2742 const Vec512<uint16_t> b) {
2743 return Vec512<uint16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2744}
2746 const Vec512<uint32_t> b) {
2747 return Vec512<uint32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2748}
2750 const Vec512<uint64_t> b) {
2751 return Vec512<uint64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2752}
2753
2755 const Vec512<int8_t> b) {
2756 return Vec512<int8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2757}
2759 const Vec512<int16_t> b) {
2760 return Vec512<int16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2761}
2763 const Vec512<int32_t> b) {
2764 return Vec512<int32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2765}
2767 const Vec512<int64_t> b) {
2768 return Vec512<int64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2769}
2770
2772 const Vec512<float> b) {
2773 return Vec512<float>{_mm512_unpackhi_ps(a.raw, b.raw)};
2774}
2776 const Vec512<double> b) {
2777 return Vec512<double>{_mm512_unpackhi_pd(a.raw, b.raw)};
2778}
2779
2780} // namespace detail
2781
2782template <typename T, class V = Vec512<T>>
2783HWY_API V InterleaveUpper(Full512<T> /* tag */, V a, V b) {
2784 return detail::InterleaveUpper(a, b);
2785}
2786
2787// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2788
2789// Same as Interleave*, except that the return lanes are double-width integers;
2790// this is necessary because the single-lane scalar cannot return two values.
2791template <typename T, typename TW = MakeWide<T>>
2792HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) {
2793 return BitCast(Full512<TW>(), InterleaveLower(a, b));
2794}
2795template <typename T, typename TW = MakeWide<T>>
2796HWY_API Vec512<TW> ZipLower(Full512<TW> /* tag */, Vec512<T> a, Vec512<T> b) {
2797 return BitCast(Full512<TW>(), InterleaveLower(a, b));
2798}
2799
2800template <typename T, typename TW = MakeWide<T>>
2801HWY_API Vec512<TW> ZipUpper(Full512<TW> d, Vec512<T> a, Vec512<T> b) {
2802 return BitCast(d, InterleaveUpper(Full512<T>(), a, b));
2803}
2804
2805// ------------------------------ Concat* halves
2806
2807// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2808template <typename T>
2810 const Vec512<T> lo) {
2811 return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2812}
2814 const Vec512<float> hi,
2815 const Vec512<float> lo) {
2816 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2817}
2819 const Vec512<double> hi,
2820 const Vec512<double> lo) {
2821 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)};
2822}
2823
2824// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
2825template <typename T>
2827 const Vec512<T> lo) {
2828 return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2829}
2831 const Vec512<float> hi,
2832 const Vec512<float> lo) {
2833 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2834}
2836 const Vec512<double> hi,
2837 const Vec512<double> lo) {
2838 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)};
2839}
2840
2841// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
2842template <typename T>
2844 const Vec512<T> lo) {
2845 return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
2846}
2848 const Vec512<float> hi,
2849 const Vec512<float> lo) {
2850 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
2851}
2853 const Vec512<double> hi,
2854 const Vec512<double> lo) {
2855 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
2856}
2857
2858// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
2859template <typename T>
2861 const Vec512<T> lo) {
2862 // There is no imm8 blend in AVX512. Use blend16 because 32-bit masks
2863 // are efficiently loaded from 32-bit regs.
2864 const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
2865 return Vec512<T>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
2866}
2868 const Vec512<float> hi,
2869 const Vec512<float> lo) {
2870 const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF);
2871 return Vec512<float>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
2872}
2874 const Vec512<double> hi,
2875 const Vec512<double> lo) {
2876 const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F);
2877 return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)};
2878}
2879
2880// ------------------------------ ConcatOdd
2881
2882template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2884 const RebindToUnsigned<decltype(d)> du;
2885#if HWY_TARGET == HWY_AVX3_DL
2886 alignas(64) constexpr uint8_t kIdx[64] = {
2887 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
2888 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51,
2889 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77,
2890 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103,
2891 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
2892 return BitCast(d,
2893 Vec512<uint8_t>{_mm512_mask2_permutex2var_epi8(
2894 BitCast(du, lo).raw, Load(du, kIdx).raw,
2895 __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
2896#else
2897 const RepartitionToWide<decltype(du)> dw;
2898 // Right-shift 8 bits per u16 so we can pack.
2899 const Vec512<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
2900 const Vec512<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
2901 const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
2902 // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
2903 const Full512<uint64_t> du64;
2904 alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
2905 return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
2906#endif
2907}
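// Sketch of the non-VBMI path above (illustrative): per 128-bit block,
// _mm512_packus_epi16 emits 8 bytes from `lo` followed by 8 bytes from `hi`,
// so the packed vector's even u64 lanes hold lo-pieces and its odd u64 lanes
// hold hi-pieces. The final TableLookupLanes with {0,2,4,6,1,3,5,7} gathers
// the lo-pieces into the lower half and the hi-pieces into the upper half,
// which is exactly the ConcatOdd layout.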
2908
2909template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2910HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2911 const RebindToUnsigned<decltype(d)> du;
2912 alignas(64) constexpr uint16_t kIdx[32] = {
2913 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2914 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
2915 return BitCast(d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
2916 BitCast(du, lo).raw, Load(du, kIdx).raw,
2917 __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
2918}
2919
2920template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2921HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2922 const RebindToUnsigned<decltype(d)> du;
2923 alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2924 17, 19, 21, 23, 25, 27, 29, 31};
2925 return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2926 BitCast(du, lo).raw, Load(du, kIdx).raw,
2927 __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2928}
2929
2931 Vec512<float> lo) {
2932 const RebindToUnsigned<decltype(d)> du;
2933 alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2934 17, 19, 21, 23, 25, 27, 29, 31};
2935 return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2936 __mmask16{0xFFFF}, hi.raw)};
2937}
2938
2939template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2940HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2941 const RebindToUnsigned<decltype(d)> du;
2942 alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2943 return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2944 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2945 BitCast(du, hi).raw)});
2946}
2947
2949 Vec512<double> lo) {
2950 const RebindToUnsigned<decltype(d)> du;
2951 alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2952 return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2953 __mmask8{0xFF}, hi.raw)};
2954}
2955
2956// ------------------------------ ConcatEven
2957
2958template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2960 const RebindToUnsigned<decltype(d)> du;
2961#if HWY_TARGET == HWY_AVX3_DL
2962 alignas(64) constexpr uint8_t kIdx[64] = {
2963 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
2964 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
2965 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
2966 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
2967 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
2968 return BitCast(d,
2969 Vec512<uint32_t>{_mm512_mask2_permutex2var_epi8(
2970 BitCast(du, lo).raw, Load(du, kIdx).raw,
2971 __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
2972#else
2973 const RepartitionToWide<decltype(du)> dw;
2974 // Isolate lower 8 bits per u16 so we can pack.
2975 const Vec512<uint16_t> mask = Set(dw, 0x00FF);
2976 const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask);
2977 const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask);
2978 const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
2979 // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
2980 const Full512<uint64_t> du64;
2981 alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
2982 return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
2983#endif
2984}
2985
2986template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2987HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2988 const RebindToUnsigned<decltype(d)> du;
2989 alignas(64) constexpr uint16_t kIdx[32] = {
2990 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
2991 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
2992 return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi16(
2993 BitCast(du, lo).raw, Load(du, kIdx).raw,
2994 __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
2995}
2996
2997template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2998HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2999 const RebindToUnsigned<decltype(d)> du;
3000 alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3001 16, 18, 20, 22, 24, 26, 28, 30};
3002 return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
3003 BitCast(du, lo).raw, Load(du, kIdx).raw,
3004 __mmask16{0xFFFF}, BitCast(du, hi).raw)});
3005}
3006
3008 Vec512<float> lo) {
3009 const RebindToUnsigned<decltype(d)> du;
3010 alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3011 16, 18, 20, 22, 24, 26, 28, 30};
3012 return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3013 __mmask16{0xFFFF}, hi.raw)};
3014}
3015
3016template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3017HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
3018 const RebindToUnsigned<decltype(d)> du;
3019 alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3020 return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
3021 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3022 BitCast(du, hi).raw)});
3023}
3024
3026 Vec512<double> lo) {
3027 const RebindToUnsigned<decltype(d)> du;
3028 alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3029 return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3030 __mmask8{0xFF}, hi.raw)};
3031}
3032
3033// ------------------------------ DupEven (InterleaveLower)
3034
3035template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3037 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
3038}
3040 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
3041}
3042
3043template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3044HWY_API Vec512<T> DupEven(const Vec512<T> v) {
3045 return InterleaveLower(Full512<T>(), v, v);
3046}
3047
3048// ------------------------------ DupOdd (InterleaveUpper)
3049
3050template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3052 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
3053}
3055 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
3056}
3057
3058template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3059HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
3060 return InterleaveUpper(Full512<T>(), v, v);
3061}
3062
3063// ------------------------------ OddEven
3064
3065template <typename T>
3067 constexpr size_t s = sizeof(T);
3068 constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
3069 return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
3070}
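// Worked example (illustrative): for T = float (4-byte lanes, 16 lanes),
// shift is 48, so the mask is 0x5555 - the even lanes are taken from b and
// the odd lanes from a, i.e. OddEven(a, b) = {b[0], a[1], b[2], a[3], ...}.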
3071
3072// ------------------------------ OddEvenBlocks
3073
3074template <typename T>
3076 return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
3077}
3078
3080 return Vec512<float>{
3081 _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
3082}
3083
3085 return Vec512<double>{
3086 _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
3087}
3088
3089// ------------------------------ SwapAdjacentBlocks
3090
3091template <typename T>
3093 return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
3094}
3095
3097 return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
3098}
3099
3101 return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
3102}
3103
3104// ------------------------------ ReverseBlocks
3105
3106template <typename T>
3108 return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
3109}
3111 return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
3112}
3114 Vec512<double> v) {
3115 return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
3116}
3117
3118// ------------------------------ TableLookupBytes (ZeroExtendVector)
3119
3120// Both full
3121template <typename T, typename TI>
3123 return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)};
3124}
3125
3126// Partial index vector
3127template <typename T, typename TI, size_t NI>
3129 const Full512<TI> d512;
3130 const Half<decltype(d512)> d256;
3131 const Half<decltype(d256)> d128;
3132 // First expand to full 128, then 256, then 512.
3133 const Vec128<TI> from_full{from.raw};
3134 const auto from_512 =
3135 ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
3136 const auto tbl_full = TableLookupBytes(bytes, from_512);
3137 // Shrink to 256, then 128, then partial.
3138 return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
3139}
3140template <typename T, typename TI>
3142 const auto from_512 = ZeroExtendVector(Full512<TI>(), from);
3143 return LowerHalf(Full256<TI>(), TableLookupBytes(bytes, from_512));
3144}
3145
3146// Partial table vector
3147template <typename T, size_t N, typename TI>
3149 const Full512<TI> d512;
3150 const Half<decltype(d512)> d256;
3151 const Half<decltype(d256)> d128;
3152 // First expand to full 128, then 256, then 512.
3153 const Vec128<T> bytes_full{bytes.raw};
3154 const auto bytes_512 =
3155 ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
3156 return TableLookupBytes(bytes_512, from);
3157}
3158template <typename T, typename TI>
3160 const auto bytes_512 = ZeroExtendVector(Full512<T>(), bytes);
3161 return TableLookupBytes(bytes_512, from);
3162}
3163
3164// Partial both are handled by x86_128/256.
3165
3166// ================================================== CONVERT
3167
3168// ------------------------------ Promotions (part w/ narrow lanes -> full)
3169
3170// Unsigned: zero-extend.
3171// Note: these have 3 cycle latency; if inputs are already split across the
3172// 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
3175 return Vec512<uint16_t>{_mm512_cvtepu8_epi16(v.raw)};
3176}
3179 return Vec512<uint32_t>{_mm512_cvtepu8_epi32(v.raw)};
3180}
3183 return Vec512<int16_t>{_mm512_cvtepu8_epi16(v.raw)};
3184}
3187 return Vec512<int32_t>{_mm512_cvtepu8_epi32(v.raw)};
3188}
3191 return Vec512<uint32_t>{_mm512_cvtepu16_epi32(v.raw)};
3192}
3195 return Vec512<int32_t>{_mm512_cvtepu16_epi32(v.raw)};
3196}
3199 return Vec512<uint64_t>{_mm512_cvtepu32_epi64(v.raw)};
3200}
3201
3202// Signed: replicate sign bit.
3203// Note: these have 3 cycle latency; if inputs are already split across the
3204// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
3205// signed shift would be faster.
3207 Vec256<int8_t> v) {
3208 return Vec512<int16_t>{_mm512_cvtepi8_epi16(v.raw)};
3209}
3211 Vec128<int8_t> v) {
3212 return Vec512<int32_t>{_mm512_cvtepi8_epi32(v.raw)};
3213}
3216 return Vec512<int32_t>{_mm512_cvtepi16_epi32(v.raw)};
3217}
3220 return Vec512<int64_t>{_mm512_cvtepi32_epi64(v.raw)};
3221}
3222
3223// Float
3225 const Vec256<float16_t> v) {
3226 return Vec512<float>{_mm512_cvtph_ps(v.raw)};
3227}
3228
3230 const Vec256<bfloat16_t> v) {
3231 const Rebind<uint16_t, decltype(df32)> du16;
3232 const RebindToSigned<decltype(df32)> di32;
3233 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3234}
3235
3237 return Vec512<double>{_mm512_cvtps_pd(v.raw)};
3238}
3239
3241 return Vec512<double>{_mm512_cvtepi32_pd(v.raw)};
3242}
3243
3244// ------------------------------ Demotions (full -> part w/ narrow lanes)
3245
3247 const Vec512<int32_t> v) {
3248 const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
3249
3250 // Compress even u64 lanes into 256 bit.
3251 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3252 const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3253 const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)};
3254 return LowerHalf(even);
3255}
3256
3258 const Vec512<int32_t> v) {
3259 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
3260
3261 // Compress even u64 lanes into 256 bit.
3262 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3263 const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3264 const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
3265 return LowerHalf(even);
3266}
3267
3269 const Vec512<int32_t> v) {
3270 const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
3271 // packus treats the input as signed; we want unsigned. Clear the MSB to get
3272 // unsigned saturation to u8.
3273 const Vec512<int16_t> i16{
3274 _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
3275 const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
3276
3277 alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
3278 const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
3279 const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
3280 return LowerHalf(LowerHalf(fixed));
3281}
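// Numeric example (illustrative): with the masking above, an input lane of
// 300 demotes to 255 (unsigned saturation) and a negative input such as -1
// demotes to 0 instead of wrapping to 0xFF.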
3282
3284 const Vec512<int16_t> v) {
3285 const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)};
3286
3287 // Compress even u64 lanes into 256 bit.
3288 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3289 const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3290 const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3291 return LowerHalf(even);
3292}
3293
3295 const Vec512<int32_t> v) {
3296 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
3297 const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
3298
3299 alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
3300 0, 4, 8, 12, 0, 4, 8, 12};
3301 const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
3302 const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
3303 return LowerHalf(LowerHalf(fixed));
3304}
3305
3307 const Vec512<int16_t> v) {
3308 const Vec512<int8_t> u8{_mm512_packs_epi16(v.raw, v.raw)};
3309
3310 // Compress even u64 lanes into 256 bit.
3311 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3312 const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3313 const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3314 return LowerHalf(even);
3315}
3316
3318 const Vec512<float> v) {
3319 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3320 HWY_DIAGNOSTICS(push)
3321 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3322 return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
3323 HWY_DIAGNOSTICS(pop)
3324}
3325
3327 const Vec512<float> v) {
3328 // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
3329 const Rebind<int32_t, decltype(dbf16)> di32;
3330 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3331 const Rebind<uint16_t, decltype(dbf16)> du16;
3332 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3333 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3334}
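// Illustrative note: bfloat16 is the upper 16 bits of a binary32, so the
// demotion above truncates the mantissa by shifting each f32 pattern right by
// 16 (no rounding) and then narrowing the 32-bit results to 16 bits; e.g.
// 1.0f (0x3F800000) becomes bf16 0x3F80.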
3335
3338 // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16.
3339 const RebindToUnsigned<decltype(dbf16)> du16;
3340 const Repartition<uint32_t, decltype(dbf16)> du32;
3341 const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
3342 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
3343}
3344
3347 return Vec512<int16_t>{_mm512_packs_epi32(a.raw, b.raw)};
3348}
3349
3351 const Vec512<double> v) {
3352 return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
3353}
3354
3356 const Vec512<double> v) {
3357 const auto clamped = detail::ClampF64ToI32Max(Full512<double>(), v);
3358 return Vec256<int32_t>{_mm512_cvttpd_epi32(clamped.raw)};
3359}
3360
3361// For already range-limited input [0, 255].
3363 const Full512<uint32_t> d32;
3364 // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
3365 // lowest 4 bytes.
3366 alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
3367 ~0u};
3368 const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
3369 // Gather the lowest 4 bytes of 4 128-bit blocks.
3370 alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
3371 const Vec512<uint8_t> bytes{
3372 _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
3373 return LowerHalf(LowerHalf(bytes));
3374}
3375
3376// ------------------------------ Truncations
3377
3379 const Vec512<uint64_t> v) {
3380#if HWY_TARGET == HWY_AVX3_DL
3381 (void)d;
3382 const Full512<uint8_t> d8;
3383 alignas(16) static constexpr uint8_t k8From64[16] = {
3384 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
3385 const Vec512<uint8_t> bytes{
3386 _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
3387 return LowerHalf(LowerHalf(LowerHalf(bytes)));
3388#else
3389 const Full512<uint32_t> d32;
3390 alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3391 0, 2, 4, 6, 8, 10, 12, 14};
3392 const Vec512<uint32_t> even{
3393 _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
3394 return TruncateTo(d, LowerHalf(even));
3395#endif
3396}
3397
3399 const Vec512<uint64_t> v) {
3400 const Full512<uint16_t> d16;
3401 alignas(16) static constexpr uint16_t k16From64[8] = {
3402 0, 4, 8, 12, 16, 20, 24, 28};
3403 const Vec512<uint16_t> bytes{
3404 _mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)};
3405 return LowerHalf(LowerHalf(bytes));
3406}
3407
3409 const Vec512<uint64_t> v) {
3410 const Full512<uint32_t> d32;
3411 alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3412 0, 2, 4, 6, 8, 10, 12, 14};
3413 const Vec512<uint32_t> even{
3414 _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
3415 return LowerHalf(even);
3416}
3417
3419 const Vec512<uint32_t> v) {
3420#if HWY_TARGET == HWY_AVX3_DL
3421 const Full512<uint8_t> d8;
3422 alignas(16) static constexpr uint8_t k8From32[16] = {
3423 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
3424 const Vec512<uint8_t> bytes{
3425 _mm512_permutexvar_epi8(LoadDup128(d8, k8From32).raw, v.raw)};
3426#else
3427 const Full512<uint32_t> d32;
3428 // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
3429 // lowest 4 bytes.
3430 alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
3431 ~0u};
3432 const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
3433 // Gather the lowest 4 bytes of 4 128-bit blocks.
3434 alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
3435 const Vec512<uint8_t> bytes{
3436 _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
3437#endif
3438 return LowerHalf(LowerHalf(bytes));
3439}
3440
3442 const Vec512<uint32_t> v) {
3443 const Full512<uint16_t> d16;
3444 alignas(64) static constexpr uint16_t k16From32[32] = {
3445 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3446 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
3447 const Vec512<uint16_t> bytes{
3448 _mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)};
3449 return LowerHalf(bytes);
3450}
3451
3453 const Vec512<uint16_t> v) {
3454#if HWY_TARGET == HWY_AVX3_DL
3455 const Full512<uint8_t> d8;
3456 alignas(64) static constexpr uint8_t k8From16[64] = {
3457 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3458 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
3459 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3460 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
3461 const Vec512<uint8_t> bytes{
3462 _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
3463#else
3464 const Full512<uint32_t> d32;
3465 alignas(16) static constexpr uint32_t k16From32[4] = {
3466 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
3467 const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32));
3468 alignas(64) static constexpr uint32_t kIndex32[16] = {
3469 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
3470 const Vec512<uint8_t> bytes{
3471 _mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)};
3472#endif
3473 return LowerHalf(bytes);
3474}
3475
3476// ------------------------------ Convert integer <=> floating point
3477
3479 const Vec512<int32_t> v) {
3480 return Vec512<float>{_mm512_cvtepi32_ps(v.raw)};
3481}
3482
3484 const Vec512<int64_t> v) {
3485 return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
3486}
3487
3489 const Vec512<uint32_t> v) {
3490 return Vec512<float>{_mm512_cvtepu32_ps(v.raw)};
3491}
3492
3494 const Vec512<uint64_t> v) {
3495 return Vec512<double>{_mm512_cvtepu64_pd(v.raw)};
3496}
3497
3498// Truncates (rounds toward zero).
3500 return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
3501}
3503 return detail::FixConversionOverflow(di, v, _mm512_cvttpd_epi64(v.raw));
3504}
3505
3507 const Full512<int32_t> di;
3508 return detail::FixConversionOverflow(di, v, _mm512_cvtps_epi32(v.raw));
3509}
3510
3511// ================================================== CRYPTO
3512
3513#if !defined(HWY_DISABLE_PCLMUL_AES)
3514
3515// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3516#ifdef HWY_NATIVE_AES
3517#undef HWY_NATIVE_AES
3518#else
3519#define HWY_NATIVE_AES
3520#endif
3521
3523 Vec512<uint8_t> round_key) {
3524#if HWY_TARGET == HWY_AVX3_DL
3525 return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
3526#else
3527 const Full512<uint8_t> d;
3528 const Half<decltype(d)> d2;
3529 return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3530 AESRound(LowerHalf(state), LowerHalf(round_key)));
3531#endif
3532}
3533
3535 Vec512<uint8_t> round_key) {
3536#if HWY_TARGET == HWY_AVX3_DL
3537 return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
3538#else
3539 const Full512<uint8_t> d;
3540 const Half<decltype(d)> d2;
3541 return Combine(d,
3542 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3543 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
3544#endif
3545}
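// Usage sketch (illustrative, not part of the original header): one AES
// round applied to four independent 128-bit blocks at once; `state` and
// `round_key` are hypothetical Vec512<uint8_t> values.
//   const Vec512<uint8_t> mid = AESRound(state, round_key);
//   const Vec512<uint8_t> out = AESLastRound(mid, round_key);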
3546
3548#if HWY_TARGET == HWY_AVX3_DL
3549 return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
3550#else
3551 alignas(64) uint64_t a[8];
3552 alignas(64) uint64_t b[8];
3553 const Full512<uint64_t> d;
3554 const Full128<uint64_t> d128;
3555 Store(va, d, a);
3556 Store(vb, d, b);
3557 for (size_t i = 0; i < 8; i += 2) {
3558 const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
3559 Store(mul, d128, a + i);
3560 }
3561 return Load(d, a);
3562#endif
3563}
3564
3566#if HWY_TARGET == HWY_AVX3_DL
3567 return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
3568#else
3569 alignas(64) uint64_t a[8];
3570 alignas(64) uint64_t b[8];
3571 const Full512<uint64_t> d;
3572 const Full128<uint64_t> d128;
3573 Store(va, d, a);
3574 Store(vb, d, b);
3575 for (size_t i = 0; i < 8; i += 2) {
3576 const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
3577 Store(mul, d128, a + i);
3578 }
3579 return Load(d, a);
3580#endif
3581}
3582
3583#endif // HWY_DISABLE_PCLMUL_AES
3584
3585// ================================================== MISC
3586
3587// Returns a vector with lane i=[0, N) set to "first" + i.
3588template <typename T, typename T2>
3589Vec512<T> Iota(const Full512<T> d, const T2 first) {
3590 HWY_ALIGN T lanes[64 / sizeof(T)];
3591 for (size_t i = 0; i < 64 / sizeof(T); ++i) {
3592 lanes[i] =
3593 AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
3594 }
3595 return Load(d, lanes);
3596}
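// Usage sketch (illustrative, not part of the original header):
//   const Full512<float> d;
//   const Vec512<float> x = Iota(d, 0.0f);  // lanes 0, 1, 2, ..., 15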
3597
3598// ------------------------------ Mask testing
3599
3600// Beware: the suffix indicates the number of mask bits, not lane size!
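// For example, Mask512<uint16_t> covers 32 lanes and is backed by __mmask32,
// which is why the helpers below dispatch on SizeTag<sizeof(T)>: mask64 for
// 1-byte lanes, mask32 for 2-byte, mask16 for 4-byte and mask8 for 8-byte.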
3601
3602namespace detail {
3603
3604template <typename T>
3606#if HWY_COMPILER_HAS_MASK_INTRINSICS
3607 return _kortestz_mask64_u8(mask.raw, mask.raw);
3608#else
3609 return mask.raw == 0;
3610#endif
3611}
3612template <typename T>
3614#if HWY_COMPILER_HAS_MASK_INTRINSICS
3615 return _kortestz_mask32_u8(mask.raw, mask.raw);
3616#else
3617 return mask.raw == 0;
3618#endif
3619}
3620template <typename T>
3622#if HWY_COMPILER_HAS_MASK_INTRINSICS
3623 return _kortestz_mask16_u8(mask.raw, mask.raw);
3624#else
3625 return mask.raw == 0;
3626#endif
3627}
3628template <typename T>
3630#if HWY_COMPILER_HAS_MASK_INTRINSICS
3631 return _kortestz_mask8_u8(mask.raw, mask.raw);
3632#else
3633 return mask.raw == 0;
3634#endif
3635}
3636
3637} // namespace detail
3638
3639template <typename T>
3640HWY_API bool AllFalse(const Full512<T> /* tag */, const Mask512<T> mask) {
3641 return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
3642}
3643
3644namespace detail {
3645
3646template <typename T>
3648#if HWY_COMPILER_HAS_MASK_INTRINSICS
3649 return _kortestc_mask64_u8(mask.raw, mask.raw);
3650#else
3651 return mask.raw == 0xFFFFFFFFFFFFFFFFull;
3652#endif
3653}
3654template <typename T>
3656#if HWY_COMPILER_HAS_MASK_INTRINSICS
3657 return _kortestc_mask32_u8(mask.raw, mask.raw);
3658#else
3659 return mask.raw == 0xFFFFFFFFull;
3660#endif
3661}
3662template <typename T>
3664#if HWY_COMPILER_HAS_MASK_INTRINSICS
3665 return _kortestc_mask16_u8(mask.raw, mask.raw);
3666#else
3667 return mask.raw == 0xFFFFull;
3668#endif
3669}
3670template <typename T>
3672#if HWY_COMPILER_HAS_MASK_INTRINSICS
3673 return _kortestc_mask8_u8(mask.raw, mask.raw);
3674#else
3675 return mask.raw == 0xFFull;
3676#endif
3677}
3678
3679} // namespace detail
3680
3681template <typename T>
3682HWY_API bool AllTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3683 return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
3684}
3685
3686// `p` points to at least 8 readable bytes, not all of which need be valid.
3687template <typename T>
3689 const uint8_t* HWY_RESTRICT bits) {
3690 Mask512<T> mask;
3691 CopyBytes<8 / sizeof(T)>(bits, &mask.raw);
3692 // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3693 return mask;
3694}
3695
3696// `p` points to at least 8 writable bytes.
3697template <typename T>
3698HWY_API size_t StoreMaskBits(const Full512<T> /* tag */, const Mask512<T> mask,
3699 uint8_t* bits) {
3700 const size_t kNumBytes = 8 / sizeof(T);
3701 CopyBytes<kNumBytes>(&mask.raw, bits);
3702 // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3703 return kNumBytes;
3704}
3705
3706template <typename T>
3707HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3708 return PopCount(static_cast<uint64_t>(mask.raw));
3709}
3710
3711template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
3712HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
3713 const Mask512<T> mask) {
3714 return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
3715}
3716
3717template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3718HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
3719 const Mask512<T> mask) {
3720 return Num0BitsBelowLS1Bit_Nonzero64(mask.raw);
3721}
3722
3723template <typename T>
3724HWY_API intptr_t FindFirstTrue(const Full512<T> d, const Mask512<T> mask) {
3725 return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
3726 : intptr_t{-1};
3727}
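// Illustrative example: FindFirstTrue returns the lowest true lane index, or
// -1 if no lane is true. With d = Full512<int32_t>():
//   FindFirstTrue(d, Eq(Iota(d, 0), Set(d, 5)));  // 5
//   FindFirstTrue(d, MaskFromVec(Zero(d)));       // -1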
3728
3729// ------------------------------ Compress
3730
3731// Always implement 8-bit here even if we lack VBMI2 because we can do better
3732// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time).
3733#ifdef HWY_NATIVE_COMPRESS8
3734#undef HWY_NATIVE_COMPRESS8
3735#else
3736#define HWY_NATIVE_COMPRESS8
3737#endif
3738
3739namespace detail {
3740
3741#if HWY_TARGET == HWY_AVX3_DL // VBMI2
3742template <size_t N>
3744 const Mask128<uint8_t, N> mask) {
3745 return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)};
3746}
3748 const Mask256<uint8_t> mask) {
3749 return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)};
3750}
3752 const Mask512<uint8_t> mask) {
3753 return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)};
3754}
3755
3756template <size_t N>
3758 const Mask128<uint16_t, N> mask) {
3759 return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)};
3760}
3762 const Mask256<uint16_t> mask) {
3763 return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)};
3764}
3766 const Mask512<uint16_t> mask) {
3767 return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
3768}
3769
3770template <size_t N>
3773 Simd<uint8_t, N, 0> /* d */,
3774 uint8_t* HWY_RESTRICT unaligned) {
3775 _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
3776}
3778 Full256<uint8_t> /* d */,
3779 uint8_t* HWY_RESTRICT unaligned) {
3780 _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
3781}
3783 Full512<uint8_t> /* d */,
3784 uint8_t* HWY_RESTRICT unaligned) {
3785 _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
3786}
3787
3788template <size_t N>
3791 Simd<uint16_t, N, 0> /* d */,
3792 uint16_t* HWY_RESTRICT unaligned) {
3793 _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
3794}
3796 Full256<uint16_t> /* d */,
3797 uint16_t* HWY_RESTRICT unaligned) {
3798 _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
3799}
3801 Full512<uint16_t> /* d */,
3802 uint16_t* HWY_RESTRICT unaligned) {
3803 _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
3804}
3805
3806#endif // HWY_TARGET == HWY_AVX3_DL
3807
3808template <size_t N>
3810 const Mask128<uint32_t, N> mask) {
3811 return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
3812}
3814 Mask256<uint32_t> mask) {
3815 return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
3816}
3818 Mask512<uint32_t> mask) {
3819 return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
3820}
3821// We use table-based compress for 64-bit lanes, see CompressIsPartition.
3822
3823template <size_t N>
3826 Simd<uint32_t, N, 0> /* d */,
3827 uint32_t* HWY_RESTRICT unaligned) {
3828 _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
3829}
3831 Full256<uint32_t> /* d */,
3832 uint32_t* HWY_RESTRICT unaligned) {
3833 _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
3834}
3836 Full512<uint32_t> /* d */,
3837 uint32_t* HWY_RESTRICT unaligned) {
3838 _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
3839}
3840
3841template <size_t N>
3844 Simd<uint64_t, N, 0> /* d */,
3845 uint64_t* HWY_RESTRICT unaligned) {
3846 _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
3847}
3849 Full256<uint64_t> /* d */,
3850 uint64_t* HWY_RESTRICT unaligned) {
3851 _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
3852}
3854 Full512<uint64_t> /* d */,
3855 uint64_t* HWY_RESTRICT unaligned) {
3856 _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
3857}
3858
3859// For u8x16 and <= u16x16 we can avoid store+load for Compress because there is
3860// only a single compressed vector (u32x16). Other EmuCompress are implemented
3861// after the EmuCompressStore they build upon.
3862template <size_t N>
3864 Mask128<uint8_t, N> mask) {
3865 const Simd<uint8_t, N, 0> d;
3866 const Rebind<uint32_t, decltype(d)> d32;
3867 const auto v0 = PromoteTo(d32, v);
3868
3869 const uint64_t mask_bits{mask.raw};
3870 // Mask type is __mmask16 if v is full 128, else __mmask8.
3871 using M32 = MFromD<decltype(d32)>;
3872 const M32 m0{static_cast<typename M32::Raw>(mask_bits)};
3873 return TruncateTo(d, Compress(v0, m0));
3874}
3875
3876template <size_t N>
3878 Mask128<uint16_t, N> mask) {
3879 const Simd<uint16_t, N, 0> d;
3880 const Rebind<int32_t, decltype(d)> di32;
3881 const RebindToUnsigned<decltype(di32)> du32;
3882 const MFromD<decltype(du32)> mask32{static_cast<__mmask8>(mask.raw)};
3883 // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX.
3884 // Only i32 -> u16 is supported, whereas NativeCompress expects u32.
3885 const VFromD<decltype(du32)> v32 = BitCast(du32, PromoteTo(di32, v));
3886 return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
3887}
3888
3890 Mask256<uint16_t> mask) {
3891 const Full256<uint16_t> d;
3892 const Rebind<int32_t, decltype(d)> di32;
3893 const RebindToUnsigned<decltype(di32)> du32;
3894 const Mask512<uint32_t> mask32{static_cast<__mmask16>(mask.raw)};
3895 const Vec512<uint32_t> v32 = BitCast(du32, PromoteTo(di32, v));
3896 return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
3897}
3898
3899// See above - small-vector EmuCompressStore are implemented via EmuCompress.
3900template <typename T, size_t N>
3902 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
3903 StoreU(EmuCompress(v, mask), d, unaligned);
3904}
3905
3908 uint16_t* HWY_RESTRICT unaligned) {
3909 StoreU(EmuCompress(v, mask), d, unaligned);
3910}
3911
3912// Main emulation logic for wider vectors, starting with EmuCompressStore
3913// it is most convenient to merge pieces using memory (concatenating vectors at
3914// byte offsets is difficult).
3917 uint8_t* HWY_RESTRICT unaligned) {
3918 const uint64_t mask_bits{mask.raw};
3919 const Half<decltype(d)> dh;
3920 const Rebind<uint32_t, decltype(dh)> d32;
3921 const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(v));
3922 const Vec512<uint32_t> v1 = PromoteTo(d32, UpperHalf(dh, v));
3923 const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)};
3924 const Mask512<uint32_t> m1{static_cast<__mmask16>(mask_bits >> 16)};
3925 const Vec128<uint8_t> c0 = TruncateTo(dh, NativeCompress(v0, m0));
3926 const Vec128<uint8_t> c1 = TruncateTo(dh, NativeCompress(v1, m1));
3927 uint8_t* HWY_RESTRICT pos = unaligned;
3928 StoreU(c0, dh, pos);
3929 StoreU(c1, dh, pos + CountTrue(d32, m0));
3930}
3931
3934 uint8_t* HWY_RESTRICT unaligned) {
3935 const uint64_t mask_bits{mask.raw};
3936 const Half<Half<decltype(d)>> dq;
3937 const Rebind<uint32_t, decltype(dq)> d32;
3938 HWY_ALIGN uint8_t lanes[64];
3939 Store(v, d, lanes);
3940 const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(LowerHalf(v)));
3941 const Vec512<uint32_t> v1 = PromoteTo(d32, Load(dq, lanes + 16));
3942 const Vec512<uint32_t> v2 = PromoteTo(d32, Load(dq, lanes + 32));
3943 const Vec512<uint32_t> v3 = PromoteTo(d32, Load(dq, lanes + 48));
3944 const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)};
3945 const Mask512<uint32_t> m1{
3946 static_cast<uint16_t>((mask_bits >> 16) & 0xFFFFu)};
3947 const Mask512<uint32_t> m2{
3948 static_cast<uint16_t>((mask_bits >> 32) & 0xFFFFu)};
3949 const Mask512<uint32_t> m3{static_cast<__mmask16>(mask_bits >> 48)};
3950 const Vec128<uint8_t> c0 = TruncateTo(dq, NativeCompress(v0, m0));
3951 const Vec128<uint8_t> c1 = TruncateTo(dq, NativeCompress(v1, m1));
3952 const Vec128<uint8_t> c2 = TruncateTo(dq, NativeCompress(v2, m2));
3953 const Vec128<uint8_t> c3 = TruncateTo(dq, NativeCompress(v3, m3));
3954 uint8_t* HWY_RESTRICT pos = unaligned;
3955 StoreU(c0, dq, pos);
3956 pos += CountTrue(d32, m0);
3957 StoreU(c1, dq, pos);
3958 pos += CountTrue(d32, m1);
3959 StoreU(c2, dq, pos);
3960 pos += CountTrue(d32, m2);
3961 StoreU(c3, dq, pos);
3962}
3963
3966 uint16_t* HWY_RESTRICT unaligned) {
3967 const Repartition<int32_t, decltype(d)> di32;
3968 const RebindToUnsigned<decltype(di32)> du32;
3969 const Half<decltype(d)> dh;
3970 const Vec512<uint32_t> promoted0 =
3971 BitCast(du32, PromoteTo(di32, LowerHalf(dh, v)));
3972 const Vec512<uint32_t> promoted1 =
3973 BitCast(du32, PromoteTo(di32, UpperHalf(dh, v)));
3974
3975 const uint64_t mask_bits{mask.raw};
3976 const uint64_t maskL = mask_bits & 0xFFFF;
3977 const uint64_t maskH = mask_bits >> 16;
3978 const Mask512<uint32_t> mask0{static_cast<__mmask16>(maskL)};
3979 const Mask512<uint32_t> mask1{static_cast<__mmask16>(maskH)};
3980 const Vec512<uint32_t> compressed0 = NativeCompress(promoted0, mask0);
3981 const Vec512<uint32_t> compressed1 = NativeCompress(promoted1, mask1);
3982
3983 const Vec256<uint16_t> demoted0 = DemoteTo(dh, BitCast(di32, compressed0));
3984 const Vec256<uint16_t> demoted1 = DemoteTo(dh, BitCast(di32, compressed1));
3985
3986 // Store 256-bit halves
3987 StoreU(demoted0, dh, unaligned);
3988 StoreU(demoted1, dh, unaligned + PopCount(maskL));
3989}
3990
3991// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore.
3992template <typename T> // 1 or 2 bytes
3994 const Full512<T> d;
3995 HWY_ALIGN T buf[2 * 64 / sizeof(T)];
3996 EmuCompressStore(v, mask, d, buf);
3997 return Load(d, buf);
3998}
3999
4001 const Mask256<uint8_t> mask) {
4002 const Full256<uint8_t> d;
4003 HWY_ALIGN uint8_t buf[2 * 32 / sizeof(uint8_t)];
4004 EmuCompressStore(v, mask, d, buf);
4005 return Load(d, buf);
4006}
4007
4008} // namespace detail
4009
4010template <class V, class M, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x6)> // 1 or 2 bytes
4011HWY_API V Compress(V v, const M mask) {
4012 const DFromV<decltype(v)> d;
4013 const RebindToUnsigned<decltype(d)> du;
4014 const auto mu = RebindMask(du, mask);
4015#if HWY_TARGET == HWY_AVX3_DL // VBMI2
4016 return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
4017#else
4018 return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
4019#endif
4020}
4021
4022template <class V, class M, HWY_IF_LANE_SIZE_V(V, 4)>
4023HWY_API V Compress(V v, const M mask) {
4024 const DFromV<decltype(v)> d;
4025 const RebindToUnsigned<decltype(d)> du;
4026 const auto mu = RebindMask(du, mask);
4027 return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
4028}
4029
4030template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4032 // See CompressIsPartition. u64 is faster than u32.
4033 alignas(16) constexpr uint64_t packed_array[256] = {
4034 // From PrintCompress32x8Tables, without the FirstN extension (there is
4035 // no benefit to including them because 64-bit CompressStore is anyway
4036 // masked, but also no harm because TableLookupLanes ignores the MSB).
4037 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
4038 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
4039 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
4040 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
4041 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
4042 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
4043 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
4044 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
4045 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
4046 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
4047 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
4048 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
4049 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
4050 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
4051 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
4052 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
4053 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
4054 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
4055 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
4056 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
4057 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
4058 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
4059 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
4060 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
4061 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
4062 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
4063 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
4064 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
4065 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
4066 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
4067 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
4068 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
4069 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
4070 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
4071 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
4072 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
4073 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
4074 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
4075 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
4076 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
4077 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
4078 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
4079 0x10765432, 0x17654320, 0x07654321, 0x76543210};
4080
4081 // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
4082 // _mm512_permutexvar_epi64 will ignore the upper bits.
4083 const Full512<T> d;
4084 const RebindToUnsigned<decltype(d)> du64;
4085 const auto packed = Set(du64, packed_array[mask.raw]);
4086 alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4087 const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
4088 return TableLookupLanes(v, indices);
4089}
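// [Editor's note] Worked example, not part of the original header: for
// mask.raw = 0b00000110 (lanes 1 and 2 selected), packed_array[6] is
// 0x76543021. Read nibble-by-nibble from the least significant end, this is
// the permutation 1,2,0,3,4,5,6,7: the selected lanes come first, then the
// rejected ones in order, which is the CompressIsPartition guarantee. The
// per-lane shifts {0, 4, ..., 28} move nibble i into the low bits of u64
// lane i, i.e. idx[i] = (packed_array[mask_bits] >> (4 * i)) & 0xF, of which
// _mm512_permutexvar_epi64 only reads the low 3 bits.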
4090
4091// ------------------------------ CompressNot
4092
4093template <class V, class M, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
4094HWY_API V CompressNot(V v, const M mask) {
4095 return Compress(v, Not(mask));
4096}
4097
4098template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4099HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
4100 // See CompressIsPartition. u64 is faster than u32.
4101 alignas(16) constexpr uint64_t packed_array[256] = {
4102 // From PrintCompressNot32x8Tables, without the FirstN extension (there is
4103 // no benefit to including them because 64-bit CompressStore is anyway
4104 // masked, but also no harm because TableLookupLanes ignores the MSB).
4105 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
4106 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
4107 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
4108 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
4109 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
4110 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
4111 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
4112 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
4113 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
4114 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
4115 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
4116 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
4117 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
4118 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
4119 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
4120 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
4121 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
4122 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
4123 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
4124 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
4125 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
4126 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
4127 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
4128 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
4129 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
4130 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
4131 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
4132 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
4133 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
4134 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
4135 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
4136 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
4137 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
4138 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
4139 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
4140 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
4141 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
4142 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
4143 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
4144 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
4145 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
4146 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
4147 0x76543210, 0x76543201, 0x76543210, 0x76543210};
4148
4149 // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
4150 // _mm512_permutexvar_epi64 will ignore the upper bits.
4151 const Full512<T> d;
4152 const RebindToUnsigned<decltype(d)> du64;
4153 const auto packed = Set(du64, packed_array[mask.raw]);
4154 alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4155 const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
4156 return TableLookupLanes(v, indices);
4157}
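// [Editor's note] Worked example, not part of the original header: this table
// mirrors the Compress table above. For mask.raw = 0b00000110,
// packed_array[6] is 0x21765430, i.e. the permutation 0,3,4,5,6,7,1,2: the
// lanes NOT selected by the mask come first, then the selected ones, so
// CompressNot(v, mask) matches Compress(v, Not(mask)) without having to
// negate the mask.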
4158
4159// uint64_t lanes. Only implement for 256 and 512-bit vectors because this is a
4160// no-op for 128-bit.
4161template <class V, class M, hwy::EnableIf<(sizeof(V) > 16)>* = nullptr>
4162HWY_API V CompressBlocksNot(V v, M mask) {
4163 return CompressNot(v, mask);
4164}
4165
4166// ------------------------------ CompressBits
4167template <class V>
4168HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
4169 return Compress(v, LoadMaskBits(DFromV<V>(), bits));
4170}
4171
4172// ------------------------------ CompressStore
4173
4174template <class V, class D, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x6)> // 1 or 2 bytes
4175HWY_API size_t CompressStore(V v, MFromD<D> mask, D d,
4176 TFromD<D>* HWY_RESTRICT unaligned) {
4177 const RebindToUnsigned<decltype(d)> du;
4178 const auto mu = RebindMask(du, mask);
4179 auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);
4180#if HWY_TARGET == HWY_AVX3_DL // VBMI2
4181 detail::NativeCompressStore(BitCast(du, v), mu, du, pu);
4182#else
4183 detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
4184#endif
4185 const size_t count = CountTrue(d, mask);
4186 detail::MaybeUnpoison(pu, count);
4187 return count;
4188}
4189
4190template <class V, class D, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x110)> // 4 or 8
4191HWY_API size_t CompressStore(V v, MFromD<D> mask, D d,
4192 TFromD<D>* HWY_RESTRICT unaligned) {
4193 const RebindToUnsigned<decltype(d)> du;
4194 const auto mu = RebindMask(du, mask);
4195 using TU = TFromD<decltype(du)>;
4196 TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
4197 detail::NativeCompressStore(BitCast(du, v), mu, du, pu);
4198 const size_t count = CountTrue(d, mask);
4199 detail::MaybeUnpoison(pu, count);
4200 return count;
4201}
4202
4203// Additional overloads to avoid casting to uint32_t (delay?).
4204HWY_API size_t CompressStore(Vec512<float> v, Mask512<float> mask,
4205 Full512<float> /* tag */,
4206 float* HWY_RESTRICT unaligned) {
4207 _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
4208 const size_t count = PopCount(uint64_t{mask.raw});
4209 detail::MaybeUnpoison(unaligned, count);
4210 return count;
4211}
4212
4213HWY_API size_t CompressStore(Vec512<double> v, Mask512<double> mask,
4214 Full512<double> /* tag */,
4215 double* HWY_RESTRICT unaligned) {
4216 _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
4217 const size_t count = PopCount(uint64_t{mask.raw});
4218 detail::MaybeUnpoison(unaligned, count);
4219 return count;
4220}
4221
4222// ------------------------------ CompressBlendedStore
4223template <class D, typename T = TFromD<D>>
4224HWY_API size_t CompressBlendedStore(Vec512<T> v, Mask512<T> m, D d,
4225 T* HWY_RESTRICT unaligned) {
4226 // Native CompressStore already does the blending at no extra cost (latency
4227 // 11, rthroughput 2 - same as compress plus store).
4228 if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) > 2) {
4229 return CompressStore(v, m, d, unaligned);
4230 } else {
4231 const size_t count = CountTrue(d, m);
4232 BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
4233 detail::MaybeUnpoison(unaligned, count);
4234 return count;
4235 }
4236}
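// [Editor's note] Illustrative usage sketch, not part of the original header;
// v, m and out are hypothetical names. Unlike CompressStore, which may also
// overwrite out[count, Lanes(d)) with unspecified values, CompressBlendedStore
// modifies only the first `count` elements.
#if 0
  const Full512<float> d;
  float out[16];  // Lanes(d) == 16 for float
  const size_t count = CompressBlendedStore(v, m, d, out);
  // out[0, count) now holds the selected lanes of v, in order.
#endif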
4237
4238// ------------------------------ CompressBitsStore
4239template <class D>
4240HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
4241 D d, TFromD<D>* HWY_RESTRICT unaligned) {
4242 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4243}
4244
4245 // ------------------------------ LoadInterleaved3/4
4246
4247 // Actually implemented in generic_ops; we just overload LoadTransposedBlocks3/4.
4248namespace detail {
4249
4250// Type-safe wrapper.
4251template <_MM_PERM_ENUM kPerm, typename T>
4252HWY_INLINE Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) {
4253 return Vec512<T>{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)};
4254}
4255template <_MM_PERM_ENUM kPerm>
4256HWY_INLINE Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) {
4257 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, kPerm)};
4258}
4259template <_MM_PERM_ENUM kPerm>
4260HWY_INLINE Vec512<double> Shuffle128(const Vec512<double> lo, const Vec512<double> hi) {
4261 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, kPerm)};
4262}
4263
4264// Input (128-bit blocks):
4265// 3 2 1 0 (<- first block in unaligned)
4266// 7 6 5 4
4267// b a 9 8
4268// Output:
4269// 9 6 3 0 (LSB of A)
4270// a 7 4 1
4271// b 8 5 2
4272template <typename T>
4273HWY_API void LoadTransposedBlocks3(Full512<T> d,
4274 const T* HWY_RESTRICT unaligned,
4275 Vec512<T>& A, Vec512<T>& B, Vec512<T>& C) {
4276 constexpr size_t N = 64 / sizeof(T);
4277 const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N);
4278 const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N);
4279 const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N);
4280
4281 const Vec512<T> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654);
4282 const Vec512<T> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98);
4283
4284 A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976);
4285 B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976);
4286 C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98);
4287}
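// [Editor's note] Illustrative usage sketch, not part of the original header.
// This overload is the 512-bit block-transpose step behind LoadInterleaved3
// in generic_ops-inl.h, e.g. for de-interleaving packed RGB samples; r, g, b
// and rgb are hypothetical names.
#if 0
  const Full512<uint8_t> d8;
  Vec512<uint8_t> r, g, b;
  LoadInterleaved3(d8, rgb, r, g, b);  // rgb points to 3 * 64 interleaved bytes
#endif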
4288
4289// Input (128-bit blocks):
4290// 3 2 1 0 (<- first block in unaligned)
4291// 7 6 5 4
4292// b a 9 8
4293// f e d c
4294// Output:
4295// c 8 4 0 (LSB of A)
4296// d 9 5 1
4297// e a 6 2
4298// f b 7 3
4299template <typename T>
4300HWY_API void LoadTransposedBlocks4(Full512<T> d,
4301 const T* HWY_RESTRICT unaligned,
4302 Vec512<T>& A, Vec512<T>& B, Vec512<T>& C,
4303 Vec512<T>& D) {
4304 constexpr size_t N = 64 / sizeof(T);
4305 const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N);
4306 const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N);
4307 const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N);
4308 const Vec512<T> vfedc = LoadU(d, unaligned + 3 * N);
4309
4310 const Vec512<T> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654);
4311 const Vec512<T> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc);
4312 const Vec512<T> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654);
4313 const Vec512<T> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc);
4314 A = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98);
4315 B = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98);
4316 C = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba);
4317 D = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba);
4318}
4319
4320} // namespace detail
4321
4322 // ------------------------------ StoreInterleaved2/3/4
4323
4324// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
4325
4326namespace detail {
4327
4328// Input (128-bit blocks):
4329// 6 4 2 0 (LSB of i)
4330// 7 5 3 1
4331// Output:
4332// 3 2 1 0
4333// 7 6 5 4
4334template <typename T>
4335HWY_API void StoreTransposedBlocks2(const Vec512<T> i, const Vec512<T> j,
4336 const Full512<T> d,
4337 T* HWY_RESTRICT unaligned) {
4338 constexpr size_t N = 64 / sizeof(T);
4339 const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
4340 const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
4341 const auto j1_i1_j0_i0 =
4342 detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0);
4343 const auto j3_i3_j2_i2 =
4344 detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2);
4345 StoreU(j1_i1_j0_i0, d, unaligned + 0 * N);
4346 StoreU(j3_i3_j2_i2, d, unaligned + 1 * N);
4347}
4348
4349// Input (128-bit blocks):
4350// 9 6 3 0 (LSB of i)
4351// a 7 4 1
4352// b 8 5 2
4353// Output:
4354// 3 2 1 0
4355// 7 6 5 4
4356// b a 9 8
4357template <typename T>
4358HWY_API void StoreTransposedBlocks3(const Vec512<T> i, const Vec512<T> j,
4359 const Vec512<T> k, Full512<T> d,
4360 T* HWY_RESTRICT unaligned) {
4361 constexpr size_t N = 64 / sizeof(T);
4362 const Vec512<T> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
4363 const Vec512<T> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
4364 const Vec512<T> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
4365
4366 const Vec512<T> out0 = // i1 k0 j0 i0
4367 detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0);
4368 const Vec512<T> out1 = // j2 i2 k1 j1
4369 detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0);
4370 const Vec512<T> out2 = // k3 j3 i3 k2
4371 detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1);
4372
4373 StoreU(out0, d, unaligned + 0 * N);
4374 StoreU(out1, d, unaligned + 1 * N);
4375 StoreU(out2, d, unaligned + 2 * N);
4376}
4377
4378// Input (128-bit blocks):
4379// c 8 4 0 (LSB of i)
4380// d 9 5 1
4381// e a 6 2
4382// f b 7 3
4383// Output:
4384// 3 2 1 0
4385// 7 6 5 4
4386// b a 9 8
4387// f e d c
4388template <typename T>
4389HWY_API void StoreTransposedBlocks4(const Vec512<T> i, const Vec512<T> j,
4390 const Vec512<T> k, const Vec512<T> l,
4391 Full512<T> d, T* HWY_RESTRICT unaligned) {
4392 constexpr size_t N = 64 / sizeof(T);
4393 const Vec512<T> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
4394 const Vec512<T> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
4395 const Vec512<T> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
4396 const Vec512<T> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l);
4397 const Vec512<T> out0 =
4398 detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0);
4399 const Vec512<T> out1 =
4400 detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0);
4401 const Vec512<T> out2 =
4402 detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2);
4403 const Vec512<T> out3 =
4404 detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2);
4405 StoreU(out0, d, unaligned + 0 * N);
4406 StoreU(out1, d, unaligned + 1 * N);
4407 StoreU(out2, d, unaligned + 2 * N);
4408 StoreU(out3, d, unaligned + 3 * N);
4409}
4410
4411} // namespace detail
4412
4413// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
4414
4415HWY_API Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
4416 const Vec512<uint64_t> b) {
4417 const Full512<uint64_t> du64;
4418 const RepartitionToNarrow<decltype(du64)> du32;
4419 const auto maskL = Set(du64, 0xFFFFFFFFULL);
4420 const auto a32 = BitCast(du32, a);
4421 const auto b32 = BitCast(du32, b);
4422 // Inputs for MulEven: we only need the lower 32 bits
4423 const auto aH = Shuffle2301(a32);
4424 const auto bH = Shuffle2301(b32);
4425
4426 // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
4427 // the even (lower 64 bits of every 128-bit block) results. See
4428 // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
4429 const auto aLbL = MulEven(a32, b32);
4430 const auto w3 = aLbL & maskL;
4431
4432 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
4433 const auto w2 = t2 & maskL;
4434 const auto w1 = ShiftRight<32>(t2);
4435
4436 const auto t = MulEven(a32, bH) + w2;
4437 const auto k = ShiftRight<32>(t);
4438
4439 const auto mulH = MulEven(aH, bH) + w1 + k;
4440 const auto mulL = ShiftLeft<32>(t) + w3;
4441 return InterleaveLower(mulL, mulH);
4442}
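// [Editor's note] Worked example, not part of the original header: with
// a = 2^32 + 5 and b = 2^32 + 7 in one block, aLbL = 35 so w3 = 35;
// t2 = aH*bL + (aLbL >> 32) = 7, hence w2 = 7 and w1 = 0; t = aL*bH + w2 = 12
// and k = 0; thus mulH = aH*bH + w1 + k = 1 and mulL = (t << 32) + w3 =
// 12*2^32 + 35. Indeed (2^32 + 5) * (2^32 + 7) = 2^64 + 12*2^32 + 35, i.e.
// {mulL, mulH} is the full 128-bit product assembled by InterleaveLower.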
4443
4444HWY_API Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
4445 const Vec512<uint64_t> b) {
4446 const Full512<uint64_t> du64;
4447 const RepartitionToNarrow<decltype(du64)> du32;
4448 const auto maskL = Set(du64, 0xFFFFFFFFULL);
4449 const auto a32 = BitCast(du32, a);
4450 const auto b32 = BitCast(du32, b);
4451 // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
4452 const auto aH = Shuffle2301(a32);
4453 const auto bH = Shuffle2301(b32);
4454
4455 // Same as above, but we're using the odd results (upper 64 bits per block).
4456 const auto aLbL = MulEven(a32, b32);
4457 const auto w3 = aLbL & maskL;
4458
4459 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
4460 const auto w2 = t2 & maskL;
4461 const auto w1 = ShiftRight<32>(t2);
4462
4463 const auto t = MulEven(a32, bH) + w2;
4464 const auto k = ShiftRight<32>(t);
4465
4466 const auto mulH = MulEven(aH, bH) + w1 + k;
4467 const auto mulL = ShiftLeft<32>(t) + w3;
4468 return InterleaveUpper(du64, mulL, mulH);
4469}
4470
4471// ------------------------------ ReorderWidenMulAccumulate
4472HWY_API Vec512<int32_t> ReorderWidenMulAccumulate(Full512<int32_t> /*d32*/,
4473 Vec512<int16_t> a,
4474 Vec512<int16_t> b,
4475 const Vec512<int32_t> sum0,
4476 Vec512<int32_t>& /*sum1*/) {
4477 return sum0 + Vec512<int32_t>{_mm512_madd_epi16(a.raw, b.raw)};
4478}
4479
4480HWY_API Vec512<int32_t> RearrangeToOddPlusEven(const Vec512<int32_t> sum0,
4481 Vec512<int32_t> /*sum1*/) {
4482 return sum0; // invariant already holds
4483}
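// [Editor's note, not part of the original header] _mm512_madd_epi16
// (vpmaddwd) already sums adjacent 16x16->32-bit products, so per i32 lane
// the overload above computes
//   sum0[i] += int32(a[2*i]) * b[2*i] + int32(a[2*i+1]) * b[2*i+1];
// which is why sum1 is unused and RearrangeToOddPlusEven is the identity.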
4484
4485// ------------------------------ Reductions
4486
4487// Returns the sum in each lane.
4488HWY_API Vec512<int32_t> SumOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4489 return Set(d, _mm512_reduce_add_epi32(v.raw));
4490}
4491HWY_API Vec512<int64_t> SumOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4492 return Set(d, _mm512_reduce_add_epi64(v.raw));
4493}
4494HWY_API Vec512<uint32_t> SumOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4495 return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
4496}
4497HWY_API Vec512<uint64_t> SumOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4498 return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
4499}
4500HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
4501 return Set(d, _mm512_reduce_add_ps(v.raw));
4502}
4503HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
4504 return Set(d, _mm512_reduce_add_pd(v.raw));
4505}
4506HWY_API Vec512<uint16_t> SumOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
4507 const RepartitionToWide<decltype(d)> d32;
4508 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4509 const auto odd = ShiftRight<16>(BitCast(d32, v));
4510 const auto sum = SumOfLanes(d32, even + odd);
4511 // Also broadcast into odd lanes.
4512 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
4513}
4514HWY_API Vec512<int16_t> SumOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
4515 const RepartitionToWide<decltype(d)> d32;
4516 // Sign-extend
4517 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
4518 const auto odd = ShiftRight<16>(BitCast(d32, v));
4519 const auto sum = SumOfLanes(d32, even + odd);
4520 // Also broadcast into odd lanes.
4521 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
4522}
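// [Editor's note, not part of the original header] The 16-bit overloads above
// widen the even/odd lanes to 32 bits, reduce there (so the partial sums
// cannot overflow), then write the low 16 bits of the total back into both
// halves of each 32-bit lane. Scalar sketch:
//   uint32_t total = 0;
//   for (size_t i = 0; i < 32; ++i) total += v[i];   // 32 u16 lanes
//   for (size_t i = 0; i < 32; ++i) result[i] = static_cast<uint16_t>(total);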
4523
4524// Returns the minimum in each lane.
4525HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4526 return Set(d, _mm512_reduce_min_epi32(v.raw));
4527}
4528HWY_API Vec512<int64_t> MinOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4529 return Set(d, _mm512_reduce_min_epi64(v.raw));
4530}
4531HWY_API Vec512<uint32_t> MinOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4532 return Set(d, _mm512_reduce_min_epu32(v.raw));
4533}
4534HWY_API Vec512<uint64_t> MinOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4535 return Set(d, _mm512_reduce_min_epu64(v.raw));
4536}
4537HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
4538 return Set(d, _mm512_reduce_min_ps(v.raw));
4539}
4540HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
4541 return Set(d, _mm512_reduce_min_pd(v.raw));
4542}
4543HWY_API Vec512<uint16_t> MinOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
4544 const RepartitionToWide<decltype(d)> d32;
4545 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4546 const auto odd = ShiftRight<16>(BitCast(d32, v));
4547 const auto min = MinOfLanes(d32, Min(even, odd));
4548 // Also broadcast into odd lanes.
4549 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
4550}
4551HWY_API Vec512<int16_t> MinOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
4552 const RepartitionToWide<decltype(d)> d32;
4553 // Sign-extend
4554 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
4555 const auto odd = ShiftRight<16>(BitCast(d32, v));
4556 const auto min = MinOfLanes(d32, Min(even, odd));
4557 // Also broadcast into odd lanes.
4558 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
4559}
4560
4561// Returns the maximum in each lane.
4562HWY_API Vec512<int32_t> MaxOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4563 return Set(d, _mm512_reduce_max_epi32(v.raw));
4564}
4565HWY_API Vec512<int64_t> MaxOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4566 return Set(d, _mm512_reduce_max_epi64(v.raw));
4567}
4568HWY_API Vec512<uint32_t> MaxOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4569 return Set(d, _mm512_reduce_max_epu32(v.raw));
4570}
4571HWY_API Vec512<uint64_t> MaxOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4572 return Set(d, _mm512_reduce_max_epu64(v.raw));
4573}
4574HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
4575 return Set(d, _mm512_reduce_max_ps(v.raw));
4576}
4577HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
4578 return Set(d, _mm512_reduce_max_pd(v.raw));
4579}
4580HWY_API Vec512<uint16_t> MaxOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
4581 const RepartitionToWide<decltype(d)> d32;
4582 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4583 const auto odd = ShiftRight<16>(BitCast(d32, v));
4584 const auto max = MaxOfLanes(d32, Max(even, odd));
4585 // Also broadcast into odd lanes.
4586 return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
4587}
4588HWY_API Vec512<int16_t> MaxOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
4589 const RepartitionToWide<decltype(d)> d32;
4590 // Sign-extend
4591 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
4592 const auto odd = ShiftRight<16>(BitCast(d32, v));
4593 const auto max = MaxOfLanes(d32, Max(even, odd));
4594 // Also broadcast into odd lanes.
4595 return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
4596}
4597
4598// NOLINTNEXTLINE(google-readability-namespace-comments)
4599} // namespace HWY_NAMESPACE
4600} // namespace hwy
4601HWY_AFTER_NAMESPACE();
4602
4603// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
4604// the warning seems to be issued at the call site of intrinsics, i.e. our code.
4605HWY_DIAGNOSTICS(pop)