// Per-target include guard: this header is re-included once per HWY_TARGET.
#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#endif

// ...
// Primary templates; specialized below for each number of bits.
template <size_t kBits>
struct Pack8 {};
template <size_t kBits>
struct Pack16 {};
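// Illustrative sketch (not part of the original header): how one of the
// specializations below is typically driven. The Pack/Unpack member shapes
// match the definitions in this file; the `detail` qualifier and the
// surrounding boilerplate are assumptions for the example.
//
//   #include "hwy/contrib/bit_pack/bit_pack-inl.h"
//   HWY_BEFORE_NAMESPACE();
//   namespace hwy {
//   namespace HWY_NAMESPACE {
//
//   // Round-trips 8 * Lanes(d8) values (each < 2^3) through the 3-bit
//   // packer, which writes 3 * Lanes(d8) packed bytes.
//   void RoundTrip3Bit(const uint8_t* HWY_RESTRICT raw,
//                      uint8_t* HWY_RESTRICT packed,
//                      uint8_t* HWY_RESTRICT out) {
//     const ScalableTag<uint8_t> d8;
//     detail::Pack8<3>().Pack(d8, raw, packed);
//     detail::Pack8<3>().Unpack(d8, packed, out);
//   }
//
//   }  // namespace HWY_NAMESPACE
//   }  // namespace hwy
//   HWY_AFTER_NAMESPACE();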
// Pack8<1>::Pack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    // ...

// Pack8<1>::Unpack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0101u);
    // ...
    const VU16 raw0 = And(packed, mask);
    // ...
// Pack8<2>::Pack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
    // ...

// Pack8<2>::Unpack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0303u);
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 raw0 = And(packed0, mask);
    // ...
    const VU16 raw1 = And(packed1, mask);
    // ...
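// Scalar sketch of the mask-and-shift idiom seen above: one byte holds four
// 2-bit fields, each recovered by shifting down and masking with 0x3 (the
// vector code applies the same mask per byte via the replicated constant
// 0x0303). Field ordering here is illustrative only and does not reproduce
// the transposed lane layout produced by the vector code.
//
//   inline uint8_t PackFour2Bit(const uint8_t v[4]) {  // each v[i] < 4
//     return static_cast<uint8_t>(v[0] | (v[1] << 2) | (v[2] << 4) |
//                                 (v[3] << 6));
//   }
//   inline void UnpackFour2Bit(uint8_t b, uint8_t v[4]) {
//     for (int i = 0; i < 4; ++i) v[i] = (b >> (2 * i)) & 0x3;
//   }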
// Pack8<3>::Pack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
    // ...
    const VU16 hi2 = Set(d16, 0xC0C0u);
    // ...

// Pack8<3>::Unpack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0707u);
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    // ...
    const VU16 hi2 = Set(d16, 0xC0C0u);
    // ...
    const VU16 raw3 = And(mask, raw73);
    // ...
// Pack8<4>::Pack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
    // ...

// Pack8<4>::Unpack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0F0Fu);
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    // ...
    const VU16 raw4 = And(packed2, mask);
    const VU16 raw5 = And(packed3, mask);
    // ...
// Pack8<5>::Pack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
    const VU16 hi3 = Set(d16, 0xE0E0u);
    // ...
    const VU16 lo2 = Set(d16, 0x0303u);
    // ...

// Pack8<5>::Unpack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
    const VU16 mask = Set(d16, 0x1F1Fu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    // ...
    const VU16 lo2 = Set(d16, 0x0303u);
    const VU16 raw4 = OrAnd(top4, lo2, packed4);
    // ...
// Pack8<6>::Pack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
    const VU16 hi2 = Set(d16, 0xC0C0u);
    // ...

// Pack8<6>::Unpack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x3F3Fu);
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
    const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw4 = And(packed3, mask);
    const VU16 raw5 = And(packed4, mask);
    const VU16 raw6 = And(packed5, mask);
    // ...
// Pack8<7>::Pack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
    const VU16 hi1 = Set(d16, 0x8080u);
    const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1);
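    // Note (inferred, not from the original comments): Add(raw7, raw7)
    // doubles each 16-bit lane, i.e. shifts it left by one bit, and OrAnd
    // then keeps only the bits selected by hi1 (0x8080), so one extra bit
    // per byte of raw7 is merged into packed0 on top of raw0's seven bits.
    // ...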
// Pack8<7>::Unpack:
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
    const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
    const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8));
    const VU16 mask = Set(d16, 0x7F7Fu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    const VU16 raw4 = And(packed4, mask);
    const VU16 raw5 = And(packed5, mask);
    const VU16 raw6 = And(packed6, mask);
    // ...
// Pack8<8>::Pack:
    using VU8 = Vec<decltype(d8)>;
    const size_t N8 = Lanes(d8);
    const VU8 raw0 = LoadU(d8, raw + 0 * N8);
    const VU8 raw1 = LoadU(d8, raw + 1 * N8);
    const VU8 raw2 = LoadU(d8, raw + 2 * N8);
    const VU8 raw3 = LoadU(d8, raw + 3 * N8);
    const VU8 raw4 = LoadU(d8, raw + 4 * N8);
    const VU8 raw5 = LoadU(d8, raw + 5 * N8);
    const VU8 raw6 = LoadU(d8, raw + 6 * N8);
    const VU8 raw7 = LoadU(d8, raw + 7 * N8);
    StoreU(raw0, d8, packed_out + 0 * N8);
    StoreU(raw1, d8, packed_out + 1 * N8);
    StoreU(raw2, d8, packed_out + 2 * N8);
    StoreU(raw3, d8, packed_out + 3 * N8);
    StoreU(raw4, d8, packed_out + 4 * N8);
    StoreU(raw5, d8, packed_out + 5 * N8);
    StoreU(raw6, d8, packed_out + 6 * N8);
    StoreU(raw7, d8, packed_out + 7 * N8);

// Pack8<8>::Unpack:
    using VU8 = Vec<decltype(d8)>;
    const size_t N8 = Lanes(d8);
    const VU8 raw0 = LoadU(d8, packed_in + 0 * N8);
    const VU8 raw1 = LoadU(d8, packed_in + 1 * N8);
    const VU8 raw2 = LoadU(d8, packed_in + 2 * N8);
    const VU8 raw3 = LoadU(d8, packed_in + 3 * N8);
    const VU8 raw4 = LoadU(d8, packed_in + 4 * N8);
    const VU8 raw5 = LoadU(d8, packed_in + 5 * N8);
    const VU8 raw6 = LoadU(d8, packed_in + 6 * N8);
    const VU8 raw7 = LoadU(d8, packed_in + 7 * N8);
    StoreU(raw0, d8, raw + 0 * N8);
    StoreU(raw1, d8, raw + 1 * N8);
    StoreU(raw2, d8, raw + 2 * N8);
    StoreU(raw3, d8, raw + 3 * N8);
    StoreU(raw4, d8, raw + 4 * N8);
    StoreU(raw5, d8, raw + 5 * N8);
    StoreU(raw6, d8, raw + 6 * N8);
    StoreU(raw7, d8, raw + 7 * N8);
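// Illustrative sketch (not part of the original header): driving an 8-bit
// packer across a whole array. Per the loads/stores above, one Pack8<kBits>
// call consumes 8 * Lanes(d8) raw bytes and emits kBits * Lanes(d8) packed
// bytes. The function name, the `detail` qualifier and the requirement that
// `num` be a multiple of 8 * Lanes(d8) are assumptions for the example.
//
//   void PackAll3Bit(const uint8_t* HWY_RESTRICT raw, size_t num,
//                    uint8_t* HWY_RESTRICT packed) {
//     const ScalableTag<uint8_t> d8;
//     const size_t N8 = Lanes(d8);
//     const detail::Pack8<3> packer;
//     for (size_t i = 0, o = 0; i < num; i += 8 * N8, o += 3 * N8) {
//       packer.Pack(d8, raw + i, packed + o);
//     }
//   }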
// Pack16<1>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...

// Pack16<1>::Unpack:
    using VU16 = Vec<decltype(d)>;
    const VU16 mask = Set(d, 1u);
    const VU16 packed = LoadU(d, packed_in);
    const VU16 raw0 = And(packed, mask);
    // ...
// Pack16<2>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);

// Pack16<2>::Unpack:
    using VU16 = Vec<decltype(d)>;
    const VU16 mask = Set(d, 0x3u);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    // ...
// Pack16<3>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 hi1 = Set(d, 0x8000u);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);

// Pack16<3>::Unpack:
    using VU16 = Vec<decltype(d)>;
    const VU16 mask = Set(d, 0x7u);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 raw0 = And(mask, packed0);
    const VU16 raw1 = And(mask, packed1);
    const VU16 raw2 = And(mask, packed2);
    // ...
// Pack16<4>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);

// Pack16<4>::Unpack:
    using VU16 = Vec<decltype(d)>;
    const VU16 mask = Set(d, 0xFu);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    // ...
    const VU16 raw8 = And(packed2, mask);
    const VU16 raw9 = And(packed3, mask);
    // ...
// Pack16<5>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 hi1 = Set(d, 0x8000u);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);

// Pack16<5>::Unpack:
    using VU16 = Vec<decltype(d)>;
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 mask = Set(d, 0x1Fu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    const VU16 raw4 = And(packed4, mask);
    // ...
    const VU16 hi1 = Set(d, 0x8000u);
    // ...
// Pack16<6>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 hi4 = Set(d, 0xF000u);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed4, d, packed_out + 3 * N);
    StoreU(packed5, d, packed_out + 4 * N);
    StoreU(packed6, d, packed_out + 5 * N);

// Pack16<6>::Unpack:
    using VU16 = Vec<decltype(d)>;
    const VU16 mask = Set(d, 0x3Fu);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed4 = LoadU(d, packed_in + 3 * N);
    const VU16 packed5 = LoadU(d, packed_in + 4 * N);
    const VU16 packed6 = LoadU(d, packed_in + 5 * N);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    // ...
    const VU16 raw8 = And(packed4, mask);
    const VU16 raw9 = And(packed5, mask);
    const VU16 rawA = And(packed6, mask);
    // ...
    const VU16 hi4 = Set(d, 0xF000u);
    // ...
    const VU16 raw3 = And(packed3, mask);
    const VU16 rawB = And(packed7, mask);
    // ...
// Pack16<7>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 hi2 = Set(d, 0xC000u);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);

// Pack16<7>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0x7Fu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    const VU16 raw4 = And(packed4, mask);
    const VU16 raw5 = And(packed5, mask);
    const VU16 raw6 = And(packed6, mask);
    // ...
    const VU16 hi2 = Set(d, 0xC000u);
    // ...
    const VU16 raw7 = And(packed7, mask);
    // ...
// Pack16<8>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);

// Pack16<8>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0xFFu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    // ...
    const VU16 raw4 = And(packed2, mask);
    const VU16 raw5 = And(packed3, mask);
    // ...
    const VU16 raw8 = And(packed4, mask);
    const VU16 raw9 = And(packed5, mask);
    // ...
    const VU16 rawC = And(packed6, mask);
    const VU16 rawD = And(packed7, mask);
    // ...
// Pack16<9>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 mid2 = Set(d, 0x180u);
    // ...
    const VU16 packed8 = Xor3(Xor3(part8, part9, partA),
                              Xor3(partB, partC, partD), Or(partE, partF));
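    // Note (inferred, not from the original comments): part8..partF occupy
    // disjoint bit positions within packed8, so XOR-ing them is equivalent
    // to OR-ing them; Xor3 (x1 ^ x2 ^ x3) merely folds three operands at a
    // time, which some targets can lower to a single ternary-logic op.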
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);

// Pack16<9>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0x1FFu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    const VU16 raw4 = And(packed4, mask);
    const VU16 raw5 = And(packed5, mask);
    const VU16 raw6 = And(packed6, mask);
    const VU16 raw7 = And(packed7, mask);
    const VU16 mid2 = Set(d, 0x180u);
    // ...
// Pack16<10>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 mid4 = Set(d, 0x3C0u);
    // ...
    const VU16 packed8 = Or(Xor3(part8, part9, partA), partB);
    const VU16 packed9 = Or(Xor3(partC, partD, partE), partF);
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);

// Pack16<10>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0x3FFu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    const VU16 raw4 = And(packed4, mask);
    const VU16 raw5 = And(packed5, mask);
    const VU16 raw6 = And(packed6, mask);
    const VU16 raw7 = And(packed7, mask);
    const VU16 mid4 = Set(d, 0x3C0u);
    // ...
// Pack16<11>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 lo8 = Set(d, 0xFFu);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    // ...
    VU16 next = Set(d, 0x38u);
    // ...
    packed8 = OrAnd(packed8, Add(raw9, raw9), next);
    packed9 = OrAnd(packed9, Add(rawA, rawA), next);
    packedA = OrAnd(packedA, Add(rawB, rawB), next);
    // ...
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);

// Pack16<11>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0xFFu);
    const VU16 down0 = And(packed0, mask);
    const VU16 down2 = And(packed1, mask);
    const VU16 down4 = And(packed2, mask);
    const VU16 down6 = And(packed3, mask);
    const VU16 down8 = And(packed4, mask);
    const VU16 downA = And(packed5, mask);
    const VU16 downC = And(packed6, mask);
    const VU16 downE = And(packed7, mask);
    // ...
    const VU16 hi3 = Set(d, 0x700u);
    // ...
// Pack16<12>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 hi8 = Set(d, 0xFF00u);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);

// Pack16<12>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0xFFFu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    const VU16 raw4 = And(packed4, mask);
    const VU16 raw5 = And(packed5, mask);
    const VU16 raw6 = And(packed6, mask);
    const VU16 raw7 = And(packed7, mask);
    const VU16 mid8 = Set(d, 0xFF0u);
    // ...
// Pack16<13>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 lo8 = Set(d, 0xFFu);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    // ...
    VU16 next = Set(d, 0x3E0u);
    // ...
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
    StoreU(packedC, d, packed_out + 0xC * N);

// Pack16<13>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0xFFu);
    const VU16 down0 = And(packed0, mask);
    const VU16 down2 = And(packed1, mask);
    const VU16 down4 = And(packed2, mask);
    const VU16 down6 = And(packed3, mask);
    const VU16 down8 = And(packed4, mask);
    const VU16 downA = And(packed5, mask);
    const VU16 downC = And(packed6, mask);
    const VU16 downE = And(packed7, mask);
    // ...
    const VU16 hi5 = Set(d, 0x1F00u);
    // ...
    const VU16 rawF = Or(p0, p1);
    // ...
// Pack16<14>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 hi2 = Set(d, 0xC000u);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
    StoreU(packedC, d, packed_out + 0xC * N);
    StoreU(packedD, d, packed_out + 0xD * N);

// Pack16<14>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0x3FFFu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    const VU16 raw4 = And(packed4, mask);
    const VU16 raw5 = And(packed5, mask);
    const VU16 raw6 = And(packed6, mask);
    const VU16 raw7 = And(packed7, mask);
    const VU16 raw8 = And(packed8, mask);
    const VU16 raw9 = And(packed9, mask);
    const VU16 rawA = And(packedA, mask);
    const VU16 rawB = And(packedB, mask);
    const VU16 rawC = And(packedC, mask);
    const VU16 rawD = And(packedD, mask);
    // ...
// Pack16<15>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // ...
    const VU16 hi1 = Set(d, 0x8000u);
    // ...
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
    StoreU(packedC, d, packed_out + 0xC * N);
    StoreU(packedD, d, packed_out + 0xD * N);
    StoreU(packedE, d, packed_out + 0xE * N);

// Pack16<15>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
    const VU16 mask = Set(d, 0x7FFFu);
    const VU16 raw0 = And(packed0, mask);
    const VU16 raw1 = And(packed1, mask);
    const VU16 raw2 = And(packed2, mask);
    const VU16 raw3 = And(packed3, mask);
    const VU16 raw4 = And(packed4, mask);
    const VU16 raw5 = And(packed5, mask);
    const VU16 raw6 = And(packed6, mask);
    const VU16 raw7 = And(packed7, mask);
    const VU16 raw8 = And(packed8, mask);
    const VU16 raw9 = And(packed9, mask);
    const VU16 rawA = And(packedA, mask);
    const VU16 rawB = And(packedB, mask);
    const VU16 rawC = And(packedC, mask);
    const VU16 rawD = And(packedD, mask);
    const VU16 rawE = And(packedE, mask);
    // ...
    const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4));
    // ...
// Pack16<16>::Pack:
    using VU16 = Vec<decltype(d)>;
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    StoreU(raw0, d, packed_out + 0 * N);
    StoreU(raw1, d, packed_out + 1 * N);
    StoreU(raw2, d, packed_out + 2 * N);
    StoreU(raw3, d, packed_out + 3 * N);
    StoreU(raw4, d, packed_out + 4 * N);
    StoreU(raw5, d, packed_out + 5 * N);
    StoreU(raw6, d, packed_out + 6 * N);
    StoreU(raw7, d, packed_out + 7 * N);
    StoreU(raw8, d, packed_out + 8 * N);
    StoreU(raw9, d, packed_out + 9 * N);
    StoreU(rawA, d, packed_out + 0xA * N);
    StoreU(rawB, d, packed_out + 0xB * N);
    StoreU(rawC, d, packed_out + 0xC * N);
    StoreU(rawD, d, packed_out + 0xD * N);
    StoreU(rawE, d, packed_out + 0xE * N);
    StoreU(rawF, d, packed_out + 0xF * N);

// Pack16<16>::Unpack:
    using VU16 = Vec<decltype(d)>;
    // ...
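// Illustrative sketch (not part of the original header): the 16-bit packers
// follow the same pattern on uint16_t lanes. One Pack16<kBits> call reads
// 16 * Lanes(d) values (each < 2^kBits) and writes kBits * Lanes(d) packed
// uint16_t, e.g. the 11 vectors stored by Pack16<11> above. The `detail`
// qualifier and the function name are assumptions for the example.
//
//   void RoundTrip11Bit(const uint16_t* HWY_RESTRICT raw,
//                       uint16_t* HWY_RESTRICT packed,
//                       uint16_t* HWY_RESTRICT out) {
//     const ScalableTag<uint16_t> d;
//     detail::Pack16<11>().Pack(d, raw, packed);
//     detail::Pack16<11>().Unpack(d, packed, out);
//   }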