bit_pack-inl.h
1// Copyright 2022 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target include guard
17#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == \
18 defined(HWY_TARGET_TOGGLE)
19#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
20#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
21#else
22#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
23#endif
24
25#include "hwy/highway.h"
26
27HWY_BEFORE_NAMESPACE();
28namespace hwy {
29namespace HWY_NAMESPACE {
30
31// The entry points are class templates specialized below for each number of
32// bits. Each provides Pack and Unpack member functions which load (Pack) or
33// store (Unpack) B raw vectors, and store (Pack) or load (Unpack) a number of
34// packed vectors equal to kBits. B denotes the bits per lane: 8 for Pack8, 16
35// for Pack16, which is also the upper bound for kBits.
36template <size_t kBits> // <= 8
37struct Pack8 {};
38template <size_t kBits> // <= 16
39struct Pack16 {};
40
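// Usage sketch: a minimal, illustrative way to call these entry points. The
// buffer names (raw, packed, unpacked), the total count `num` (assumed to be
// a multiple of 8 * Lanes(d8)) and the loop below are assumptions for
// illustration, not an API provided here.
//
//   const ScalableTag<uint8_t> d8;
//   const size_t N8 = Lanes(d8);
//   constexpr size_t kBits = 3;  // every raw value fits in 3 bits
//   const Pack8<kBits> pack;
//   for (size_t i = 0; i < num; i += 8 * N8) {
//     // Reads 8 raw vectors, writes kBits packed vectors.
//     pack.Pack(d8, raw + i, packed + (i / 8) * kBits);
//   }
//   for (size_t i = 0; i < num; i += 8 * N8) {
//     // Reads kBits packed vectors, writes 8 raw vectors.
//     pack.Unpack(d8, packed + (i / 8) * kBits, unpacked + i);
//   }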
41template <>
42struct Pack8<1> {
43 template <class D8>
44 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
45 uint8_t* HWY_RESTRICT packed_out) const {
46 const RepartitionToWide<decltype(d8)> d16;
47 using VU16 = Vec<decltype(d16)>;
48 const size_t N8 = Lanes(d8);
49 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
50 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
51 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
52 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
53 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
54 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
55 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
56 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
57 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
58
59 const VU16 packed =
60 Xor3(Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)),
61 Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)),
62 Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0));
63 StoreU(BitCast(d8, packed), d8, packed_out);
64 }
65
66 template <class D8>
67 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
68 uint8_t* HWY_RESTRICT raw) const {
69 const RepartitionToWide<decltype(d8)> d16;
70 using VU16 = Vec<decltype(d16)>;
71 const size_t N8 = Lanes(d8);
72 const VU16 mask = Set(d16, 0x0101u); // LSB in each byte
73
74 const VU16 packed = BitCast(d16, LoadU(d8, packed_in));
75
76 const VU16 raw0 = And(packed, mask);
77 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
78
79 const VU16 raw1 = And(ShiftRight<1>(packed), mask);
80 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
81
82 const VU16 raw2 = And(ShiftRight<2>(packed), mask);
83 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
84
85 const VU16 raw3 = And(ShiftRight<3>(packed), mask);
86 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
87
88 const VU16 raw4 = And(ShiftRight<4>(packed), mask);
89 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
90
91 const VU16 raw5 = And(ShiftRight<5>(packed), mask);
92 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
93
94 const VU16 raw6 = And(ShiftRight<6>(packed), mask);
95 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
96
97 const VU16 raw7 = And(ShiftRight<7>(packed), mask);
98 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
99 }
100}; // Pack8<1>
101
102template <>
103struct Pack8<2> {
104 template <class D8>
105 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
106 uint8_t* HWY_RESTRICT packed_out) const {
107 const RepartitionToWide<decltype(d8)> d16;
108 using VU16 = Vec<decltype(d16)>;
109 const size_t N8 = Lanes(d8);
110 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
111 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
112 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
113 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
114 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
115 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
116 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
117 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
118 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
119
120 const VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4),
121 Or(ShiftLeft<2>(raw2), raw0));
122 const VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5),
123 Or(ShiftLeft<2>(raw3), raw1));
124 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
125 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
126 }
127
128 template <class D8>
129 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
130 uint8_t* HWY_RESTRICT raw) const {
131 const RepartitionToWide<decltype(d8)> d16;
132 using VU16 = Vec<decltype(d16)>;
133 const size_t N8 = Lanes(d8);
134 const VU16 mask = Set(d16, 0x0303u); // Lowest 2 bits per byte
135
136 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
137 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
138
139 const VU16 raw0 = And(packed0, mask);
140 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
141
142 const VU16 raw1 = And(packed1, mask);
143 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
144
145 const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
146 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
147
148 const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
149 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
150
151 const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
152 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
153
154 const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
155 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
156
157 const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
158 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
159
160 const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
161 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
162 }
163}; // Pack8<2>
164
165template <>
166struct Pack8<3> {
167 template <class D8>
168 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
169 uint8_t* HWY_RESTRICT packed_out) const {
170 const RepartitionToWide<decltype(d8)> d16;
171 using VU16 = Vec<decltype(d16)>;
172 const size_t N8 = Lanes(d8);
173 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
174 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
175 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
176 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
177 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
178 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
179 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
180 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
181
182 // The upper two bits of these three will be filled with packed3 (6 bits).
183 VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0);
184 VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1);
185 VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2);
186 const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3);
187
188 const VU16 hi2 = Set(d16, 0xC0C0u);
189 packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2);
190 packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2);
191 packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2);
192 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
193 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
194 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
195 }
196
197 template <class D8>
198 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
199 uint8_t* HWY_RESTRICT raw) const {
200 const RepartitionToWide<decltype(d8)> d16;
201 using VU16 = Vec<decltype(d16)>;
202 const size_t N8 = Lanes(d8);
203 const VU16 mask = Set(d16, 0x0707u); // Lowest 3 bits per byte
204
205 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
206 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
207 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
208
209 const VU16 raw0 = And(packed0, mask);
210 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
211
212 const VU16 raw1 = And(packed1, mask);
213 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
214
215 const VU16 raw2 = And(packed2, mask);
216 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
217
218 const VU16 raw4 = And(ShiftRight<3>(packed0), mask);
219 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
220
221 const VU16 raw5 = And(ShiftRight<3>(packed1), mask);
222 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
223
224 const VU16 raw6 = And(ShiftRight<3>(packed2), mask);
225 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
226
227 // raw73 is the concatenation of the upper two bits in packed0..2.
228 const VU16 hi2 = Set(d16, 0xC0C0u);
229 const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)), //
230 ShiftRight<4>(And(packed1, hi2)),
231 ShiftRight<2>(And(packed0, hi2)));
232
233 const VU16 raw3 = And(mask, raw73);
234 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
235
236 const VU16 raw7 = And(mask, ShiftRight<3>(raw73));
237 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
238 }
239}; // Pack8<3>
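// A reference sketch of the per-byte layout produced by Pack8<3>::Pack,
// derived from the shifts and masks above (bit 7 leftmost; rN.b denotes bit b
// of raw value N):
//
//   packed0 = [r7.2 r7.1 | r4.2 r4.1 r4.0 | r0.2 r0.1 r0.0]
//   packed1 = [r7.0 r3.2 | r5.2 r5.1 r5.0 | r1.2 r1.1 r1.0]
//   packed2 = [r3.1 r3.0 | r6.2 r6.1 r6.0 | r2.2 r2.1 r2.0]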
240
241template <>
242struct Pack8<4> {
243 template <class D8>
244 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
245 uint8_t* HWY_RESTRICT packed_out) const {
246 const RepartitionToWide<decltype(d8)> d16;
247 using VU16 = Vec<decltype(d16)>;
248 const size_t N8 = Lanes(d8);
249 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
250 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
251 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
252 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
253 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
254 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
255 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
256 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
257 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
258
259 const VU16 packed0 = Or(ShiftLeft<4>(raw2), raw0);
260 const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw1);
261 const VU16 packed2 = Or(ShiftLeft<4>(raw6), raw4);
262 const VU16 packed3 = Or(ShiftLeft<4>(raw7), raw5);
263
264 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
265 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
266 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
267 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
268 }
269
270 template <class D8>
271 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
272 uint8_t* HWY_RESTRICT raw) const {
273 const RepartitionToWide<decltype(d8)> d16;
274 using VU16 = Vec<decltype(d16)>;
275 const size_t N8 = Lanes(d8);
276 const VU16 mask = Set(d16, 0x0F0Fu); // Lowest 4 bits per byte
277
278 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
279 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
280 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
281 const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
282
283 const VU16 raw0 = And(packed0, mask);
284 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
285
286 const VU16 raw1 = And(packed1, mask);
287 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
288
289 const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
290 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
291
292 const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
293 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
294
295 const VU16 raw4 = And(packed2, mask);
296 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
297
298 const VU16 raw5 = And(packed3, mask);
299 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
300
301 const VU16 raw6 = And(ShiftRight<4>(packed2), mask);
302 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
303
304 const VU16 raw7 = And(ShiftRight<4>(packed3), mask);
305 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
306 }
307}; // Pack8<4>
308
309template <>
310struct Pack8<5> {
311 template <class D8>
312 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
313 uint8_t* HWY_RESTRICT packed_out) const {
314 const RepartitionToWide<decltype(d8)> d16;
315 using VU16 = Vec<decltype(d16)>;
316 const size_t N8 = Lanes(d8);
317 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
318 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
319 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
320 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
321 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
322 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
323 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
324 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
325
326 // Fill upper three bits with upper bits from raw4..7.
327 const VU16 hi3 = Set(d16, 0xE0E0u);
328 const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3);
329 const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3);
330 const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3);
331 const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3);
332
333 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
334 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
335 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
336 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
337
338 // Combine lower two bits of raw4..7 into packed4.
339 const VU16 lo2 = Set(d16, 0x0303u);
340 const VU16 packed4 = Or(And(raw4, lo2), Xor3(ShiftLeft<2>(And(raw5, lo2)),
341 ShiftLeft<4>(And(raw6, lo2)),
342 ShiftLeft<6>(And(raw7, lo2))));
343 StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
344 }
345
346 template <class D8>
347 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
348 uint8_t* HWY_RESTRICT raw) const {
349 const RepartitionToWide<decltype(d8)> d16;
350 using VU16 = Vec<decltype(d16)>;
351 const size_t N8 = Lanes(d8);
352
353 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
354 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
355 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
356 const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
357 const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
358
359 const VU16 mask = Set(d16, 0x1F1Fu); // Lowest 5 bits per byte
360
361 const VU16 raw0 = And(packed0, mask);
362 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
363
364 const VU16 raw1 = And(packed1, mask);
365 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
366
367 const VU16 raw2 = And(packed2, mask);
368 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
369
370 const VU16 raw3 = And(packed3, mask);
371 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
372
373    // The top 3 bits of packed0..3 hold the upper 3 bits of raw4..7; shift
374    // them back down by three.
374 const VU16 top4 = ShiftRight<3>(AndNot(mask, packed0));
375 const VU16 top5 = ShiftRight<3>(AndNot(mask, packed1));
376 const VU16 top6 = ShiftRight<3>(AndNot(mask, packed2));
377 const VU16 top7 = ShiftRight<3>(AndNot(mask, packed3));
378
379 // Insert the lower 2 bits, which were concatenated into a byte.
380 const VU16 lo2 = Set(d16, 0x0303u);
381 const VU16 raw4 = OrAnd(top4, lo2, packed4);
382 const VU16 raw5 = OrAnd(top5, lo2, ShiftRight<2>(packed4));
383 const VU16 raw6 = OrAnd(top6, lo2, ShiftRight<4>(packed4));
384 const VU16 raw7 = OrAnd(top7, lo2, ShiftRight<6>(packed4));
385
386 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
387 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
388 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
389 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
390 }
391}; // Pack8<5>
392
393template <>
394struct Pack8<6> {
395 template <class D8>
396 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
397 uint8_t* HWY_RESTRICT packed_out) const {
398 const RepartitionToWide<decltype(d8)> d16;
399 using VU16 = Vec<decltype(d16)>;
400 const size_t N8 = Lanes(d8);
401 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
402 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
403 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
404 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
405 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
406 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
407 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
408 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
409
410 const VU16 hi2 = Set(d16, 0xC0C0u);
411 // Each triplet of these stores raw3/raw7 (6 bits) in the upper 2 bits.
412 const VU16 packed0 = OrAnd(raw0, ShiftLeft<2>(raw3), hi2);
413 const VU16 packed1 = OrAnd(raw1, ShiftLeft<4>(raw3), hi2);
414 const VU16 packed2 = OrAnd(raw2, ShiftLeft<6>(raw3), hi2);
415 const VU16 packed3 = OrAnd(raw4, ShiftLeft<2>(raw7), hi2);
416 const VU16 packed4 = OrAnd(raw5, ShiftLeft<4>(raw7), hi2);
417 const VU16 packed5 = OrAnd(raw6, ShiftLeft<6>(raw7), hi2);
418
419 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
420 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
421 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
422 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
423 StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
424 StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
425 }
426
427 template <class D8>
428 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
429 uint8_t* HWY_RESTRICT raw) const {
430 const RepartitionToWide<decltype(d8)> d16;
431 using VU16 = Vec<decltype(d16)>;
432 const size_t N8 = Lanes(d8);
433 const VU16 mask = Set(d16, 0x3F3Fu); // Lowest 6 bits per byte
434
435 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
436 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
437 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
438 const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
439 const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
440 const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
441
442 const VU16 raw0 = And(packed0, mask);
443 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
444
445 const VU16 raw1 = And(packed1, mask);
446 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
447
448 const VU16 raw2 = And(packed2, mask);
449 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
450
451 const VU16 raw4 = And(packed3, mask);
452 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
453
454 const VU16 raw5 = And(packed4, mask);
455 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
456
457 const VU16 raw6 = And(packed5, mask);
458 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
459
460 // raw3/7 are the concatenation of the upper two bits in packed0..2.
461 const VU16 raw3 = Xor3(ShiftRight<6>(AndNot(mask, packed2)),
462 ShiftRight<4>(AndNot(mask, packed1)),
463 ShiftRight<2>(AndNot(mask, packed0)));
464 const VU16 raw7 = Xor3(ShiftRight<6>(AndNot(mask, packed5)),
465 ShiftRight<4>(AndNot(mask, packed4)),
466 ShiftRight<2>(AndNot(mask, packed3)));
467 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
468 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
469 }
470}; // Pack8<6>
471
472template <>
473struct Pack8<7> {
474 template <class D8>
475 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
476 uint8_t* HWY_RESTRICT packed_out) const {
477 const RepartitionToWide<decltype(d8)> d16;
478 using VU16 = Vec<decltype(d16)>;
479 const size_t N8 = Lanes(d8);
480 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
481 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
482 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
483 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
484 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
485 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
486 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
487 // Inserted into top bit of packed0..6.
488 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
489
490 const VU16 hi1 = Set(d16, 0x8080u);
491 const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1);
492 const VU16 packed1 = OrAnd(raw1, ShiftLeft<2>(raw7), hi1);
493 const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw7), hi1);
494 const VU16 packed3 = OrAnd(raw3, ShiftLeft<4>(raw7), hi1);
495 const VU16 packed4 = OrAnd(raw4, ShiftLeft<5>(raw7), hi1);
496 const VU16 packed5 = OrAnd(raw5, ShiftLeft<6>(raw7), hi1);
497 const VU16 packed6 = OrAnd(raw6, ShiftLeft<7>(raw7), hi1);
498
499 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
500 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
501 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
502 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
503 StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
504 StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
505 StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8);
506 }
507
508 template <class D8>
509 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
510 uint8_t* HWY_RESTRICT raw) const {
511 const RepartitionToWide<decltype(d8)> d16;
512 using VU16 = Vec<decltype(d16)>;
513 const size_t N8 = Lanes(d8);
514
515 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
516 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
517 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
518 const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
519 const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
520 const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
521 const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8));
522
523 const VU16 mask = Set(d16, 0x7F7Fu); // Lowest 7 bits per byte
524
525 const VU16 raw0 = And(packed0, mask);
526 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
527
528 const VU16 raw1 = And(packed1, mask);
529 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
530
531 const VU16 raw2 = And(packed2, mask);
532 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
533
534 const VU16 raw3 = And(packed3, mask);
535 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
536
537 const VU16 raw4 = And(packed4, mask);
538 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
539
540 const VU16 raw5 = And(packed5, mask);
541 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
542
543 const VU16 raw6 = And(packed6, mask);
544 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
545
546 const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)),
547 ShiftRight<6>(AndNot(mask, packed5)),
548 ShiftRight<5>(AndNot(mask, packed4)));
549 const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)),
550 ShiftRight<3>(AndNot(mask, packed2)),
551 ShiftRight<2>(AndNot(mask, packed1)));
552 const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1);
553 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
554 }
555}; // Pack8<7>
556
557template <>
558struct Pack8<8> {
559 template <class D8>
560 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
561 uint8_t* HWY_RESTRICT packed_out) const {
562 using VU8 = Vec<decltype(d8)>;
563 const size_t N8 = Lanes(d8);
564 const VU8 raw0 = LoadU(d8, raw + 0 * N8);
565 const VU8 raw1 = LoadU(d8, raw + 1 * N8);
566 const VU8 raw2 = LoadU(d8, raw + 2 * N8);
567 const VU8 raw3 = LoadU(d8, raw + 3 * N8);
568 const VU8 raw4 = LoadU(d8, raw + 4 * N8);
569 const VU8 raw5 = LoadU(d8, raw + 5 * N8);
570 const VU8 raw6 = LoadU(d8, raw + 6 * N8);
571 const VU8 raw7 = LoadU(d8, raw + 7 * N8);
572
573 StoreU(raw0, d8, packed_out + 0 * N8);
574 StoreU(raw1, d8, packed_out + 1 * N8);
575 StoreU(raw2, d8, packed_out + 2 * N8);
576 StoreU(raw3, d8, packed_out + 3 * N8);
577 StoreU(raw4, d8, packed_out + 4 * N8);
578 StoreU(raw5, d8, packed_out + 5 * N8);
579 StoreU(raw6, d8, packed_out + 6 * N8);
580 StoreU(raw7, d8, packed_out + 7 * N8);
581 }
582
583 template <class D8>
584 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
585 uint8_t* HWY_RESTRICT raw) const {
586 using VU8 = Vec<decltype(d8)>;
587 const size_t N8 = Lanes(d8);
588 const VU8 raw0 = LoadU(d8, packed_in + 0 * N8);
589 const VU8 raw1 = LoadU(d8, packed_in + 1 * N8);
590 const VU8 raw2 = LoadU(d8, packed_in + 2 * N8);
591 const VU8 raw3 = LoadU(d8, packed_in + 3 * N8);
592 const VU8 raw4 = LoadU(d8, packed_in + 4 * N8);
593 const VU8 raw5 = LoadU(d8, packed_in + 5 * N8);
594 const VU8 raw6 = LoadU(d8, packed_in + 6 * N8);
595 const VU8 raw7 = LoadU(d8, packed_in + 7 * N8);
596
597 StoreU(raw0, d8, raw + 0 * N8);
598 StoreU(raw1, d8, raw + 1 * N8);
599 StoreU(raw2, d8, raw + 2 * N8);
600 StoreU(raw3, d8, raw + 3 * N8);
601 StoreU(raw4, d8, raw + 4 * N8);
602 StoreU(raw5, d8, raw + 5 * N8);
603 StoreU(raw6, d8, raw + 6 * N8);
604 StoreU(raw7, d8, raw + 7 * N8);
605 }
606}; // Pack8<8>
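// Runtime-dispatch sketch: kBits is a template parameter, so a caller with a
// bit count known only at runtime would typically switch over the supported
// values. The function name and arguments below are illustrative assumptions.
//
//   void PackOneBlock8(size_t bits, const uint8_t* raw, uint8_t* packed) {
//     const ScalableTag<uint8_t> d8;
//     switch (bits) {
//       case 1: Pack8<1>().Pack(d8, raw, packed); break;
//       case 2: Pack8<2>().Pack(d8, raw, packed); break;
//       // ... cases 3 through 7 follow the same pattern ...
//       case 8: Pack8<8>().Pack(d8, raw, packed); break;
//       default: HWY_ASSERT(false);  // unsupported bit count
//     }
//   }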
607
608template <>
609struct Pack16<1> {
610 template <class D>
611 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
612 uint16_t* HWY_RESTRICT packed_out) const {
613 using VU16 = Vec<decltype(d)>;
614 const size_t N = Lanes(d);
615 const VU16 raw0 = LoadU(d, raw + 0 * N);
616 const VU16 raw1 = LoadU(d, raw + 1 * N);
617 const VU16 raw2 = LoadU(d, raw + 2 * N);
618 const VU16 raw3 = LoadU(d, raw + 3 * N);
619 const VU16 raw4 = LoadU(d, raw + 4 * N);
620 const VU16 raw5 = LoadU(d, raw + 5 * N);
621 const VU16 raw6 = LoadU(d, raw + 6 * N);
622 const VU16 raw7 = LoadU(d, raw + 7 * N);
623 const VU16 raw8 = LoadU(d, raw + 8 * N);
624 const VU16 raw9 = LoadU(d, raw + 9 * N);
625 const VU16 rawA = LoadU(d, raw + 0xA * N);
626 const VU16 rawB = LoadU(d, raw + 0xB * N);
627 const VU16 rawC = LoadU(d, raw + 0xC * N);
628 const VU16 rawD = LoadU(d, raw + 0xD * N);
629 const VU16 rawE = LoadU(d, raw + 0xE * N);
630 const VU16 rawF = LoadU(d, raw + 0xF * N);
631
632 const VU16 p0 = Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0);
633 const VU16 p1 =
634 Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3));
635 const VU16 p2 =
636 Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6));
637 const VU16 p3 =
638 Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9));
639    const VU16 p4 =
640        Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC));
641 const VU16 packed =
642 Or(Xor3(ShiftLeft<0xF>(rawF), p0, p1), Xor3(p2, p3, p4));
643 StoreU(packed, d, packed_out);
644 }
645
646 template <class D>
647 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
648 uint16_t* HWY_RESTRICT raw) const {
649 using VU16 = Vec<decltype(d)>;
650 const size_t N = Lanes(d);
651 const VU16 mask = Set(d, 1u); // Lowest bit
652
653 const VU16 packed = LoadU(d, packed_in);
654
655 const VU16 raw0 = And(packed, mask);
656 StoreU(raw0, d, raw + 0 * N);
657
658 const VU16 raw1 = And(ShiftRight<1>(packed), mask);
659 StoreU(raw1, d, raw + 1 * N);
660
661 const VU16 raw2 = And(ShiftRight<2>(packed), mask);
662 StoreU(raw2, d, raw + 2 * N);
663
664 const VU16 raw3 = And(ShiftRight<3>(packed), mask);
665 StoreU(raw3, d, raw + 3 * N);
666
667 const VU16 raw4 = And(ShiftRight<4>(packed), mask);
668 StoreU(raw4, d, raw + 4 * N);
669
670 const VU16 raw5 = And(ShiftRight<5>(packed), mask);
671 StoreU(raw5, d, raw + 5 * N);
672
673 const VU16 raw6 = And(ShiftRight<6>(packed), mask);
674 StoreU(raw6, d, raw + 6 * N);
675
676 const VU16 raw7 = And(ShiftRight<7>(packed), mask);
677 StoreU(raw7, d, raw + 7 * N);
678
679 const VU16 raw8 = And(ShiftRight<8>(packed), mask);
680 StoreU(raw8, d, raw + 8 * N);
681
682 const VU16 raw9 = And(ShiftRight<9>(packed), mask);
683 StoreU(raw9, d, raw + 9 * N);
684
685 const VU16 rawA = And(ShiftRight<0xA>(packed), mask);
686 StoreU(rawA, d, raw + 0xA * N);
687
688 const VU16 rawB = And(ShiftRight<0xB>(packed), mask);
689 StoreU(rawB, d, raw + 0xB * N);
690
691 const VU16 rawC = And(ShiftRight<0xC>(packed), mask);
692 StoreU(rawC, d, raw + 0xC * N);
693
694 const VU16 rawD = And(ShiftRight<0xD>(packed), mask);
695 StoreU(rawD, d, raw + 0xD * N);
696
697 const VU16 rawE = And(ShiftRight<0xE>(packed), mask);
698 StoreU(rawE, d, raw + 0xE * N);
699
700 const VU16 rawF = ShiftRight<0xF>(packed);
701 StoreU(rawF, d, raw + 0xF * N);
702 }
703}; // Pack16<1>
704
705template <>
706struct Pack16<2> {
707 template <class D>
708 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
709 uint16_t* HWY_RESTRICT packed_out) const {
710 using VU16 = Vec<decltype(d)>;
711 const size_t N = Lanes(d);
712 const VU16 raw0 = LoadU(d, raw + 0 * N);
713 const VU16 raw1 = LoadU(d, raw + 1 * N);
714 const VU16 raw2 = LoadU(d, raw + 2 * N);
715 const VU16 raw3 = LoadU(d, raw + 3 * N);
716 const VU16 raw4 = LoadU(d, raw + 4 * N);
717 const VU16 raw5 = LoadU(d, raw + 5 * N);
718 const VU16 raw6 = LoadU(d, raw + 6 * N);
719 const VU16 raw7 = LoadU(d, raw + 7 * N);
720 const VU16 raw8 = LoadU(d, raw + 8 * N);
721 const VU16 raw9 = LoadU(d, raw + 9 * N);
722 const VU16 rawA = LoadU(d, raw + 0xA * N);
723 const VU16 rawB = LoadU(d, raw + 0xB * N);
724 const VU16 rawC = LoadU(d, raw + 0xC * N);
725 const VU16 rawD = LoadU(d, raw + 0xD * N);
726 const VU16 rawE = LoadU(d, raw + 0xE * N);
727 const VU16 rawF = LoadU(d, raw + 0xF * N);
728
729 VU16 packed0 = Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0);
730 VU16 packed1 = Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1);
731 packed0 = Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6));
732 packed1 = Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7));
733
734 packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA));
735 packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB));
736
737 packed0 = Or(packed0, ShiftLeft<14>(rawE));
738 packed1 = Or(packed1, ShiftLeft<14>(rawF));
739 StoreU(packed0, d, packed_out + 0 * N);
740 StoreU(packed1, d, packed_out + 1 * N);
741 }
742
743 template <class D>
744 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
745 uint16_t* HWY_RESTRICT raw) const {
746 using VU16 = Vec<decltype(d)>;
747 const size_t N = Lanes(d);
748 const VU16 mask = Set(d, 0x3u); // Lowest 2 bits
749
750 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
751 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
752
753 const VU16 raw0 = And(packed0, mask);
754 StoreU(raw0, d, raw + 0 * N);
755
756 const VU16 raw1 = And(packed1, mask);
757 StoreU(raw1, d, raw + 1 * N);
758
759 const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
760 StoreU(raw2, d, raw + 2 * N);
761
762 const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
763 StoreU(raw3, d, raw + 3 * N);
764
765 const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
766 StoreU(raw4, d, raw + 4 * N);
767
768 const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
769 StoreU(raw5, d, raw + 5 * N);
770
771 const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
772 StoreU(raw6, d, raw + 6 * N);
773
774 const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
775 StoreU(raw7, d, raw + 7 * N);
776
777 const VU16 raw8 = And(ShiftRight<8>(packed0), mask);
778 StoreU(raw8, d, raw + 8 * N);
779
780 const VU16 raw9 = And(ShiftRight<8>(packed1), mask);
781 StoreU(raw9, d, raw + 9 * N);
782
783 const VU16 rawA = And(ShiftRight<0xA>(packed0), mask);
784 StoreU(rawA, d, raw + 0xA * N);
785
786 const VU16 rawB = And(ShiftRight<0xA>(packed1), mask);
787 StoreU(rawB, d, raw + 0xB * N);
788
789 const VU16 rawC = And(ShiftRight<0xC>(packed0), mask);
790 StoreU(rawC, d, raw + 0xC * N);
791
792 const VU16 rawD = And(ShiftRight<0xC>(packed1), mask);
793 StoreU(rawD, d, raw + 0xD * N);
794
795 const VU16 rawE = ShiftRight<0xE>(packed0);
796 StoreU(rawE, d, raw + 0xE * N);
797
798 const VU16 rawF = ShiftRight<0xE>(packed1);
799 StoreU(rawF, d, raw + 0xF * N);
800 }
801}; // Pack16<2>
802
803template <>
804struct Pack16<3> {
805 template <class D>
806 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
807 uint16_t* HWY_RESTRICT packed_out) const {
808 using VU16 = Vec<decltype(d)>;
809 const size_t N = Lanes(d);
810 const VU16 raw0 = LoadU(d, raw + 0 * N);
811 const VU16 raw1 = LoadU(d, raw + 1 * N);
812 const VU16 raw2 = LoadU(d, raw + 2 * N);
813 const VU16 raw3 = LoadU(d, raw + 3 * N);
814 const VU16 raw4 = LoadU(d, raw + 4 * N);
815 const VU16 raw5 = LoadU(d, raw + 5 * N);
816 const VU16 raw6 = LoadU(d, raw + 6 * N);
817 const VU16 raw7 = LoadU(d, raw + 7 * N);
818 const VU16 raw8 = LoadU(d, raw + 8 * N);
819 const VU16 raw9 = LoadU(d, raw + 9 * N);
820 const VU16 rawA = LoadU(d, raw + 0xA * N);
821 const VU16 rawB = LoadU(d, raw + 0xB * N);
822 const VU16 rawC = LoadU(d, raw + 0xC * N);
823 const VU16 rawD = LoadU(d, raw + 0xD * N);
824 const VU16 rawE = LoadU(d, raw + 0xE * N);
825 const VU16 rawF = LoadU(d, raw + 0xF * N);
826
827 // We can fit 15 raw vectors in three packed vectors (five each).
828 VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0);
829 VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1);
830 VU16 packed2 = Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2);
831
832 // rawF will be scattered into the upper bit of these three.
833 packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9));
834 packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA));
835 packed2 = Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB));
836
837 const VU16 hi1 = Set(d, 0x8000u);
838 packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask
839 packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
840 packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
841 StoreU(packed0, d, packed_out + 0 * N);
842 StoreU(packed1, d, packed_out + 1 * N);
843 StoreU(packed2, d, packed_out + 2 * N);
844 }
845
846 template <class D>
847 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
848 uint16_t* HWY_RESTRICT raw) const {
849 using VU16 = Vec<decltype(d)>;
850 const size_t N = Lanes(d);
851 const VU16 mask = Set(d, 0x7u); // Lowest 3 bits
852
853 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
854 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
855 const VU16 packed2 = LoadU(d, packed_in + 2 * N);
856
857 const VU16 raw0 = And(mask, packed0);
858 StoreU(raw0, d, raw + 0 * N);
859
860 const VU16 raw1 = And(mask, packed1);
861 StoreU(raw1, d, raw + 1 * N);
862
863 const VU16 raw2 = And(mask, packed2);
864 StoreU(raw2, d, raw + 2 * N);
865
866 const VU16 raw3 = And(mask, ShiftRight<3>(packed0));
867 StoreU(raw3, d, raw + 3 * N);
868
869 const VU16 raw4 = And(mask, ShiftRight<3>(packed1));
870 StoreU(raw4, d, raw + 4 * N);
871
872 const VU16 raw5 = And(mask, ShiftRight<3>(packed2));
873 StoreU(raw5, d, raw + 5 * N);
874
875 const VU16 raw6 = And(mask, ShiftRight<6>(packed0));
876 StoreU(raw6, d, raw + 6 * N);
877
878 const VU16 raw7 = And(mask, ShiftRight<6>(packed1));
879 StoreU(raw7, d, raw + 7 * N);
880
881 const VU16 raw8 = And(mask, ShiftRight<6>(packed2));
882 StoreU(raw8, d, raw + 8 * N);
883
884 const VU16 raw9 = And(mask, ShiftRight<9>(packed0));
885 StoreU(raw9, d, raw + 9 * N);
886
887 const VU16 rawA = And(mask, ShiftRight<9>(packed1));
888 StoreU(rawA, d, raw + 0xA * N);
889
890 const VU16 rawB = And(mask, ShiftRight<9>(packed2));
891 StoreU(rawB, d, raw + 0xB * N);
892
893 const VU16 rawC = And(mask, ShiftRight<12>(packed0));
894 StoreU(rawC, d, raw + 0xC * N);
895
896 const VU16 rawD = And(mask, ShiftRight<12>(packed1));
897 StoreU(rawD, d, raw + 0xD * N);
898
899 const VU16 rawE = And(mask, ShiftRight<12>(packed2));
900 StoreU(rawE, d, raw + 0xE * N);
901
902 // rawF is the concatenation of the upper bit of packed0..2.
903 const VU16 down0 = ShiftRight<15>(packed0);
904 const VU16 down1 = ShiftRight<15>(packed1);
905 const VU16 down2 = ShiftRight<15>(packed2);
906 const VU16 rawF = Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0);
907 StoreU(rawF, d, raw + 0xF * N);
908 }
909}; // Pack16<3>
910
911template <>
912struct Pack16<4> {
913 template <class D>
914 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
915 uint16_t* HWY_RESTRICT packed_out) const {
916 using VU16 = Vec<decltype(d)>;
917 const size_t N = Lanes(d);
918 const VU16 raw0 = LoadU(d, raw + 0 * N);
919 const VU16 raw1 = LoadU(d, raw + 1 * N);
920 const VU16 raw2 = LoadU(d, raw + 2 * N);
921 const VU16 raw3 = LoadU(d, raw + 3 * N);
922 const VU16 raw4 = LoadU(d, raw + 4 * N);
923 const VU16 raw5 = LoadU(d, raw + 5 * N);
924 const VU16 raw6 = LoadU(d, raw + 6 * N);
925 const VU16 raw7 = LoadU(d, raw + 7 * N);
926 const VU16 raw8 = LoadU(d, raw + 8 * N);
927 const VU16 raw9 = LoadU(d, raw + 9 * N);
928 const VU16 rawA = LoadU(d, raw + 0xA * N);
929 const VU16 rawB = LoadU(d, raw + 0xB * N);
930 const VU16 rawC = LoadU(d, raw + 0xC * N);
931 const VU16 rawD = LoadU(d, raw + 0xD * N);
932 const VU16 rawE = LoadU(d, raw + 0xE * N);
933 const VU16 rawF = LoadU(d, raw + 0xF * N);
934
935 VU16 packed0 = Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0);
936 VU16 packed1 = Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1);
937 packed0 = Or(packed0, ShiftLeft<12>(raw6));
938 packed1 = Or(packed1, ShiftLeft<12>(raw7));
939 VU16 packed2 = Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8);
940 VU16 packed3 = Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9);
941 packed2 = Or(packed2, ShiftLeft<12>(rawE));
942 packed3 = Or(packed3, ShiftLeft<12>(rawF));
943
944 StoreU(packed0, d, packed_out + 0 * N);
945 StoreU(packed1, d, packed_out + 1 * N);
946 StoreU(packed2, d, packed_out + 2 * N);
947 StoreU(packed3, d, packed_out + 3 * N);
948 }
949
950 template <class D>
951 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
952 uint16_t* HWY_RESTRICT raw) const {
953 using VU16 = Vec<decltype(d)>;
954 const size_t N = Lanes(d);
955 const VU16 mask = Set(d, 0xFu); // Lowest 4 bits
956
957 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
958 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
959 const VU16 packed2 = LoadU(d, packed_in + 2 * N);
960 const VU16 packed3 = LoadU(d, packed_in + 3 * N);
961
962 const VU16 raw0 = And(packed0, mask);
963 StoreU(raw0, d, raw + 0 * N);
964
965 const VU16 raw1 = And(packed1, mask);
966 StoreU(raw1, d, raw + 1 * N);
967
968 const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
969 StoreU(raw2, d, raw + 2 * N);
970
971 const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
972 StoreU(raw3, d, raw + 3 * N);
973
974 const VU16 raw4 = And(ShiftRight<8>(packed0), mask);
975 StoreU(raw4, d, raw + 4 * N);
976
977 const VU16 raw5 = And(ShiftRight<8>(packed1), mask);
978 StoreU(raw5, d, raw + 5 * N);
979
980 const VU16 raw6 = ShiftRight<12>(packed0); // no mask required
981 StoreU(raw6, d, raw + 6 * N);
982
983 const VU16 raw7 = ShiftRight<12>(packed1); // no mask required
984 StoreU(raw7, d, raw + 7 * N);
985
986 const VU16 raw8 = And(packed2, mask);
987 StoreU(raw8, d, raw + 8 * N);
988
989 const VU16 raw9 = And(packed3, mask);
990 StoreU(raw9, d, raw + 9 * N);
991
992 const VU16 rawA = And(ShiftRight<4>(packed2), mask);
993 StoreU(rawA, d, raw + 0xA * N);
994
995 const VU16 rawB = And(ShiftRight<4>(packed3), mask);
996 StoreU(rawB, d, raw + 0xB * N);
997
998 const VU16 rawC = And(ShiftRight<8>(packed2), mask);
999 StoreU(rawC, d, raw + 0xC * N);
1000
1001 const VU16 rawD = And(ShiftRight<8>(packed3), mask);
1002 StoreU(rawD, d, raw + 0xD * N);
1003
1004 const VU16 rawE = ShiftRight<12>(packed2); // no mask required
1005 StoreU(rawE, d, raw + 0xE * N);
1006
1007 const VU16 rawF = ShiftRight<12>(packed3); // no mask required
1008 StoreU(rawF, d, raw + 0xF * N);
1009 }
1010}; // Pack16<4>
1011
1012template <>
1013struct Pack16<5> {
1014 template <class D>
1015 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1016 uint16_t* HWY_RESTRICT packed_out) const {
1017 using VU16 = Vec<decltype(d)>;
1018 const size_t N = Lanes(d);
1019 const VU16 raw0 = LoadU(d, raw + 0 * N);
1020 const VU16 raw1 = LoadU(d, raw + 1 * N);
1021 const VU16 raw2 = LoadU(d, raw + 2 * N);
1022 const VU16 raw3 = LoadU(d, raw + 3 * N);
1023 const VU16 raw4 = LoadU(d, raw + 4 * N);
1024 const VU16 raw5 = LoadU(d, raw + 5 * N);
1025 const VU16 raw6 = LoadU(d, raw + 6 * N);
1026 const VU16 raw7 = LoadU(d, raw + 7 * N);
1027 const VU16 raw8 = LoadU(d, raw + 8 * N);
1028 const VU16 raw9 = LoadU(d, raw + 9 * N);
1029 const VU16 rawA = LoadU(d, raw + 0xA * N);
1030 const VU16 rawB = LoadU(d, raw + 0xB * N);
1031 const VU16 rawC = LoadU(d, raw + 0xC * N);
1032 const VU16 rawD = LoadU(d, raw + 0xD * N);
1033 const VU16 rawE = LoadU(d, raw + 0xE * N);
1034 const VU16 rawF = LoadU(d, raw + 0xF * N);
1035
1036 // We can fit 15 raw vectors in five packed vectors (three each).
1037 VU16 packed0 = Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0);
1038 VU16 packed1 = Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1);
1039 VU16 packed2 = Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2);
1040 VU16 packed3 = Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3);
1041 VU16 packed4 = Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4);
1042
1043 // rawF will be scattered into the upper bits of these five.
1044 const VU16 hi1 = Set(d, 0x8000u);
1045 packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask
1046 packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
1047 packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
1048 packed3 = OrAnd(packed3, ShiftLeft<12>(rawF), hi1);
1049 packed4 = OrAnd(packed4, ShiftLeft<11>(rawF), hi1);
1050
1051 StoreU(packed0, d, packed_out + 0 * N);
1052 StoreU(packed1, d, packed_out + 1 * N);
1053 StoreU(packed2, d, packed_out + 2 * N);
1054 StoreU(packed3, d, packed_out + 3 * N);
1055 StoreU(packed4, d, packed_out + 4 * N);
1056 }
1057
1058 template <class D>
1059 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1060 uint16_t* HWY_RESTRICT raw) const {
1061 using VU16 = Vec<decltype(d)>;
1062 const size_t N = Lanes(d);
1063
1064 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
1065 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
1066 const VU16 packed2 = LoadU(d, packed_in + 2 * N);
1067 const VU16 packed3 = LoadU(d, packed_in + 3 * N);
1068 const VU16 packed4 = LoadU(d, packed_in + 4 * N);
1069
1070 const VU16 mask = Set(d, 0x1Fu); // Lowest 5 bits
1071
1072 const VU16 raw0 = And(packed0, mask);
1073 StoreU(raw0, d, raw + 0 * N);
1074
1075 const VU16 raw1 = And(packed1, mask);
1076 StoreU(raw1, d, raw + 1 * N);
1077
1078 const VU16 raw2 = And(packed2, mask);
1079 StoreU(raw2, d, raw + 2 * N);
1080
1081 const VU16 raw3 = And(packed3, mask);
1082 StoreU(raw3, d, raw + 3 * N);
1083
1084 const VU16 raw4 = And(packed4, mask);
1085 StoreU(raw4, d, raw + 4 * N);
1086
1087 const VU16 raw5 = And(ShiftRight<5>(packed0), mask);
1088 StoreU(raw5, d, raw + 5 * N);
1089
1090 const VU16 raw6 = And(ShiftRight<5>(packed1), mask);
1091 StoreU(raw6, d, raw + 6 * N);
1092
1093 const VU16 raw7 = And(ShiftRight<5>(packed2), mask);
1094 StoreU(raw7, d, raw + 7 * N);
1095
1096 const VU16 raw8 = And(ShiftRight<5>(packed3), mask);
1097 StoreU(raw8, d, raw + 8 * N);
1098
1099 const VU16 raw9 = And(ShiftRight<5>(packed4), mask);
1100 StoreU(raw9, d, raw + 9 * N);
1101
1102 const VU16 rawA = And(ShiftRight<10>(packed0), mask);
1103 StoreU(rawA, d, raw + 0xA * N);
1104
1105 const VU16 rawB = And(ShiftRight<10>(packed1), mask);
1106 StoreU(rawB, d, raw + 0xB * N);
1107
1108 const VU16 rawC = And(ShiftRight<10>(packed2), mask);
1109 StoreU(rawC, d, raw + 0xC * N);
1110
1111 const VU16 rawD = And(ShiftRight<10>(packed3), mask);
1112 StoreU(rawD, d, raw + 0xD * N);
1113
1114 const VU16 rawE = And(ShiftRight<10>(packed4), mask);
1115 StoreU(rawE, d, raw + 0xE * N);
1116
1117    // rawF is the concatenation of the upper bit of packed0..4.
1118 const VU16 down0 = ShiftRight<15>(packed0);
1119 const VU16 down1 = ShiftRight<15>(packed1);
1120 const VU16 hi1 = Set(d, 0x8000u);
1121 const VU16 p0 =
1122 Xor3(ShiftRight<13>(And(packed2, hi1)), Add(down1, down1), down0);
1123 const VU16 rawF = Xor3(ShiftRight<11>(And(packed4, hi1)),
1124 ShiftRight<12>(And(packed3, hi1)), p0);
1125 StoreU(rawF, d, raw + 0xF * N);
1126 }
1127}; // Pack16<5>
1128
1129template <>
1130struct Pack16<6> {
1131 template <class D>
1132 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1133 uint16_t* HWY_RESTRICT packed_out) const {
1134 using VU16 = Vec<decltype(d)>;
1135 const size_t N = Lanes(d);
1136 const VU16 raw0 = LoadU(d, raw + 0 * N);
1137 const VU16 raw1 = LoadU(d, raw + 1 * N);
1138 const VU16 raw2 = LoadU(d, raw + 2 * N);
1139 const VU16 raw3 = LoadU(d, raw + 3 * N);
1140 const VU16 raw4 = LoadU(d, raw + 4 * N);
1141 const VU16 raw5 = LoadU(d, raw + 5 * N);
1142 const VU16 raw6 = LoadU(d, raw + 6 * N);
1143 const VU16 raw7 = LoadU(d, raw + 7 * N);
1144 const VU16 raw8 = LoadU(d, raw + 8 * N);
1145 const VU16 raw9 = LoadU(d, raw + 9 * N);
1146 const VU16 rawA = LoadU(d, raw + 0xA * N);
1147 const VU16 rawB = LoadU(d, raw + 0xB * N);
1148 const VU16 rawC = LoadU(d, raw + 0xC * N);
1149 const VU16 rawD = LoadU(d, raw + 0xD * N);
1150 const VU16 rawE = LoadU(d, raw + 0xE * N);
1151 const VU16 rawF = LoadU(d, raw + 0xF * N);
1152
1153 const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3);
1154 const VU16 packed7 = Or(ShiftLeft<6>(rawF), rawB);
1155 // Three vectors, two 6-bit raw each; packed3 (12 bits) is spread over the
1156 // four remainder bits at the top of each vector.
1157 const VU16 packed0 = Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0);
1158 VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1);
1159 VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2);
1160 const VU16 packed4 = Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8);
1161 VU16 packed5 = Or(ShiftLeft<6>(rawD), raw9);
1162 VU16 packed6 = Or(ShiftLeft<6>(rawE), rawA);
1163
1164 const VU16 hi4 = Set(d, 0xF000u);
1165 packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4);
1166 packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4);
1167 packed5 = OrAnd(packed5, ShiftLeft<8>(packed7), hi4);
1168 packed6 = OrAnd(packed6, ShiftLeft<4>(packed7), hi4);
1169
1170 StoreU(packed0, d, packed_out + 0 * N);
1171 StoreU(packed1, d, packed_out + 1 * N);
1172 StoreU(packed2, d, packed_out + 2 * N);
1173 StoreU(packed4, d, packed_out + 3 * N);
1174 StoreU(packed5, d, packed_out + 4 * N);
1175 StoreU(packed6, d, packed_out + 5 * N);
1176 }
1177
1178 template <class D>
1179 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1180 uint16_t* HWY_RESTRICT raw) const {
1181 using VU16 = Vec<decltype(d)>;
1182 const size_t N = Lanes(d);
1183 const VU16 mask = Set(d, 0x3Fu); // Lowest 6 bits
1184
1185 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
1186 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
1187 const VU16 packed2 = LoadU(d, packed_in + 2 * N);
1188 const VU16 packed4 = LoadU(d, packed_in + 3 * N);
1189 const VU16 packed5 = LoadU(d, packed_in + 4 * N);
1190 const VU16 packed6 = LoadU(d, packed_in + 5 * N);
1191
1192 const VU16 raw0 = And(packed0, mask);
1193 StoreU(raw0, d, raw + 0 * N);
1194
1195 const VU16 raw1 = And(packed1, mask);
1196 StoreU(raw1, d, raw + 1 * N);
1197
1198 const VU16 raw2 = And(packed2, mask);
1199 StoreU(raw2, d, raw + 2 * N);
1200
1201 const VU16 raw4 = And(ShiftRight<6>(packed0), mask);
1202 StoreU(raw4, d, raw + 4 * N);
1203
1204 const VU16 raw5 = And(ShiftRight<6>(packed1), mask);
1205 StoreU(raw5, d, raw + 5 * N);
1206
1207 const VU16 raw6 = And(ShiftRight<6>(packed2), mask);
1208 StoreU(raw6, d, raw + 6 * N);
1209
1210 const VU16 raw8 = And(packed4, mask);
1211 StoreU(raw8, d, raw + 8 * N);
1212
1213 const VU16 raw9 = And(packed5, mask);
1214 StoreU(raw9, d, raw + 9 * N);
1215
1216 const VU16 rawA = And(packed6, mask);
1217 StoreU(rawA, d, raw + 0xA * N);
1218
1219 const VU16 rawC = And(ShiftRight<6>(packed4), mask);
1220 StoreU(rawC, d, raw + 0xC * N);
1221
1222 const VU16 rawD = And(ShiftRight<6>(packed5), mask);
1223 StoreU(rawD, d, raw + 0xD * N);
1224
1225 const VU16 rawE = And(ShiftRight<6>(packed6), mask);
1226 StoreU(rawE, d, raw + 0xE * N);
1227
1228 // packed3 is the concatenation of the four upper bits in packed0..2.
1229 const VU16 down0 = ShiftRight<12>(packed0);
1230 const VU16 down4 = ShiftRight<12>(packed4);
1231 const VU16 hi4 = Set(d, 0xF000u);
1232 const VU16 packed3 = Xor3(ShiftRight<4>(And(packed2, hi4)),
1233 ShiftRight<8>(And(packed1, hi4)), down0);
1234 const VU16 packed7 = Xor3(ShiftRight<4>(And(packed6, hi4)),
1235 ShiftRight<8>(And(packed5, hi4)), down4);
1236 const VU16 raw3 = And(packed3, mask);
1237 StoreU(raw3, d, raw + 3 * N);
1238
1239 const VU16 rawB = And(packed7, mask);
1240 StoreU(rawB, d, raw + 0xB * N);
1241
1242 const VU16 raw7 = ShiftRight<6>(packed3); // upper bits already zero
1243 StoreU(raw7, d, raw + 7 * N);
1244
1245 const VU16 rawF = ShiftRight<6>(packed7); // upper bits already zero
1246 StoreU(rawF, d, raw + 0xF * N);
1247 }
1248}; // Pack16<6>
1249
1250template <>
1251struct Pack16<7> {
1252 template <class D>
1253 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1254 uint16_t* HWY_RESTRICT packed_out) const {
1255 using VU16 = Vec<decltype(d)>;
1256 const size_t N = Lanes(d);
1257 const VU16 raw0 = LoadU(d, raw + 0 * N);
1258 const VU16 raw1 = LoadU(d, raw + 1 * N);
1259 const VU16 raw2 = LoadU(d, raw + 2 * N);
1260 const VU16 raw3 = LoadU(d, raw + 3 * N);
1261 const VU16 raw4 = LoadU(d, raw + 4 * N);
1262 const VU16 raw5 = LoadU(d, raw + 5 * N);
1263 const VU16 raw6 = LoadU(d, raw + 6 * N);
1264 const VU16 raw7 = LoadU(d, raw + 7 * N);
1265 const VU16 raw8 = LoadU(d, raw + 8 * N);
1266 const VU16 raw9 = LoadU(d, raw + 9 * N);
1267 const VU16 rawA = LoadU(d, raw + 0xA * N);
1268 const VU16 rawB = LoadU(d, raw + 0xB * N);
1269 const VU16 rawC = LoadU(d, raw + 0xC * N);
1270 const VU16 rawD = LoadU(d, raw + 0xD * N);
1271 const VU16 rawE = LoadU(d, raw + 0xE * N);
1272 const VU16 rawF = LoadU(d, raw + 0xF * N);
1273
1274 const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7);
1275 // Seven vectors, two 7-bit raw each; packed7 (14 bits) is spread over the
1276 // two remainder bits at the top of each vector.
1277 const VU16 packed0 = Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0);
1278 VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1);
1279 VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2);
1280 VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3);
1281 VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4);
1282 VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5);
1283 VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6);
1284
1285 const VU16 hi2 = Set(d, 0xC000u);
1286 packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2);
1287 packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2);
1288 packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2);
1289 packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2);
1290 packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2);
1291 packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2);
1292
1293 StoreU(packed0, d, packed_out + 0 * N);
1294 StoreU(packed1, d, packed_out + 1 * N);
1295 StoreU(packed2, d, packed_out + 2 * N);
1296 StoreU(packed3, d, packed_out + 3 * N);
1297 StoreU(packed4, d, packed_out + 4 * N);
1298 StoreU(packed5, d, packed_out + 5 * N);
1299 StoreU(packed6, d, packed_out + 6 * N);
1300 }
1301
1302 template <class D>
1303 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1304 uint16_t* HWY_RESTRICT raw) const {
1305 using VU16 = Vec<decltype(d)>;
1306 const size_t N = Lanes(d);
1307
1308 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1309 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1310 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1311 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1312 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1313 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1314 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1315
1316 const VU16 mask = Set(d, 0x7Fu); // Lowest 7 bits
1317
1318 const VU16 raw0 = And(packed0, mask);
1319 StoreU(raw0, d, raw + 0 * N);
1320
1321 const VU16 raw1 = And(packed1, mask);
1322 StoreU(raw1, d, raw + 1 * N);
1323
1324 const VU16 raw2 = And(packed2, mask);
1325 StoreU(raw2, d, raw + 2 * N);
1326
1327 const VU16 raw3 = And(packed3, mask);
1328 StoreU(raw3, d, raw + 3 * N);
1329
1330 const VU16 raw4 = And(packed4, mask);
1331 StoreU(raw4, d, raw + 4 * N);
1332
1333 const VU16 raw5 = And(packed5, mask);
1334 StoreU(raw5, d, raw + 5 * N);
1335
1336 const VU16 raw6 = And(packed6, mask);
1337 StoreU(raw6, d, raw + 6 * N);
1338
1339 const VU16 raw8 = And(ShiftRight<7>(packed0), mask);
1340 StoreU(raw8, d, raw + 8 * N);
1341
1342 const VU16 raw9 = And(ShiftRight<7>(packed1), mask);
1343 StoreU(raw9, d, raw + 9 * N);
1344
1345 const VU16 rawA = And(ShiftRight<7>(packed2), mask);
1346 StoreU(rawA, d, raw + 0xA * N);
1347
1348 const VU16 rawB = And(ShiftRight<7>(packed3), mask);
1349 StoreU(rawB, d, raw + 0xB * N);
1350
1351 const VU16 rawC = And(ShiftRight<7>(packed4), mask);
1352 StoreU(rawC, d, raw + 0xC * N);
1353
1354 const VU16 rawD = And(ShiftRight<7>(packed5), mask);
1355 StoreU(rawD, d, raw + 0xD * N);
1356
1357 const VU16 rawE = And(ShiftRight<7>(packed6), mask);
1358 StoreU(rawE, d, raw + 0xE * N);
1359
1360 // packed7 is the concatenation of the two upper bits in packed0..6.
1361 const VU16 down0 = ShiftRight<14>(packed0);
1362 const VU16 hi2 = Set(d, 0xC000u);
1363 const VU16 p0 = Xor3(ShiftRight<12>(And(packed1, hi2)),
1364 ShiftRight<10>(And(packed2, hi2)), down0);
1365 const VU16 p1 = Xor3(ShiftRight<8>(And(packed3, hi2)), //
1366 ShiftRight<6>(And(packed4, hi2)),
1367 ShiftRight<4>(And(packed5, hi2)));
1368 const VU16 packed7 = Xor3(ShiftRight<2>(And(packed6, hi2)), p1, p0);
1369
1370 const VU16 raw7 = And(packed7, mask);
1371 StoreU(raw7, d, raw + 7 * N);
1372
1373 const VU16 rawF = ShiftRight<7>(packed7); // upper bits already zero
1374 StoreU(rawF, d, raw + 0xF * N);
1375 }
1376}; // Pack16<7>
1377
1378template <>
1379struct Pack16<8> {
1380 template <class D>
1381 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1382 uint16_t* HWY_RESTRICT packed_out) const {
1383 using VU16 = Vec<decltype(d)>;
1384 const size_t N = Lanes(d);
1385 const VU16 raw0 = LoadU(d, raw + 0 * N);
1386 const VU16 raw1 = LoadU(d, raw + 1 * N);
1387 const VU16 raw2 = LoadU(d, raw + 2 * N);
1388 const VU16 raw3 = LoadU(d, raw + 3 * N);
1389 const VU16 raw4 = LoadU(d, raw + 4 * N);
1390 const VU16 raw5 = LoadU(d, raw + 5 * N);
1391 const VU16 raw6 = LoadU(d, raw + 6 * N);
1392 const VU16 raw7 = LoadU(d, raw + 7 * N);
1393 const VU16 raw8 = LoadU(d, raw + 8 * N);
1394 const VU16 raw9 = LoadU(d, raw + 9 * N);
1395 const VU16 rawA = LoadU(d, raw + 0xA * N);
1396 const VU16 rawB = LoadU(d, raw + 0xB * N);
1397 const VU16 rawC = LoadU(d, raw + 0xC * N);
1398 const VU16 rawD = LoadU(d, raw + 0xD * N);
1399 const VU16 rawE = LoadU(d, raw + 0xE * N);
1400 const VU16 rawF = LoadU(d, raw + 0xF * N);
1401
1402 // This is equivalent to ConcatEven with 8-bit lanes, but much more
1403 // efficient on RVV and slightly less efficient on SVE2.
1404 const VU16 packed0 = Or(ShiftLeft<8>(raw2), raw0);
1405 const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw1);
1406 const VU16 packed2 = Or(ShiftLeft<8>(raw6), raw4);
1407 const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw5);
1408 const VU16 packed4 = Or(ShiftLeft<8>(rawA), raw8);
1409 const VU16 packed5 = Or(ShiftLeft<8>(rawB), raw9);
1410 const VU16 packed6 = Or(ShiftLeft<8>(rawE), rawC);
1411 const VU16 packed7 = Or(ShiftLeft<8>(rawF), rawD);
1412
1413 StoreU(packed0, d, packed_out + 0 * N);
1414 StoreU(packed1, d, packed_out + 1 * N);
1415 StoreU(packed2, d, packed_out + 2 * N);
1416 StoreU(packed3, d, packed_out + 3 * N);
1417 StoreU(packed4, d, packed_out + 4 * N);
1418 StoreU(packed5, d, packed_out + 5 * N);
1419 StoreU(packed6, d, packed_out + 6 * N);
1420 StoreU(packed7, d, packed_out + 7 * N);
1421 }
1422
1423 template <class D>
1424 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1425 uint16_t* HWY_RESTRICT raw) const {
1426 using VU16 = Vec<decltype(d)>;
1427 const size_t N = Lanes(d);
1428
1429 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1430 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1431 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1432 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1433 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1434 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1435 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1436 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1437 const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
1438
1439 const VU16 raw0 = And(packed0, mask);
1440 StoreU(raw0, d, raw + 0 * N);
1441
1442 const VU16 raw1 = And(packed1, mask);
1443 StoreU(raw1, d, raw + 1 * N);
1444
1445 const VU16 raw2 = ShiftRight<8>(packed0); // upper bits already zero
1446 StoreU(raw2, d, raw + 2 * N);
1447
1448 const VU16 raw3 = ShiftRight<8>(packed1); // upper bits already zero
1449 StoreU(raw3, d, raw + 3 * N);
1450
1451 const VU16 raw4 = And(packed2, mask);
1452 StoreU(raw4, d, raw + 4 * N);
1453
1454 const VU16 raw5 = And(packed3, mask);
1455 StoreU(raw5, d, raw + 5 * N);
1456
1457 const VU16 raw6 = ShiftRight<8>(packed2); // upper bits already zero
1458 StoreU(raw6, d, raw + 6 * N);
1459
1460 const VU16 raw7 = ShiftRight<8>(packed3); // upper bits already zero
1461 StoreU(raw7, d, raw + 7 * N);
1462
1463 const VU16 raw8 = And(packed4, mask);
1464 StoreU(raw8, d, raw + 8 * N);
1465
1466 const VU16 raw9 = And(packed5, mask);
1467 StoreU(raw9, d, raw + 9 * N);
1468
1469 const VU16 rawA = ShiftRight<8>(packed4); // upper bits already zero
1470 StoreU(rawA, d, raw + 0xA * N);
1471
1472 const VU16 rawB = ShiftRight<8>(packed5); // upper bits already zero
1473 StoreU(rawB, d, raw + 0xB * N);
1474
1475 const VU16 rawC = And(packed6, mask);
1476 StoreU(rawC, d, raw + 0xC * N);
1477
1478 const VU16 rawD = And(packed7, mask);
1479 StoreU(rawD, d, raw + 0xD * N);
1480
1481 const VU16 rawE = ShiftRight<8>(packed6); // upper bits already zero
1482 StoreU(rawE, d, raw + 0xE * N);
1483
1484 const VU16 rawF = ShiftRight<8>(packed7); // upper bits already zero
1485 StoreU(rawF, d, raw + 0xF * N);
1486 }
1487}; // Pack16<8>
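For 8 bits, each packed lane simply holds two raw bytes, so the 16 raw vectors collapse into 8 packed vectors with no remnant handling. A scalar sketch of one lane of packed0 (illustrative only, not part of this header):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t r0 = 0x12, r2 = 0xAB;  // both fit in 8 bits
  const uint16_t packed0 = static_cast<uint16_t>((r2 << 8) | r0);
  assert((packed0 & 0xFFu) == r0);  // And(packed0, mask)
  assert((packed0 >> 8) == r2);     // ShiftRight<8>; upper bits already zero
  return 0;
}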
1488
1489template <>
1490struct Pack16<9> {
1491 template <class D>
1492 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1493 uint16_t* HWY_RESTRICT packed_out) const {
1494 using VU16 = Vec<decltype(d)>;
1495 const size_t N = Lanes(d);
1496 const VU16 raw0 = LoadU(d, raw + 0 * N);
1497 const VU16 raw1 = LoadU(d, raw + 1 * N);
1498 const VU16 raw2 = LoadU(d, raw + 2 * N);
1499 const VU16 raw3 = LoadU(d, raw + 3 * N);
1500 const VU16 raw4 = LoadU(d, raw + 4 * N);
1501 const VU16 raw5 = LoadU(d, raw + 5 * N);
1502 const VU16 raw6 = LoadU(d, raw + 6 * N);
1503 const VU16 raw7 = LoadU(d, raw + 7 * N);
1504 const VU16 raw8 = LoadU(d, raw + 8 * N);
1505 const VU16 raw9 = LoadU(d, raw + 9 * N);
1506 const VU16 rawA = LoadU(d, raw + 0xA * N);
1507 const VU16 rawB = LoadU(d, raw + 0xB * N);
1508 const VU16 rawC = LoadU(d, raw + 0xC * N);
1509 const VU16 rawD = LoadU(d, raw + 0xD * N);
1510 const VU16 rawE = LoadU(d, raw + 0xE * N);
1511 const VU16 rawF = LoadU(d, raw + 0xF * N);
1512 // 8 vectors, each with 9+7 bits; top 2 bits are concatenated into packed8.
1513 const VU16 packed0 = Or(ShiftLeft<9>(raw8), raw0);
1514 const VU16 packed1 = Or(ShiftLeft<9>(raw9), raw1);
1515 const VU16 packed2 = Or(ShiftLeft<9>(rawA), raw2);
1516 const VU16 packed3 = Or(ShiftLeft<9>(rawB), raw3);
1517 const VU16 packed4 = Or(ShiftLeft<9>(rawC), raw4);
1518 const VU16 packed5 = Or(ShiftLeft<9>(rawD), raw5);
1519 const VU16 packed6 = Or(ShiftLeft<9>(rawE), raw6);
1520 const VU16 packed7 = Or(ShiftLeft<9>(rawF), raw7);
1521
1522 // We could shift down, OR and shift up, but two shifts are typically more
1523 // expensive than AND, shift into position, and OR (which can be further
1524 // reduced via Xor3).
1525 const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9
1526 const VU16 part8 = ShiftRight<7>(And(raw8, mid2));
1527 const VU16 part9 = ShiftRight<5>(And(raw9, mid2));
1528 const VU16 partA = ShiftRight<3>(And(rawA, mid2));
1529 const VU16 partB = ShiftRight<1>(And(rawB, mid2));
1530 const VU16 partC = ShiftLeft<1>(And(rawC, mid2));
1531 const VU16 partD = ShiftLeft<3>(And(rawD, mid2));
1532 const VU16 partE = ShiftLeft<5>(And(rawE, mid2));
1533 const VU16 partF = ShiftLeft<7>(And(rawF, mid2));
1534 const VU16 packed8 = Xor3(Xor3(part8, part9, partA),
1535 Xor3(partB, partC, partD), Or(partE, partF));
1536
1537 StoreU(packed0, d, packed_out + 0 * N);
1538 StoreU(packed1, d, packed_out + 1 * N);
1539 StoreU(packed2, d, packed_out + 2 * N);
1540 StoreU(packed3, d, packed_out + 3 * N);
1541 StoreU(packed4, d, packed_out + 4 * N);
1542 StoreU(packed5, d, packed_out + 5 * N);
1543 StoreU(packed6, d, packed_out + 6 * N);
1544 StoreU(packed7, d, packed_out + 7 * N);
1545 StoreU(packed8, d, packed_out + 8 * N);
1546 }
1547
1548 template <class D>
1549 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1550 uint16_t* HWY_RESTRICT raw) const {
1551 using VU16 = Vec<decltype(d)>;
1552 const size_t N = Lanes(d);
1553
1554 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1555 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1556 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1557 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1558 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1559 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1560 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1561 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1562 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
1563
1564 const VU16 mask = Set(d, 0x1FFu); // Lowest 9 bits
1565
1566 const VU16 raw0 = And(packed0, mask);
1567 StoreU(raw0, d, raw + 0 * N);
1568
1569 const VU16 raw1 = And(packed1, mask);
1570 StoreU(raw1, d, raw + 1 * N);
1571
1572 const VU16 raw2 = And(packed2, mask);
1573 StoreU(raw2, d, raw + 2 * N);
1574
1575 const VU16 raw3 = And(packed3, mask);
1576 StoreU(raw3, d, raw + 3 * N);
1577
1578 const VU16 raw4 = And(packed4, mask);
1579 StoreU(raw4, d, raw + 4 * N);
1580
1581 const VU16 raw5 = And(packed5, mask);
1582 StoreU(raw5, d, raw + 5 * N);
1583
1584 const VU16 raw6 = And(packed6, mask);
1585 StoreU(raw6, d, raw + 6 * N);
1586
1587 const VU16 raw7 = And(packed7, mask);
1588 StoreU(raw7, d, raw + 7 * N);
1589
1590 const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9
1591 const VU16 raw8 =
1592 OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2);
1593 const VU16 raw9 =
1594 OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2);
1595 const VU16 rawA =
1596 OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2);
1597 const VU16 rawB =
1598 OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2);
1599 const VU16 rawC =
1600 OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2);
1601 const VU16 rawD =
1602 OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2);
1603 const VU16 rawE =
1604 OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2);
1605 const VU16 rawF =
1606 OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2);
1607
1608 StoreU(raw8, d, raw + 8 * N);
1609 StoreU(raw9, d, raw + 9 * N);
1610 StoreU(rawA, d, raw + 0xA * N);
1611 StoreU(rawB, d, raw + 0xB * N);
1612 StoreU(rawC, d, raw + 0xC * N);
1613 StoreU(rawD, d, raw + 0xD * N);
1614 StoreU(rawE, d, raw + 0xE * N);
1615 StoreU(rawF, d, raw + 0xF * N);
1616 }
1617}; // Pack16<9>
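For 9 bits, packed0..7 each hold one full 9-bit value plus the low 7 bits of a second one; the two bits that do not fit are gathered into consecutive 2-bit fields of packed8. A scalar sketch of the raw0/raw8 pair (illustrative only, not part of this header; names are made up):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t r0 = 0x1FF, r8 = 0x1AB;  // both fit in 9 bits
  const uint16_t packed0 = static_cast<uint16_t>((r8 << 9) | r0);      // keeps low 7 bits of r8
  const uint16_t packed8 = static_cast<uint16_t>((r8 & 0x180u) >> 7);  // top 2 bits of r8
  const uint16_t back0 = packed0 & 0x1FFu;
  const uint16_t back8 = static_cast<uint16_t>((packed0 >> 9) | ((packed8 << 7) & 0x180u));
  assert(back0 == r0 && back8 == r8);
  return 0;
}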
1618
1619template <>
1620struct Pack16<10> {
1621 template <class D>
1622 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1623 uint16_t* HWY_RESTRICT packed_out) const {
1624 using VU16 = Vec<decltype(d)>;
1625 const size_t N = Lanes(d);
1626 const VU16 raw0 = LoadU(d, raw + 0 * N);
1627 const VU16 raw1 = LoadU(d, raw + 1 * N);
1628 const VU16 raw2 = LoadU(d, raw + 2 * N);
1629 const VU16 raw3 = LoadU(d, raw + 3 * N);
1630 const VU16 raw4 = LoadU(d, raw + 4 * N);
1631 const VU16 raw5 = LoadU(d, raw + 5 * N);
1632 const VU16 raw6 = LoadU(d, raw + 6 * N);
1633 const VU16 raw7 = LoadU(d, raw + 7 * N);
1634 const VU16 raw8 = LoadU(d, raw + 8 * N);
1635 const VU16 raw9 = LoadU(d, raw + 9 * N);
1636 const VU16 rawA = LoadU(d, raw + 0xA * N);
1637 const VU16 rawB = LoadU(d, raw + 0xB * N);
1638 const VU16 rawC = LoadU(d, raw + 0xC * N);
1639 const VU16 rawD = LoadU(d, raw + 0xD * N);
1640 const VU16 rawE = LoadU(d, raw + 0xE * N);
1641 const VU16 rawF = LoadU(d, raw + 0xF * N);
1642
1643 // 8 vectors, each with 10+6 bits; top 4 bits are concatenated into
1644 // packed8 and packed9.
1645 const VU16 packed0 = Or(ShiftLeft<10>(raw8), raw0);
1646 const VU16 packed1 = Or(ShiftLeft<10>(raw9), raw1);
1647 const VU16 packed2 = Or(ShiftLeft<10>(rawA), raw2);
1648 const VU16 packed3 = Or(ShiftLeft<10>(rawB), raw3);
1649 const VU16 packed4 = Or(ShiftLeft<10>(rawC), raw4);
1650 const VU16 packed5 = Or(ShiftLeft<10>(rawD), raw5);
1651 const VU16 packed6 = Or(ShiftLeft<10>(rawE), raw6);
1652 const VU16 packed7 = Or(ShiftLeft<10>(rawF), raw7);
1653
1654 // We could shift down, OR and shift up, but two shifts are typically more
1655 // expensive than AND, shift into position, and OR (which can be further
1656 // reduced via Xor3).
1657 const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10
1658 const VU16 part8 = ShiftRight<6>(And(raw8, mid4));
1659 const VU16 part9 = ShiftRight<2>(And(raw9, mid4));
1660 const VU16 partA = ShiftLeft<2>(And(rawA, mid4));
1661 const VU16 partB = ShiftLeft<6>(And(rawB, mid4));
1662 const VU16 partC = ShiftRight<6>(And(rawC, mid4));
1663 const VU16 partD = ShiftRight<2>(And(rawD, mid4));
1664 const VU16 partE = ShiftLeft<2>(And(rawE, mid4));
1665 const VU16 partF = ShiftLeft<6>(And(rawF, mid4));
1666 const VU16 packed8 = Or(Xor3(part8, part9, partA), partB);
1667 const VU16 packed9 = Or(Xor3(partC, partD, partE), partF);
1668
1669 StoreU(packed0, d, packed_out + 0 * N);
1670 StoreU(packed1, d, packed_out + 1 * N);
1671 StoreU(packed2, d, packed_out + 2 * N);
1672 StoreU(packed3, d, packed_out + 3 * N);
1673 StoreU(packed4, d, packed_out + 4 * N);
1674 StoreU(packed5, d, packed_out + 5 * N);
1675 StoreU(packed6, d, packed_out + 6 * N);
1676 StoreU(packed7, d, packed_out + 7 * N);
1677 StoreU(packed8, d, packed_out + 8 * N);
1678 StoreU(packed9, d, packed_out + 9 * N);
1679 }
1680
1681 template <class D>
1682 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1683 uint16_t* HWY_RESTRICT raw) const {
1684 using VU16 = Vec<decltype(d)>;
1685 const size_t N = Lanes(d);
1686
1687 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1688 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1689 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1690 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1691 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1692 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1693 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1694 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1695 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
1696 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
1697
1698 const VU16 mask = Set(d, 0x3FFu); // Lowest 10 bits
1699
1700 const VU16 raw0 = And(packed0, mask);
1701 StoreU(raw0, d, raw + 0 * N);
1702
1703 const VU16 raw1 = And(packed1, mask);
1704 StoreU(raw1, d, raw + 1 * N);
1705
1706 const VU16 raw2 = And(packed2, mask);
1707 StoreU(raw2, d, raw + 2 * N);
1708
1709 const VU16 raw3 = And(packed3, mask);
1710 StoreU(raw3, d, raw + 3 * N);
1711
1712 const VU16 raw4 = And(packed4, mask);
1713 StoreU(raw4, d, raw + 4 * N);
1714
1715 const VU16 raw5 = And(packed5, mask);
1716 StoreU(raw5, d, raw + 5 * N);
1717
1718 const VU16 raw6 = And(packed6, mask);
1719 StoreU(raw6, d, raw + 6 * N);
1720
1721 const VU16 raw7 = And(packed7, mask);
1722 StoreU(raw7, d, raw + 7 * N);
1723
1724 const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10
1725 const VU16 raw8 =
1726 OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4);
1727 const VU16 raw9 =
1728 OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4);
1729 const VU16 rawA =
1730 OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4);
1731 const VU16 rawB =
1732 OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4);
1733 const VU16 rawC =
1734 OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4);
1735 const VU16 rawD =
1736 OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4);
1737 const VU16 rawE =
1738 OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4);
1739 const VU16 rawF =
1740 OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4);
1741
1742 StoreU(raw8, d, raw + 8 * N);
1743 StoreU(raw9, d, raw + 9 * N);
1744 StoreU(rawA, d, raw + 0xA * N);
1745 StoreU(rawB, d, raw + 0xB * N);
1746 StoreU(rawC, d, raw + 0xC * N);
1747 StoreU(rawD, d, raw + 0xD * N);
1748 StoreU(rawE, d, raw + 0xE * N);
1749 StoreU(rawF, d, raw + 0xF * N);
1750 }
1751}; // Pack16<10>
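For 10 bits, packed0..7 each hold one full value plus the low 6 bits of a second; the 4-bit remnants of raw8..B fill the four nibbles of packed8, and those of rawC..F fill packed9. A scalar sketch of the raw0/raw8 pair (illustrative only, not part of this header):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t r0 = 0x3FF, r8 = 0x2A5;  // both fit in 10 bits
  const uint16_t packed0 = static_cast<uint16_t>((r8 << 10) | r0);     // keeps low 6 bits of r8
  const uint16_t packed8 = static_cast<uint16_t>((r8 & 0x3C0u) >> 6);  // top 4 bits of r8
  const uint16_t back8 = static_cast<uint16_t>((packed0 >> 10) | ((packed8 << 6) & 0x3C0u));
  assert((packed0 & 0x3FFu) == r0 && back8 == r8);
  return 0;
}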
1752
1753template <>
1754struct Pack16<11> {
1755 template <class D>
1756 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1757 uint16_t* HWY_RESTRICT packed_out) const {
1758 using VU16 = Vec<decltype(d)>;
1759 const size_t N = Lanes(d);
1760 const VU16 raw0 = LoadU(d, raw + 0 * N);
1761 const VU16 raw1 = LoadU(d, raw + 1 * N);
1762 const VU16 raw2 = LoadU(d, raw + 2 * N);
1763 const VU16 raw3 = LoadU(d, raw + 3 * N);
1764 const VU16 raw4 = LoadU(d, raw + 4 * N);
1765 const VU16 raw5 = LoadU(d, raw + 5 * N);
1766 const VU16 raw6 = LoadU(d, raw + 6 * N);
1767 const VU16 raw7 = LoadU(d, raw + 7 * N);
1768 const VU16 raw8 = LoadU(d, raw + 8 * N);
1769 const VU16 raw9 = LoadU(d, raw + 9 * N);
1770 const VU16 rawA = LoadU(d, raw + 0xA * N);
1771 const VU16 rawB = LoadU(d, raw + 0xB * N);
1772 const VU16 rawC = LoadU(d, raw + 0xC * N);
1773 const VU16 rawD = LoadU(d, raw + 0xD * N);
1774 const VU16 rawE = LoadU(d, raw + 0xE * N);
1775 const VU16 rawF = LoadU(d, raw + 0xF * N);
1776
1777 // It is not obvious what the optimal partitioning looks like. To reduce the
1778 // number of constants, we want to minimize the number of distinct bit
1779 // lengths. 11+5 also requires 6-bit remnants with 4-bit leftovers.
1780 // 8+3 seems better: it is easier to scatter 3 bits into the MSBs.
1781 const VU16 lo8 = Set(d, 0xFFu);
1782
1783 // Lower 8 bits of all raw
1784 const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
1785 const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
1786 const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
1787 const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
1788 const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
1789 const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
1790 const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
1791 const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
1792
1793 StoreU(packed0, d, packed_out + 0 * N);
1794 StoreU(packed1, d, packed_out + 1 * N);
1795 StoreU(packed2, d, packed_out + 2 * N);
1796 StoreU(packed3, d, packed_out + 3 * N);
1797 StoreU(packed4, d, packed_out + 4 * N);
1798 StoreU(packed5, d, packed_out + 5 * N);
1799 StoreU(packed6, d, packed_out + 6 * N);
1800 StoreU(packed7, d, packed_out + 7 * N);
1801
1802 // Three vectors, each with five 3-bit remnants, plus rawF's remnant spread across their MSBs.
1803 const VU16 top0 = ShiftRight<8>(raw0);
1804 const VU16 top1 = ShiftRight<8>(raw1);
1805 const VU16 top2 = ShiftRight<8>(raw2);
1806 // Insert top raw bits into 3-bit groups within packed8..A. Moving the
1807 // mask along avoids masking each of raw0..E and enables OrAnd.
1808 VU16 next = Set(d, 0x38u); // 0x7 << 3
1809 VU16 packed8 = OrAnd(top0, ShiftRight<5>(raw3), next);
1810 VU16 packed9 = OrAnd(top1, ShiftRight<5>(raw4), next);
1811 VU16 packedA = OrAnd(top2, ShiftRight<5>(raw5), next);
1812 next = ShiftLeft<3>(next);
1813 packed8 = OrAnd(packed8, ShiftRight<2>(raw6), next);
1814 packed9 = OrAnd(packed9, ShiftRight<2>(raw7), next);
1815 packedA = OrAnd(packedA, ShiftRight<2>(raw8), next);
1816 next = ShiftLeft<3>(next);
1817 packed8 = OrAnd(packed8, Add(raw9, raw9), next);
1818 packed9 = OrAnd(packed9, Add(rawA, rawA), next);
1819 packedA = OrAnd(packedA, Add(rawB, rawB), next);
1820 next = ShiftLeft<3>(next);
1821 packed8 = OrAnd(packed8, ShiftLeft<4>(rawC), next);
1822 packed9 = OrAnd(packed9, ShiftLeft<4>(rawD), next);
1823 packedA = OrAnd(packedA, ShiftLeft<4>(rawE), next);
1824
1825 // Scatter upper 3 bits of rawF into the upper bits.
1826 next = ShiftLeft<3>(next); // = 0x8000u
1827 packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
1828 packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
1829 packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
1830
1831 StoreU(packed8, d, packed_out + 8 * N);
1832 StoreU(packed9, d, packed_out + 9 * N);
1833 StoreU(packedA, d, packed_out + 0xA * N);
1834 }
1835
1836 template <class D>
1837 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1838 uint16_t* HWY_RESTRICT raw) const {
1839 using VU16 = Vec<decltype(d)>;
1840 const size_t N = Lanes(d);
1841
1842 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1843 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1844 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1845 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1846 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1847 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1848 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1849 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1850 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
1851 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
1852 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
1853
1854 const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
1855
1856 const VU16 down0 = And(packed0, mask);
1857 const VU16 down1 = ShiftRight<8>(packed0);
1858 const VU16 down2 = And(packed1, mask);
1859 const VU16 down3 = ShiftRight<8>(packed1);
1860 const VU16 down4 = And(packed2, mask);
1861 const VU16 down5 = ShiftRight<8>(packed2);
1862 const VU16 down6 = And(packed3, mask);
1863 const VU16 down7 = ShiftRight<8>(packed3);
1864 const VU16 down8 = And(packed4, mask);
1865 const VU16 down9 = ShiftRight<8>(packed4);
1866 const VU16 downA = And(packed5, mask);
1867 const VU16 downB = ShiftRight<8>(packed5);
1868 const VU16 downC = And(packed6, mask);
1869 const VU16 downD = ShiftRight<8>(packed6);
1870 const VU16 downE = And(packed7, mask);
1871 const VU16 downF = ShiftRight<8>(packed7);
1872
1873 // Three bits from packed8..A, eight bits from down0..F.
1874 const VU16 hi3 = Set(d, 0x700u);
1875 const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3);
1876 const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3);
1877 const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3);
1878
1879 const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3);
1880 const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3);
1881 const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3);
1882
1883 const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3);
1884 const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3);
1885 const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3);
1886
1887 const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3);
1888 const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3);
1889 const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3);
1890
1891 const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3);
1892 const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3);
1893 const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3);
1894
1895 // Shift MSB into the top 3-of-11 and mask.
1896 const VU16 rawF = Or(downF, Xor3(And(ShiftRight<7>(packed8), hi3),
1897 And(ShiftRight<6>(packed9), hi3),
1898 And(ShiftRight<5>(packedA), hi3)));
1899
1900 StoreU(raw0, d, raw + 0 * N);
1901 StoreU(raw1, d, raw + 1 * N);
1902 StoreU(raw2, d, raw + 2 * N);
1903 StoreU(raw3, d, raw + 3 * N);
1904 StoreU(raw4, d, raw + 4 * N);
1905 StoreU(raw5, d, raw + 5 * N);
1906 StoreU(raw6, d, raw + 6 * N);
1907 StoreU(raw7, d, raw + 7 * N);
1908 StoreU(raw8, d, raw + 8 * N);
1909 StoreU(raw9, d, raw + 9 * N);
1910 StoreU(rawA, d, raw + 0xA * N);
1911 StoreU(rawB, d, raw + 0xB * N);
1912 StoreU(rawC, d, raw + 0xC * N);
1913 StoreU(rawD, d, raw + 0xD * N);
1914 StoreU(rawE, d, raw + 0xE * N);
1915 StoreU(rawF, d, raw + 0xF * N);
1916 }
1917}; // Pack16<11>
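In the 8+3 split above, packed0..7 carry only the low bytes of two raw values each, and the 3-bit remnants (bits [10:8]) are gathered into packed8..A; note that Add(raw9, raw9) in Pack is simply a 1-bit left shift. A scalar sketch of one lane for raw0 and raw1 (illustrative only, not part of this header; names are made up):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t r0 = 0x7A5, r1 = 0x6F0;  // both fit in 11 bits
  const uint16_t packed0 = static_cast<uint16_t>(((r1 & 0xFFu) << 8) | (r0 & 0xFFu));
  const uint16_t packed8 = static_cast<uint16_t>(r0 >> 8);  // remnant of r0 in bits [2:0]
  const uint16_t packed9 = static_cast<uint16_t>(r1 >> 8);  // remnant of r1 in bits [2:0]
  const uint16_t back0 = static_cast<uint16_t>((packed0 & 0xFFu) | ((packed8 << 8) & 0x700u));
  const uint16_t back1 = static_cast<uint16_t>((packed0 >> 8) | ((packed9 << 8) & 0x700u));
  assert(back0 == r0 && back1 == r1);
  return 0;
}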
1918
1919template <>
1920struct Pack16<12> {
1921 template <class D>
1922 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1923 uint16_t* HWY_RESTRICT packed_out) const {
1924 using VU16 = Vec<decltype(d)>;
1925 const size_t N = Lanes(d);
1926 const VU16 raw0 = LoadU(d, raw + 0 * N);
1927 const VU16 raw1 = LoadU(d, raw + 1 * N);
1928 const VU16 raw2 = LoadU(d, raw + 2 * N);
1929 const VU16 raw3 = LoadU(d, raw + 3 * N);
1930 const VU16 raw4 = LoadU(d, raw + 4 * N);
1931 const VU16 raw5 = LoadU(d, raw + 5 * N);
1932 const VU16 raw6 = LoadU(d, raw + 6 * N);
1933 const VU16 raw7 = LoadU(d, raw + 7 * N);
1934 const VU16 raw8 = LoadU(d, raw + 8 * N);
1935 const VU16 raw9 = LoadU(d, raw + 9 * N);
1936 const VU16 rawA = LoadU(d, raw + 0xA * N);
1937 const VU16 rawB = LoadU(d, raw + 0xB * N);
1938 const VU16 rawC = LoadU(d, raw + 0xC * N);
1939 const VU16 rawD = LoadU(d, raw + 0xD * N);
1940 const VU16 rawE = LoadU(d, raw + 0xE * N);
1941 const VU16 rawF = LoadU(d, raw + 0xF * N);
1942
1943 // 8 vectors, each with 12+4 bits; top 8 bits are concatenated into
1944 // packed8 to packedB.
1945 const VU16 packed0 = Or(ShiftLeft<12>(raw8), raw0);
1946 const VU16 packed1 = Or(ShiftLeft<12>(raw9), raw1);
1947 const VU16 packed2 = Or(ShiftLeft<12>(rawA), raw2);
1948 const VU16 packed3 = Or(ShiftLeft<12>(rawB), raw3);
1949 const VU16 packed4 = Or(ShiftLeft<12>(rawC), raw4);
1950 const VU16 packed5 = Or(ShiftLeft<12>(rawD), raw5);
1951 const VU16 packed6 = Or(ShiftLeft<12>(rawE), raw6);
1952 const VU16 packed7 = Or(ShiftLeft<12>(rawF), raw7);
1953
1954 // Masking after shifting left enables OrAnd.
1955 const VU16 hi8 = Set(d, 0xFF00u);
1956 const VU16 packed8 = OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8);
1957 const VU16 packed9 = OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8);
1958 const VU16 packedA = OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8);
1959 const VU16 packedB = OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8);
1960 StoreU(packed0, d, packed_out + 0 * N);
1961 StoreU(packed1, d, packed_out + 1 * N);
1962 StoreU(packed2, d, packed_out + 2 * N);
1963 StoreU(packed3, d, packed_out + 3 * N);
1964 StoreU(packed4, d, packed_out + 4 * N);
1965 StoreU(packed5, d, packed_out + 5 * N);
1966 StoreU(packed6, d, packed_out + 6 * N);
1967 StoreU(packed7, d, packed_out + 7 * N);
1968 StoreU(packed8, d, packed_out + 8 * N);
1969 StoreU(packed9, d, packed_out + 9 * N);
1970 StoreU(packedA, d, packed_out + 0xA * N);
1971 StoreU(packedB, d, packed_out + 0xB * N);
1972 }
1973
1974 template <class D>
1975 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1976 uint16_t* HWY_RESTRICT raw) const {
1977 using VU16 = Vec<decltype(d)>;
1978 const size_t N = Lanes(d);
1979
1980 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1981 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1982 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1983 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1984 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1985 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1986 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1987 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1988 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
1989 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
1990 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
1991 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
1992
1993 const VU16 mask = Set(d, 0xFFFu); // Lowest 12 bits
1994
1995 const VU16 raw0 = And(packed0, mask);
1996 StoreU(raw0, d, raw + 0 * N);
1997
1998 const VU16 raw1 = And(packed1, mask);
1999 StoreU(raw1, d, raw + 1 * N);
2000
2001 const VU16 raw2 = And(packed2, mask);
2002 StoreU(raw2, d, raw + 2 * N);
2003
2004 const VU16 raw3 = And(packed3, mask);
2005 StoreU(raw3, d, raw + 3 * N);
2006
2007 const VU16 raw4 = And(packed4, mask);
2008 StoreU(raw4, d, raw + 4 * N);
2009
2010 const VU16 raw5 = And(packed5, mask);
2011 StoreU(raw5, d, raw + 5 * N);
2012
2013 const VU16 raw6 = And(packed6, mask);
2014 StoreU(raw6, d, raw + 6 * N);
2015
2016 const VU16 raw7 = And(packed7, mask);
2017 StoreU(raw7, d, raw + 7 * N);
2018
2019 const VU16 mid8 = Set(d, 0xFF0u); // upper 8 in lower 12
2020 const VU16 raw8 =
2021 OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8);
2022 const VU16 raw9 =
2023 OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8);
2024 const VU16 rawA =
2025 OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8);
2026 const VU16 rawB =
2027 OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8);
2028 const VU16 rawC =
2029 OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8);
2030 const VU16 rawD =
2031 OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8);
2032 const VU16 rawE =
2033 OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8);
2034 const VU16 rawF =
2035 OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8);
2036 StoreU(raw8, d, raw + 8 * N);
2037 StoreU(raw9, d, raw + 9 * N);
2038 StoreU(rawA, d, raw + 0xA * N);
2039 StoreU(rawB, d, raw + 0xB * N);
2040 StoreU(rawC, d, raw + 0xC * N);
2041 StoreU(rawD, d, raw + 0xD * N);
2042 StoreU(rawE, d, raw + 0xE * N);
2043 StoreU(rawF, d, raw + 0xF * N);
2044 }
2045}; // Pack16<12>
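For 12 bits, packed0..7 each hold one full value plus the low 4 bits of a second; the remaining 8 bits of raw8..F are paired byte-wise into packed8..B. A scalar sketch of the raw0/raw8/raw9 lanes (illustrative only, not part of this header):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t r0 = 0xFFF, r8 = 0xABC, r9 = 0x123;  // all fit in 12 bits
  const uint16_t packed0 = static_cast<uint16_t>((r8 << 12) | r0);  // keeps low 4 bits of r8
  const uint16_t packed8 =
      static_cast<uint16_t>((r8 >> 4) | ((r9 << 4) & 0xFF00u));     // upper 8 bits of r8 and r9
  const uint16_t back8 =
      static_cast<uint16_t>((packed0 >> 12) | ((packed8 << 4) & 0xFF0u));
  assert((packed0 & 0xFFFu) == r0 && back8 == r8);
  return 0;
}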
2046
2047template <>
2048struct Pack16<13> {
2049 template <class D>
2050 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
2051 uint16_t* HWY_RESTRICT packed_out) const {
2052 using VU16 = Vec<decltype(d)>;
2053 const size_t N = Lanes(d);
2054 const VU16 raw0 = LoadU(d, raw + 0 * N);
2055 const VU16 raw1 = LoadU(d, raw + 1 * N);
2056 const VU16 raw2 = LoadU(d, raw + 2 * N);
2057 const VU16 raw3 = LoadU(d, raw + 3 * N);
2058 const VU16 raw4 = LoadU(d, raw + 4 * N);
2059 const VU16 raw5 = LoadU(d, raw + 5 * N);
2060 const VU16 raw6 = LoadU(d, raw + 6 * N);
2061 const VU16 raw7 = LoadU(d, raw + 7 * N);
2062 const VU16 raw8 = LoadU(d, raw + 8 * N);
2063 const VU16 raw9 = LoadU(d, raw + 9 * N);
2064 const VU16 rawA = LoadU(d, raw + 0xA * N);
2065 const VU16 rawB = LoadU(d, raw + 0xB * N);
2066 const VU16 rawC = LoadU(d, raw + 0xC * N);
2067 const VU16 rawD = LoadU(d, raw + 0xD * N);
2068 const VU16 rawE = LoadU(d, raw + 0xE * N);
2069 const VU16 rawF = LoadU(d, raw + 0xF * N);
2070
2071 // As with 11 bits, it is not obvious what the optimal partitioning looks
2072 // like. We similarly go with an 8+5 split.
2073 const VU16 lo8 = Set(d, 0xFFu);
2074
2075 // Lower 8 bits of all raw
2076 const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
2077 const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
2078 const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
2079 const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
2080 const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
2081 const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
2082 const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
2083 const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
2084
2085 StoreU(packed0, d, packed_out + 0 * N);
2086 StoreU(packed1, d, packed_out + 1 * N);
2087 StoreU(packed2, d, packed_out + 2 * N);
2088 StoreU(packed3, d, packed_out + 3 * N);
2089 StoreU(packed4, d, packed_out + 4 * N);
2090 StoreU(packed5, d, packed_out + 5 * N);
2091 StoreU(packed6, d, packed_out + 6 * N);
2092 StoreU(packed7, d, packed_out + 7 * N);
2093
2094 // Five vectors, each with three 5-bit remnants, plus rawF's remnant spread across their MSBs.
2095 const VU16 top0 = ShiftRight<8>(raw0);
2096 const VU16 top1 = ShiftRight<8>(raw1);
2097 const VU16 top2 = ShiftRight<8>(raw2);
2098 const VU16 top3 = ShiftRight<8>(raw3);
2099 const VU16 top4 = ShiftRight<8>(raw4);
2100
2101 // Insert top raw bits into 5-bit groups within packed8..C. Moving the
2102 // mask along avoids masking each of raw0..E and enables OrAnd.
2103 VU16 next = Set(d, 0x3E0u); // 0x1F << 5
2104 VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next);
2105 VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next);
2106 VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next);
2107 VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next);
2108 VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next);
2109 next = ShiftLeft<5>(next);
2110 packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next);
2111 packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next);
2112 packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next);
2113 packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next);
2114 packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next);
2115
2116 // Scatter upper 5 bits of rawF into the upper bits.
2117 next = ShiftLeft<5>(next); // = 0x8000u
2118 packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
2119 packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
2120 packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
2121 packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next);
2122 packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next);
2123
2124 StoreU(packed8, d, packed_out + 8 * N);
2125 StoreU(packed9, d, packed_out + 9 * N);
2126 StoreU(packedA, d, packed_out + 0xA * N);
2127 StoreU(packedB, d, packed_out + 0xB * N);
2128 StoreU(packedC, d, packed_out + 0xC * N);
2129 }
2130
2131 template <class D>
2132 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
2133 uint16_t* HWY_RESTRICT raw) const {
2134 using VU16 = Vec<decltype(d)>;
2135 const size_t N = Lanes(d);
2136
2137 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
2138 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
2139 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
2140 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
2141 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
2142 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
2143 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
2144 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
2145 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
2146 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
2147 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
2148 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
2149 const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
2150
2151 const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
2152
2153 const VU16 down0 = And(packed0, mask);
2154 const VU16 down1 = ShiftRight<8>(packed0);
2155 const VU16 down2 = And(packed1, mask);
2156 const VU16 down3 = ShiftRight<8>(packed1);
2157 const VU16 down4 = And(packed2, mask);
2158 const VU16 down5 = ShiftRight<8>(packed2);
2159 const VU16 down6 = And(packed3, mask);
2160 const VU16 down7 = ShiftRight<8>(packed3);
2161 const VU16 down8 = And(packed4, mask);
2162 const VU16 down9 = ShiftRight<8>(packed4);
2163 const VU16 downA = And(packed5, mask);
2164 const VU16 downB = ShiftRight<8>(packed5);
2165 const VU16 downC = And(packed6, mask);
2166 const VU16 downD = ShiftRight<8>(packed6);
2167 const VU16 downE = And(packed7, mask);
2168 const VU16 downF = ShiftRight<8>(packed7);
2169
2170 // Upper five bits from packed8..C, eight bits from down0..F.
2171 const VU16 hi5 = Set(d, 0x1F00u);
2172 const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5);
2173 const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5);
2174 const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5);
2175 const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5);
2176 const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5);
2177
2178 const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5);
2179 const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5);
2180 const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5);
2181 const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packedB), hi5);
2182 const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedC), hi5);
2183
2184 const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5);
2185 const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5);
2186 const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5);
2187 const VU16 rawD = OrAnd(downD, ShiftRight<2>(packedB), hi5);
2188 const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedC), hi5);
2189
2190 // Shift MSB into the top 5-of-13 and mask.
2191 const VU16 p0 = Xor3(And(ShiftRight<7>(packed8), hi5), //
2192 And(ShiftRight<6>(packed9), hi5),
2193 And(ShiftRight<5>(packedA), hi5));
2194 const VU16 p1 = Xor3(And(ShiftRight<4>(packedB), hi5),
2195 And(ShiftRight<3>(packedC), hi5), downF);
2196 const VU16 rawF = Or(p0, p1);
2197
2198 StoreU(raw0, d, raw + 0 * N);
2199 StoreU(raw1, d, raw + 1 * N);
2200 StoreU(raw2, d, raw + 2 * N);
2201 StoreU(raw3, d, raw + 3 * N);
2202 StoreU(raw4, d, raw + 4 * N);
2203 StoreU(raw5, d, raw + 5 * N);
2204 StoreU(raw6, d, raw + 6 * N);
2205 StoreU(raw7, d, raw + 7 * N);
2206 StoreU(raw8, d, raw + 8 * N);
2207 StoreU(raw9, d, raw + 9 * N);
2208 StoreU(rawA, d, raw + 0xA * N);
2209 StoreU(rawB, d, raw + 0xB * N);
2210 StoreU(rawC, d, raw + 0xC * N);
2211 StoreU(rawD, d, raw + 0xD * N);
2212 StoreU(rawE, d, raw + 0xE * N);
2213 StoreU(rawF, d, raw + 0xF * N);
2214 }
2215}; // Pack16<13>
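The 13-bit case mirrors the 11-bit one: packed0..7 carry the low bytes, the 5-bit remnants land in 5-bit fields of packed8..C, and rawF's remnant occupies their MSBs. A scalar sketch of packed8's first two fields (illustrative only, not part of this header; names are made up):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t r0 = 0x1EA5, r5 = 0x1B3C;  // both fit in 13 bits
  // packed8 bits [4:0] hold r0's bits [12:8]; bits [9:5] hold r5's bits [12:8].
  const uint16_t packed8 = static_cast<uint16_t>((r0 >> 8) | ((r5 >> 3) & 0x3E0u));
  const uint16_t back0 = static_cast<uint16_t>((r0 & 0xFFu) | ((packed8 << 8) & 0x1F00u));
  const uint16_t back5_hi = static_cast<uint16_t>((packed8 << 3) & 0x1F00u);
  assert(back0 == r0 && back5_hi == (r5 & 0x1F00u));
  return 0;
}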
2216
2217template <>
2218struct Pack16<14> {
2219 template <class D>
2220 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
2221 uint16_t* HWY_RESTRICT packed_out) const {
2222 using VU16 = Vec<decltype(d)>;
2223 const size_t N = Lanes(d);
2224 const VU16 raw0 = LoadU(d, raw + 0 * N);
2225 const VU16 raw1 = LoadU(d, raw + 1 * N);
2226 const VU16 raw2 = LoadU(d, raw + 2 * N);
2227 const VU16 raw3 = LoadU(d, raw + 3 * N);
2228 const VU16 raw4 = LoadU(d, raw + 4 * N);
2229 const VU16 raw5 = LoadU(d, raw + 5 * N);
2230 const VU16 raw6 = LoadU(d, raw + 6 * N);
2231 const VU16 raw7 = LoadU(d, raw + 7 * N);
2232 const VU16 raw8 = LoadU(d, raw + 8 * N);
2233 const VU16 raw9 = LoadU(d, raw + 9 * N);
2234 const VU16 rawA = LoadU(d, raw + 0xA * N);
2235 const VU16 rawB = LoadU(d, raw + 0xB * N);
2236 const VU16 rawC = LoadU(d, raw + 0xC * N);
2237 const VU16 rawD = LoadU(d, raw + 0xD * N);
2238 const VU16 rawE = LoadU(d, raw + 0xE * N);
2239 const VU16 rawF = LoadU(d, raw + 0xF * N);
2240
2241 // 14 vectors, each with 14+2 bits; two raw vectors are scattered
2242 // across the upper 2 bits.
2243 const VU16 hi2 = Set(d, 0xC000u);
2244 const VU16 packed0 = Or(raw0, ShiftLeft<14>(rawE));
2245 const VU16 packed1 = OrAnd(raw1, ShiftLeft<12>(rawE), hi2);
2246 const VU16 packed2 = OrAnd(raw2, ShiftLeft<10>(rawE), hi2);
2247 const VU16 packed3 = OrAnd(raw3, ShiftLeft<8>(rawE), hi2);
2248 const VU16 packed4 = OrAnd(raw4, ShiftLeft<6>(rawE), hi2);
2249 const VU16 packed5 = OrAnd(raw5, ShiftLeft<4>(rawE), hi2);
2250 const VU16 packed6 = OrAnd(raw6, ShiftLeft<2>(rawE), hi2);
2251 const VU16 packed7 = Or(raw7, ShiftLeft<14>(rawF));
2252 const VU16 packed8 = OrAnd(raw8, ShiftLeft<12>(rawF), hi2);
2253 const VU16 packed9 = OrAnd(raw9, ShiftLeft<10>(rawF), hi2);
2254 const VU16 packedA = OrAnd(rawA, ShiftLeft<8>(rawF), hi2);
2255 const VU16 packedB = OrAnd(rawB, ShiftLeft<6>(rawF), hi2);
2256 const VU16 packedC = OrAnd(rawC, ShiftLeft<4>(rawF), hi2);
2257 const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi2);
2258
2259 StoreU(packed0, d, packed_out + 0 * N);
2260 StoreU(packed1, d, packed_out + 1 * N);
2261 StoreU(packed2, d, packed_out + 2 * N);
2262 StoreU(packed3, d, packed_out + 3 * N);
2263 StoreU(packed4, d, packed_out + 4 * N);
2264 StoreU(packed5, d, packed_out + 5 * N);
2265 StoreU(packed6, d, packed_out + 6 * N);
2266 StoreU(packed7, d, packed_out + 7 * N);
2267 StoreU(packed8, d, packed_out + 8 * N);
2268 StoreU(packed9, d, packed_out + 9 * N);
2269 StoreU(packedA, d, packed_out + 0xA * N);
2270 StoreU(packedB, d, packed_out + 0xB * N);
2271 StoreU(packedC, d, packed_out + 0xC * N);
2272 StoreU(packedD, d, packed_out + 0xD * N);
2273 }
2274
2275 template <class D>
2276 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
2277 uint16_t* HWY_RESTRICT raw) const {
2278 using VU16 = Vec<decltype(d)>;
2279 const size_t N = Lanes(d);
2280
2281 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
2282 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
2283 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
2284 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
2285 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
2286 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
2287 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
2288 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
2289 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
2290 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
2291 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
2292 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
2293 const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
2294 const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
2295
2296 const VU16 mask = Set(d, 0x3FFFu); // Lowest 14 bits
2297
2298 const VU16 raw0 = And(packed0, mask);
2299 StoreU(raw0, d, raw + 0 * N);
2300
2301 const VU16 raw1 = And(packed1, mask);
2302 StoreU(raw1, d, raw + 1 * N);
2303
2304 const VU16 raw2 = And(packed2, mask);
2305 StoreU(raw2, d, raw + 2 * N);
2306
2307 const VU16 raw3 = And(packed3, mask);
2308 StoreU(raw3, d, raw + 3 * N);
2309
2310 const VU16 raw4 = And(packed4, mask);
2311 StoreU(raw4, d, raw + 4 * N);
2312
2313 const VU16 raw5 = And(packed5, mask);
2314 StoreU(raw5, d, raw + 5 * N);
2315
2316 const VU16 raw6 = And(packed6, mask);
2317 StoreU(raw6, d, raw + 6 * N);
2318
2319 const VU16 raw7 = And(packed7, mask);
2320 StoreU(raw7, d, raw + 7 * N);
2321
2322 const VU16 raw8 = And(packed8, mask);
2323 StoreU(raw8, d, raw + 8 * N);
2324
2325 const VU16 raw9 = And(packed9, mask);
2326 StoreU(raw9, d, raw + 9 * N);
2327
2328 const VU16 rawA = And(packedA, mask);
2329 StoreU(rawA, d, raw + 0xA * N);
2330
2331 const VU16 rawB = And(packedB, mask);
2332 StoreU(rawB, d, raw + 0xB * N);
2333
2334 const VU16 rawC = And(packedC, mask);
2335 StoreU(rawC, d, raw + 0xC * N);
2336
2337 const VU16 rawD = And(packedD, mask);
2338 StoreU(rawD, d, raw + 0xD * N);
2339
2340 // rawE is the concatenation of the top two bits in packed0..6.
2341 const VU16 E0 = Xor3(ShiftRight<14>(packed0), //
2342 ShiftRight<12>(AndNot(mask, packed1)),
2343 ShiftRight<10>(AndNot(mask, packed2)));
2344 const VU16 E1 = Xor3(ShiftRight<8>(AndNot(mask, packed3)),
2345 ShiftRight<6>(AndNot(mask, packed4)),
2346 ShiftRight<4>(AndNot(mask, packed5)));
2347 const VU16 rawE = Xor3(ShiftRight<2>(AndNot(mask, packed6)), E0, E1);
2348 const VU16 F0 = Xor3(ShiftRight<14>(AndNot(mask, packed7)),
2349 ShiftRight<12>(AndNot(mask, packed8)),
2350 ShiftRight<10>(AndNot(mask, packed9)));
2351 const VU16 F1 = Xor3(ShiftRight<8>(AndNot(mask, packedA)),
2352 ShiftRight<6>(AndNot(mask, packedB)),
2353 ShiftRight<4>(AndNot(mask, packedC)));
2354 const VU16 rawF = Xor3(ShiftRight<2>(AndNot(mask, packedD)), F0, F1);
2355 StoreU(rawE, d, raw + 0xE * N);
2356 StoreU(rawF, d, raw + 0xF * N);
2357 }
2358}; // Pack16<14>
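For 14 bits, fourteen packed vectors each keep one full value and donate their top 2 bits to hold one 2-bit slice of rawE (packed0..6) or rawF (packed7..D). A scalar sketch of one such slice (illustrative only, not part of this header):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t r1 = 0x3FFF, rE = 0x2A5B;  // both fit in 14 bits
  // packed1 holds r1 plus bits [3:2] of rE in its top two bits.
  const uint16_t packed1 = static_cast<uint16_t>(r1 | ((rE << 12) & 0xC000u));
  const uint16_t backE_slice = static_cast<uint16_t>((packed1 & 0xC000u) >> 12);
  assert((packed1 & 0x3FFFu) == r1 && backE_slice == (rE & 0xCu));
  return 0;
}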
2359
2360template <>
2361struct Pack16<15> {
2362 template <class D>
2363 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
2364 uint16_t* HWY_RESTRICT packed_out) const {
2365 using VU16 = Vec<decltype(d)>;
2366 const size_t N = Lanes(d);
2367 const VU16 raw0 = LoadU(d, raw + 0 * N);
2368 const VU16 raw1 = LoadU(d, raw + 1 * N);
2369 const VU16 raw2 = LoadU(d, raw + 2 * N);
2370 const VU16 raw3 = LoadU(d, raw + 3 * N);
2371 const VU16 raw4 = LoadU(d, raw + 4 * N);
2372 const VU16 raw5 = LoadU(d, raw + 5 * N);
2373 const VU16 raw6 = LoadU(d, raw + 6 * N);
2374 const VU16 raw7 = LoadU(d, raw + 7 * N);
2375 const VU16 raw8 = LoadU(d, raw + 8 * N);
2376 const VU16 raw9 = LoadU(d, raw + 9 * N);
2377 const VU16 rawA = LoadU(d, raw + 0xA * N);
2378 const VU16 rawB = LoadU(d, raw + 0xB * N);
2379 const VU16 rawC = LoadU(d, raw + 0xC * N);
2380 const VU16 rawD = LoadU(d, raw + 0xD * N);
2381 const VU16 rawE = LoadU(d, raw + 0xE * N);
2382 const VU16 rawF = LoadU(d, raw + 0xF * N);
2383
2384 // 15 vectors, each with 15+1 bits; the remaining raw vector (rawF) is
2385 // scattered across their upper bits.
2386 const VU16 hi1 = Set(d, 0x8000u);
2387 const VU16 packed0 = Or(raw0, ShiftLeft<15>(rawF));
2388 const VU16 packed1 = OrAnd(raw1, ShiftLeft<14>(rawF), hi1);
2389 const VU16 packed2 = OrAnd(raw2, ShiftLeft<13>(rawF), hi1);
2390 const VU16 packed3 = OrAnd(raw3, ShiftLeft<12>(rawF), hi1);
2391 const VU16 packed4 = OrAnd(raw4, ShiftLeft<11>(rawF), hi1);
2392 const VU16 packed5 = OrAnd(raw5, ShiftLeft<10>(rawF), hi1);
2393 const VU16 packed6 = OrAnd(raw6, ShiftLeft<9>(rawF), hi1);
2394 const VU16 packed7 = OrAnd(raw7, ShiftLeft<8>(rawF), hi1);
2395 const VU16 packed8 = OrAnd(raw8, ShiftLeft<7>(rawF), hi1);
2396 const VU16 packed9 = OrAnd(raw9, ShiftLeft<6>(rawF), hi1);
2397 const VU16 packedA = OrAnd(rawA, ShiftLeft<5>(rawF), hi1);
2398 const VU16 packedB = OrAnd(rawB, ShiftLeft<4>(rawF), hi1);
2399 const VU16 packedC = OrAnd(rawC, ShiftLeft<3>(rawF), hi1);
2400 const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi1);
2401 const VU16 packedE = OrAnd(rawE, ShiftLeft<1>(rawF), hi1);
2402
2403 StoreU(packed0, d, packed_out + 0 * N);
2404 StoreU(packed1, d, packed_out + 1 * N);
2405 StoreU(packed2, d, packed_out + 2 * N);
2406 StoreU(packed3, d, packed_out + 3 * N);
2407 StoreU(packed4, d, packed_out + 4 * N);
2408 StoreU(packed5, d, packed_out + 5 * N);
2409 StoreU(packed6, d, packed_out + 6 * N);
2410 StoreU(packed7, d, packed_out + 7 * N);
2411 StoreU(packed8, d, packed_out + 8 * N);
2412 StoreU(packed9, d, packed_out + 9 * N);
2413 StoreU(packedA, d, packed_out + 0xA * N);
2414 StoreU(packedB, d, packed_out + 0xB * N);
2415 StoreU(packedC, d, packed_out + 0xC * N);
2416 StoreU(packedD, d, packed_out + 0xD * N);
2417 StoreU(packedE, d, packed_out + 0xE * N);
2418 }
2419
2420 template <class D>
2421 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
2422 uint16_t* HWY_RESTRICT raw) const {
2423 using VU16 = Vec<decltype(d)>;
2424 const size_t N = Lanes(d);
2425
2426 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
2427 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
2428 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
2429 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
2430 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
2431 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
2432 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
2433 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
2434 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
2435 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
2436 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
2437 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
2438 const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
2439 const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
2440 const VU16 packedE = BitCast(d, LoadU(d, packed_in + 0xE * N));
2441
2442 const VU16 mask = Set(d, 0x7FFFu); // Lowest 15 bits
2443
2444 const VU16 raw0 = And(packed0, mask);
2445 StoreU(raw0, d, raw + 0 * N);
2446
2447 const VU16 raw1 = And(packed1, mask);
2448 StoreU(raw1, d, raw + 1 * N);
2449
2450 const VU16 raw2 = And(packed2, mask);
2451 StoreU(raw2, d, raw + 2 * N);
2452
2453 const VU16 raw3 = And(packed3, mask);
2454 StoreU(raw3, d, raw + 3 * N);
2455
2456 const VU16 raw4 = And(packed4, mask);
2457 StoreU(raw4, d, raw + 4 * N);
2458
2459 const VU16 raw5 = And(packed5, mask);
2460 StoreU(raw5, d, raw + 5 * N);
2461
2462 const VU16 raw6 = And(packed6, mask);
2463 StoreU(raw6, d, raw + 6 * N);
2464
2465 const VU16 raw7 = And(packed7, mask);
2466 StoreU(raw7, d, raw + 7 * N);
2467
2468 const VU16 raw8 = And(packed8, mask);
2469 StoreU(raw8, d, raw + 8 * N);
2470
2471 const VU16 raw9 = And(packed9, mask);
2472 StoreU(raw9, d, raw + 9 * N);
2473
2474 const VU16 rawA = And(packedA, mask);
2475 StoreU(rawA, d, raw + 0xA * N);
2476
2477 const VU16 rawB = And(packedB, mask);
2478 StoreU(rawB, d, raw + 0xB * N);
2479
2480 const VU16 rawC = And(packedC, mask);
2481 StoreU(rawC, d, raw + 0xC * N);
2482
2483 const VU16 rawD = And(packedD, mask);
2484 StoreU(rawD, d, raw + 0xD * N);
2485
2486 const VU16 rawE = And(packedE, mask);
2487 StoreU(rawE, d, raw + 0xE * N);
2488
2489 // rawF is the concatenation of the top bit in packed0..E.
2490 const VU16 F0 = Xor3(ShiftRight<15>(packed0), //
2491 ShiftRight<14>(AndNot(mask, packed1)),
2492 ShiftRight<13>(AndNot(mask, packed2)));
2493 const VU16 F1 = Xor3(ShiftRight<12>(AndNot(mask, packed3)),
2494 ShiftRight<11>(AndNot(mask, packed4)),
2495 ShiftRight<10>(AndNot(mask, packed5)));
2496 const VU16 F2 = Xor3(ShiftRight<9>(AndNot(mask, packed6)),
2497 ShiftRight<8>(AndNot(mask, packed7)),
2498 ShiftRight<7>(AndNot(mask, packed8)));
2499 const VU16 F3 = Xor3(ShiftRight<6>(AndNot(mask, packed9)),
2500 ShiftRight<5>(AndNot(mask, packedA)),
2501 ShiftRight<4>(AndNot(mask, packedB)));
2502 const VU16 F4 = Xor3(ShiftRight<3>(AndNot(mask, packedC)),
2503 ShiftRight<2>(AndNot(mask, packedD)),
2504 ShiftRight<1>(AndNot(mask, packedE)));
2505 const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4));
2506 StoreU(rawF, d, raw + 0xF * N);
2507 }
2508}; // Pack16<15>
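For 15 bits, each of the fifteen packed vectors spares exactly one bit, so bit i of rawF rides in the MSB of packed_i. A scalar sketch of one such bit (illustrative only, not part of this header):

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t r3 = 0x7ABC, rF = 0x2AAA;  // both fit in 15 bits
  // packed3 holds r3 plus bit 3 of rF in its MSB.
  const uint16_t packed3 = static_cast<uint16_t>(r3 | ((rF << 12) & 0x8000u));
  const uint16_t backF_bit3 = static_cast<uint16_t>((packed3 & 0x8000u) >> 12);
  assert((packed3 & 0x7FFFu) == r3 && backF_bit3 == (rF & 0x8u));
  return 0;
}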
2509
2510template <>
2511struct Pack16<16> {
2512 template <class D>
2513 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
2514 uint16_t* HWY_RESTRICT packed_out) const {
2515 using VU16 = Vec<decltype(d)>;
2516 const size_t N = Lanes(d);
2517 const VU16 raw0 = LoadU(d, raw + 0 * N);
2518 const VU16 raw1 = LoadU(d, raw + 1 * N);
2519 const VU16 raw2 = LoadU(d, raw + 2 * N);
2520 const VU16 raw3 = LoadU(d, raw + 3 * N);
2521 const VU16 raw4 = LoadU(d, raw + 4 * N);
2522 const VU16 raw5 = LoadU(d, raw + 5 * N);
2523 const VU16 raw6 = LoadU(d, raw + 6 * N);
2524 const VU16 raw7 = LoadU(d, raw + 7 * N);
2525 const VU16 raw8 = LoadU(d, raw + 8 * N);
2526 const VU16 raw9 = LoadU(d, raw + 9 * N);
2527 const VU16 rawA = LoadU(d, raw + 0xA * N);
2528 const VU16 rawB = LoadU(d, raw + 0xB * N);
2529 const VU16 rawC = LoadU(d, raw + 0xC * N);
2530 const VU16 rawD = LoadU(d, raw + 0xD * N);
2531 const VU16 rawE = LoadU(d, raw + 0xE * N);
2532 const VU16 rawF = LoadU(d, raw + 0xF * N);
2533
2534 StoreU(raw0, d, packed_out + 0 * N);
2535 StoreU(raw1, d, packed_out + 1 * N);
2536 StoreU(raw2, d, packed_out + 2 * N);
2537 StoreU(raw3, d, packed_out + 3 * N);
2538 StoreU(raw4, d, packed_out + 4 * N);
2539 StoreU(raw5, d, packed_out + 5 * N);
2540 StoreU(raw6, d, packed_out + 6 * N);
2541 StoreU(raw7, d, packed_out + 7 * N);
2542 StoreU(raw8, d, packed_out + 8 * N);
2543 StoreU(raw9, d, packed_out + 9 * N);
2544 StoreU(rawA, d, packed_out + 0xA * N);
2545 StoreU(rawB, d, packed_out + 0xB * N);
2546 StoreU(rawC, d, packed_out + 0xC * N);
2547 StoreU(rawD, d, packed_out + 0xD * N);
2548 StoreU(rawE, d, packed_out + 0xE * N);
2549 StoreU(rawF, d, packed_out + 0xF * N);
2550 }
2551
2552 template <class D>
2553 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
2554 uint16_t* HWY_RESTRICT raw) const {
2555 using VU16 = Vec<decltype(d)>;
2556 const size_t N = Lanes(d);
2557
2558 const VU16 raw0 = BitCast(d, LoadU(d, packed_in + 0 * N));
2559 const VU16 raw1 = BitCast(d, LoadU(d, packed_in + 1 * N));
2560 const VU16 raw2 = BitCast(d, LoadU(d, packed_in + 2 * N));
2561 const VU16 raw3 = BitCast(d, LoadU(d, packed_in + 3 * N));
2562 const VU16 raw4 = BitCast(d, LoadU(d, packed_in + 4 * N));
2563 const VU16 raw5 = BitCast(d, LoadU(d, packed_in + 5 * N));
2564 const VU16 raw6 = BitCast(d, LoadU(d, packed_in + 6 * N));
2565 const VU16 raw7 = BitCast(d, LoadU(d, packed_in + 7 * N));
2566 const VU16 raw8 = BitCast(d, LoadU(d, packed_in + 8 * N));
2567 const VU16 raw9 = BitCast(d, LoadU(d, packed_in + 9 * N));
2568 const VU16 rawA = BitCast(d, LoadU(d, packed_in + 0xA * N));
2569 const VU16 rawB = BitCast(d, LoadU(d, packed_in + 0xB * N));
2570 const VU16 rawC = BitCast(d, LoadU(d, packed_in + 0xC * N));
2571 const VU16 rawD = BitCast(d, LoadU(d, packed_in + 0xD * N));
2572 const VU16 rawE = BitCast(d, LoadU(d, packed_in + 0xE * N));
2573 const VU16 rawF = BitCast(d, LoadU(d, packed_in + 0xF * N));
2574
2575 StoreU(raw0, d, raw + 0 * N);
2576 StoreU(raw1, d, raw + 1 * N);
2577 StoreU(raw2, d, raw + 2 * N);
2578 StoreU(raw3, d, raw + 3 * N);
2579 StoreU(raw4, d, raw + 4 * N);
2580 StoreU(raw5, d, raw + 5 * N);
2581 StoreU(raw6, d, raw + 6 * N);
2582 StoreU(raw7, d, raw + 7 * N);
2583 StoreU(raw8, d, raw + 8 * N);
2584 StoreU(raw9, d, raw + 9 * N);
2585 StoreU(rawA, d, raw + 0xA * N);
2586 StoreU(rawB, d, raw + 0xB * N);
2587 StoreU(rawC, d, raw + 0xC * N);
2588 StoreU(rawD, d, raw + 0xD * N);
2589 StoreU(rawE, d, raw + 0xE * N);
2590 StoreU(rawF, d, raw + 0xF * N);
2591 }
2592}; // Pack16<16>
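Pack16<16> is the degenerate case: it only copies, so raw and packed storage are the same size. As the loads and stores above show, Pack reads 16 * Lanes(d) raw lanes and writes kBits * Lanes(d) packed lanes, and Unpack reverses that. A usage sketch for kBits = 12 (illustrative only; the include path is an assumption, and real code would normally route through foreach_target.h for per-target dispatch):

#include "hwy/highway.h"
#include "hwy/contrib/bitpack/bit_pack-inl.h"  // path is an assumption

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// raw_in/raw_out hold 16 * Lanes(d) values below 2^12; packed holds 12 * Lanes(d).
void RoundTrip12(const uint16_t* HWY_RESTRICT raw_in,
                 uint16_t* HWY_RESTRICT packed,
                 uint16_t* HWY_RESTRICT raw_out) {
  const ScalableTag<uint16_t> d;
  Pack16<12>().Pack(d, raw_in, packed);
  Pack16<12>().Unpack(d, packed, raw_out);
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();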
2593
2594// NOLINTNEXTLINE(google-readability-namespace-comments)
2595} // namespace HWY_NAMESPACE
2596} // namespace hwy
2597HWY_AFTER_NAMESPACE();
2598
2599#endif // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_