BitMagic-C++
bmsse2.h
#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

For more information please visit: http://bitmagic.io
*/

/*! \file bmsse2.h
    \brief Compute functions for SSE2 SIMD instruction set (internal)
*/

#include <mmintrin.h>
#include <emmintrin.h>

#include "bmdef.h"
#include "bmsse_util.h"
#include "bmutil.h"


#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif

namespace bm
{


/*!
    SSE2 optimized bit-counting function: implements the parallel
    bit-counting algorithm on top of the SSE2 instruction set.

    The scalar equivalent of the per-word step:
<pre>
unsigned CalcBitCount32(unsigned b)
{
    b = (b & 0x55555555) + (b >> 1 & 0x55555555);
    b = (b & 0x33333333) + (b >> 2 & 0x33333333);
    b = (b + (b >> 4)) & 0x0F0F0F0F;
    b = b + (b >> 8);
    b = (b + (b >> 16)) & 0x0000003F;
    return b;
}
</pre>

    @ingroup SSE2
*/
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
        b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
        b    = _mm_and_si128(b, m4);      // (b + (b >> 16)) & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
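
/*
    Usage sketch (illustrative only, not part of the library API surface):
    count bits in a 16-byte aligned array of words. The array length is an
    arbitrary example; BM_ALIGN16/BM_ALIGN16ATTR come from bmdef.h.

        bm::word_t BM_ALIGN16 arr[32] BM_ALIGN16ATTR = {0};
        arr[0]  = 0xFF;        // 8 bits set
        arr[31] = 0x80000001;  // 2 bits set
        bm::id_t cnt = bm::sse2_bit_count((const __m128i*)arr,
                                          (const __m128i*)(arr + 32));
        // cnt == 10
*/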


template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1);

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
        b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
        b    = _mm_and_si128(b, m4);      // (b + (b >> 16)) & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
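
/*
    Usage sketch (illustrative): population count of a logical AND of two
    blocks, computed without materializing the intermediate result. The
    functors sse2_and / sse2_or / sse2_xor / sse2_sub come from
    bmsse_util.h (they are the same ones wired into the VECT_BITCOUNT_*
    macros at the end of this file).

        bm::word_t BM_ALIGN16 blk_a[32] BM_ALIGN16ATTR = {0};
        bm::word_t BM_ALIGN16 blk_b[32] BM_ALIGN16ATTR = {0};
        blk_a[0] = 0x0F; blk_b[0] = 0x03;
        bm::id_t cnt = bm::sse2_bit_count_op((const __m128i*)blk_a,
                                             (const __m128i*)(blk_a + 32),
                                             (const __m128i*)blk_b,
                                             bm::sse2_and);
        // cnt == 2  (0x0F & 0x03 == 0x03)
*/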


inline
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
                                          const __m128i* BMRESTRICT block_end,
                                          unsigned* BMRESTRICT bit_count)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0

    __m128i tmp1, tmp2;

    int count = (int)(block_end - block)*4; // number of 32-bit words in the block

    bm::word_t w, w0, w_prev;
    const int w_shift = sizeof(w) * 8 - 1;
    bool first_word = true;

    // first word
    {
        const bm::word_t* blk = (const bm::word_t*) block;
        w = w0 = blk[0];
        w ^= (w >> 1);
        BM_INCWORD_BITCOUNT(count, w);
        count -= (w_prev = (w0 >> w_shift)); // negative value correction
    }

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;

    do
    {
        // compute bit-count
        // ---------------------------------------------------------------------
        {
            __m128i b = _mm_load_si128(block);

            // w ^ (w >> 1)
            tmp1 = _mm_srli_epi32(b, 1);   // tmp1 = b >> 1
            tmp2 = _mm_xor_si128(b, tmp1); // tmp2 = tmp1 ^ b;
            _mm_store_si128((__m128i*)tcnt, tmp2);

            // parallel bit-count (SSE4.1 could short-circuit zero vectors
            // here via _mm_test_all_zeros())
            {
                // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
                tmp1 = _mm_and_si128(tmp1, m1);   // tmp1 = (b >> 1 & 0x55555555)
                tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
                tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
                tmp1 = _mm_and_si128(tmp1, m2);
                tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b + (b >> 4)) & 0x0F0F0F0F;
                tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
                b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
                b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

                // b = b + (b >> 8);
                tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
                b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

                // b = (b + (b >> 16)) & 0x0000003F;
                tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
                b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
                b    = _mm_and_si128(b, m4);      // (b + (b >> 16)) & 0x0000003F

                mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b
            }
        }
        // ---------------------------------------------------------------------
        {
            // count changes between adjacent bits, word by word
            // (with SSE4.1 the words could be read back via _mm_extract_epi32())
            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;

            if (first_word)
            {
                first_word = false;
            }
            else
            {
                if (0 != (w0 = blk[0]))
                {
                    BM_INCWORD_BITCOUNT(count, tcnt[0]);
                    count -= !(w_prev ^ (w0 & 1));
                    count -= w_prev = (w0 >> w_shift);
                }
                else
                {
                    count -= !w_prev; w_prev ^= w_prev;
                }
            }
            if (0 != (w0 = blk[1]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[1]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if (0 != (w0 = blk[2]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[2]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if (0 != (w0 = blk[3]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[3]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
        }
    } while (++block < block_end);

    _mm_store_si128((__m128i*)tcnt, mcnt);
    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

    return unsigned(count);
}
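
/*
    Usage sketch (illustrative): one pass produces both the population
    count and a count of the block's bit runs, which helps decide whether
    the block would compress better in GAP (run-length) form.

        bm::word_t BM_ALIGN16 arr[32] BM_ALIGN16ATTR = {0};
        arr[0] = 3; // bits 0 and 1 set
        unsigned bc;
        bm::id_t runs = bm::sse2_bit_block_calc_count_change(
                            (const __m128i*)arr,
                            (const __m128i*)(arr + 32), &bc);
        // bc == 2; runs == 2 (a run of 1s followed by a run of 0s)
*/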

#ifdef __GNUG__
// necessary measure to silence a false GCC warning about negative pointer arithmetic
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

/*!
    SSE2 check of one to two (variable length) 128-bit SSE lanes
    for GAP search results (8 elements each)
    \internal
*/
inline
unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
{
    BM_ASSERT(size <= 16);
    BM_ASSERT(size);

    const unsigned unroll_factor = 8;
    if (size < 4) // for a very short vector use a conventional scan
    {
        unsigned j;
        for (j = 0; j < size; ++j)
        {
            if (pbuf[j] >= pos)
                break;
        }
        return j;
    }

    __m128i m1, mz, maskF, maskFL;

    mz = _mm_setzero_si128();
    m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements

    maskF = _mm_cmpeq_epi32(mz, mz);       // set all FF
    maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
    int shiftL = (64 - (unroll_factor - size) * 16);
    maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]

    m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
    m1 = _mm_or_si128(m1, maskFL);     // fill the unused tail with all-1s sentinels

    __m128i mp = _mm_set1_epi16(pos);  // broadcast pos into all elements of a SIMD vector
    __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
    int mi = _mm_movemask_epi8(mge_mask); // collect flag bits
    if (mi)
    {
        int bsr_i = bm::bit_scan_fwd(mi) >> 1;
        return bsr_i; // index of the first element >= pos (the target)
    }
    if (size == 8)
        return size;

    // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
    // GCC gives a false warning for "- unroll_factor" here
    const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
    BM_ASSERT(pbuf2 > pbuf); // assert in place to make sure the GCC warning is indeed false

    m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load the next elements (with a possible overlap)
    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
    mi = _mm_movemask_epi8(mge_mask);
    if (mi)
    {
        int bsr_i = bm::bit_scan_fwd(mi) >> 1;
        return size - (unroll_factor - bsr_i);
    }
    return size;
}
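
/*
    Illustrative call (hypothetical data): locate the first element greater
    than or equal to pos in a short, sorted gap_word_t run.

        bm::gap_word_t arr[8] = { 2, 5, 9, 40, 70, 110, 200, 65535 };
        unsigned idx = bm::sse2_gap_find(arr, (bm::gap_word_t)50, 8u);
        // idx == 4 (arr[4] == 70 is the first element >= 50)
*/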

/**
    Hybrid binary search: starts as binary, then switches to a linear scan.

    \param buf - GAP buffer pointer.
    \param pos - element (bit) position to search for.
    \param is_set - output. GAP value (0 or 1).
    \return GAP index.

    @ingroup SSE2
*/
inline
unsigned sse2_gap_bfind(const unsigned short* BMRESTRICT buf,
                        unsigned pos, unsigned* BMRESTRICT is_set)
{
    unsigned start = 1;
    unsigned end = 1 + ((*buf) >> 3);
    unsigned dsize = end - start;

    if (dsize < 17)
    {
        start = bm::sse2_gap_find(buf + 1, (bm::gap_word_t)pos, dsize);
        *is_set = ((*buf) & 1) ^ (start & 1);
        BM_ASSERT(buf[start + 1] >= pos);
        BM_ASSERT(buf[start] < pos || (start == 0));

        return start + 1;
    }
    unsigned arr_end = end;
    while (start != end)
    {
        unsigned curr = (start + end) >> 1;
        if (buf[curr] < pos)
            start = curr + 1;
        else
            end = curr;

        unsigned size = end - start;
        if (size < 16)
        {
            size += (end != arr_end);
            unsigned idx =
                bm::sse2_gap_find(buf + start, (bm::gap_word_t)pos, size);
            start += idx;

            BM_ASSERT(buf[start] >= pos);
            BM_ASSERT(buf[start - 1] < pos || (start == 1));
            break;
        }
    }

    *is_set = ((*buf) & 1) ^ ((start - 1) & 1);
    return start;
}
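
/*
    Usage sketch (hypothetical GAP block): buf[0] is the descriptor word
    (bit 0 = value of the first run; the upper bits carry the length, as
    decoded above), followed by ascending run-end coordinates.

        // runs: [0..9] = 0, [10..99] = 1, [100..65535] = 0
        unsigned short gap_buf[] = { (3 << 3) | 0, 9, 99, 65535 };
        unsigned is_set;
        unsigned gidx = bm::sse2_gap_bfind(gap_buf, 50, &is_set);
        // is_set == 1 (bit 50 falls in the second run), gidx == 2
*/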

/**
    Hybrid binary search: starts as binary, then switches to a scan.
    @ingroup SSE2
*/
inline
unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
{
    unsigned is_set;
    bm::sse2_gap_bfind(buf, pos, &is_set);
    return is_set;
}
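
/*
    Illustrative: test single bits in the hypothetical GAP block above.

        unsigned v9  = bm::sse2_gap_test(gap_buf, 9);  // 0 (first run of 0s)
        unsigned v50 = bm::sse2_gap_test(gap_buf, 50); // 1 (second run, of 1s)
*/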


#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)first);

#define VECT_AND_BLOCK(dst, src) \
    sse2_and_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) dst, (__m128i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) dst, value)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse2_gap_bfind(buf, pos, is_set)
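
/*
    Note: the VECT_* macros above are the SIMD dispatch points consumed by
    the portable BitMagic core; when this header is compiled in (the SSE2
    optimization build, e.g. -DBMSSE2OPT), generic block operations expand
    to the SSE2 kernels defined here. For example:

        bm::id_t cnt = VECT_BITCOUNT(arr, arr + 32);
        // expands to sse2_bit_count((__m128i*)(arr), (__m128i*)(arr + 32))
*/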


} // namespace


#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


#endif