BitMagic-C++
bmsparsevec_algo.h
Go to the documentation of this file.
1#ifndef BMSPARSEVEC_ALGO__H__INCLUDED__
2#define BMSPARSEVEC_ALGO__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20/*! \file bmsparsevec_algo.h
21 \brief Algorithms for bm::sparse_vector
22*/
23
24#ifndef BM__H__INCLUDED__
25// BitMagic utility headers do not include main "bm.h" declaration
26// #include "bm.h" or "bm64.h" explicitly
27# error missing include (bm.h or bm64.h)
28#endif
29
30#include "bmdef.h"
31#include "bmsparsevec.h"
32#include "bmaggregator.h"
33#include "bmbuffer.h"
34#include "bmalgo.h"
35#include "bmdef.h"
36
37#ifdef _MSC_VER
38# pragma warning( disable: 4146 )
39#endif
40
41
42
43/** \defgroup svalgo Sparse vector algorithms
44 Sparse vector algorithms
45 \ingroup svector
46 */
47
48
49namespace bm
50{
51
52
53/*!
54 \brief Clip dynamic range for signal higher than specified
55
56 \param svect - sparse vector to do clipping
57 \param high_bit - max bit (inclusive) allowed for this signal vector
58
59
60 \ingroup svalgo
61 \sa dynamic_range_clip_low
62*/
63template<typename SV>
64void dynamic_range_clip_high(SV& svect, unsigned high_bit)
65{
66 unsigned sv_plains = svect.plains();
67
68 BM_ASSERT(sv_plains > high_bit && high_bit > 0);
69
70 typename SV::bvector_type bv_acc;
71 unsigned i;
72
73 // combine all the high bits into accumulator vector
74 for (i = high_bit+1; i < sv_plains; ++i)
75 {
76 typename SV::bvector_type* bv_plain = svect.plain(i);
77 if (bv_plain)
78 {
79 bv_acc.bit_or(*bv_plain);
80 svect.free_plain(i);
81 }
82 } // for i
83
84 // set all bits ON for all low vectors, which happen to be clipped
85 for (i = high_bit; true; --i)
86 {
87 typename SV::bvector_type* bv_plain = svect.get_plain(i);
88 bv_plain->bit_or(bv_acc);
89 if (i == 0)
90 break;
91 } // for i
92}
93
94
95/*!
96 \brief Clip dynamic range for signal lower than specified (boost)
97
98 \param svect - sparse vector to do clipping
99 \param low_bit - low bit (inclusive) allowed for this signal vector
100
101 \ingroup svalgo
102 \sa dynamic_range_clip_high
103*/
104template<typename SV>
105void dynamic_range_clip_low(SV& svect, unsigned low_bit)
106{
107 if (low_bit == 0) return; // nothing to do
108 BM_ASSERT(svect.plains() > low_bit);
109
110 unsigned sv_plains = svect.plains();
111 typename SV::bvector_type bv_acc1;
112 unsigned i;
113
114 // combine all the high bits into accumulator vector
115 for (i = low_bit+1; i < sv_plains; ++i)
116 {
117 typename SV::bvector_type* bv_plain = svect.plain(i);
118 if (bv_plain)
119 bv_acc1.bit_or(*bv_plain);
120 } // for i
121
122 // accumulate all vectors below the clipping point
123 typename SV::bvector_type bv_acc2;
124 typename SV::bvector_type* bv_low_plain = svect.get_plain(low_bit);
125
126 for (i = low_bit-1; true; --i)
127 {
128 typename SV::bvector_type* bv_plain = svect.plain(i);
129 if (bv_plain)
130 {
131 bv_acc2.bit_or(*bv_plain);
132 svect.free_plain(i);
133 if (i == 0)
134 break;
135 }
136 } // for i
137
138 // now we want to set 1 in the clipping low plain based on
139 // exclusive or (XOR) between upper and lower parts)
140 // as a result high signal (any bits in the upper plains) gets
141 // slightly lower value (due to clipping) low signal gets amplified
142 // (lower contrast algorithm)
143
144 bv_acc1.bit_xor(bv_acc2);
145 bv_low_plain->bit_or(bv_acc1);
146}
147
148/**
149 Find first mismatch (element which is different) between two sparse vectors
150 (uses linear scan in bit-vector plains)
151
152 Function works with both NULL and NOT NULL vectors
153 NULL means unassigned (uncertainty), so first mismatch NULL is a mismatch
154 even if values in vectors can be formally the same (i.e. 0)
155
156 @param sv1 - vector 1
157 @param sv2 - vector 2
158 @param midx - mismatch index
159 @param null_proc - defines if we want to include (not) NULL
160 vector into comparison (bm::use_null) or not.
161 By default search takes NULL vector into account
162
163 @return true if mismatch found
164
165 @sa sparse_vector_find_mismatch
166
167 \ingroup svalgo
168*/
169template<typename SV>
171 const SV& sv2,
172 typename SV::size_type& midx,
173 bm::null_support null_proc = bm::use_null)
174{
175 typename SV::size_type mismatch = bm::id_max;
176 bool found = false;
177
178 unsigned sv_idx = 0;
179
180 unsigned plains1 = sv1.plains();
181 BM_ASSERT(plains1);
182
183 // for RSC vector do NOT compare NULL plains
184
186 {
187 //--plains1;
188 }
189 else // regular sparse vector - may have NULL plains
190 {
191 if (null_proc == bm::use_null)
192 {
193 typename SV::bvector_type_const_ptr bv_null1 = sv1.get_null_bvector();
194 typename SV::bvector_type_const_ptr bv_null2 = sv2.get_null_bvector();
195 if (bv_null1 && bv_null2) // both (not) NULL vectors present
196 {
197 bool f = bv_null1->find_first_mismatch(*bv_null2, midx, mismatch);
198 if (f && (midx < mismatch)) // better mismatch found
199 {
200 found = f; mismatch = midx;
201 }
202
203 }
204 else // one or both NULL vectors are not present
205 {
206 if (bv_null1)
207 {
208 typename SV::bvector_type bv_tmp; // TODO: get rid of temp bv
209 bv_tmp.resize(sv2.size());
210 bv_tmp.invert(); // turn into true NULL vector
211
212 // find first NULL value (mismatch)
213 bool f = bv_null1->find_first_mismatch(bv_tmp, midx, mismatch);
214 if (f && (midx < mismatch)) // better mismatch found
215 {
216 found = f; mismatch = midx;
217 }
218 }
219 if (bv_null2)
220 {
221 typename SV::bvector_type bv_tmp; // TODO: get rid of temp bv
222 bv_tmp.resize(sv1.size());
223 bv_tmp.invert();
224
225 bool f = bv_null2->find_first_mismatch(bv_tmp, midx, mismatch);
226 if (f && (midx < mismatch)) // better mismatch found
227 {
228 found = f; mismatch = midx;
229 }
230 }
231 }
232 } // null_proc
233 }
234
235 for (unsigned i = 0; mismatch & (i < plains1); ++i)
236 {
237 typename SV::bvector_type_const_ptr bv1 = sv1.get_plain(i);
238 typename SV::bvector_type_const_ptr bv2 = sv2.get_plain(i);
239 if (!bv1)
240 {
241 if (!bv2)
242 continue;
243 bool f = bv2->find(midx);
244 if (f && (midx < mismatch))
245 {
246 found = f; sv_idx = 2; mismatch = midx;
247 }
248 continue;
249 }
250 if (!bv2)
251 {
252 BM_ASSERT(bv1);
253 bool f = bv1->find(midx);
254 if (f && (midx < mismatch))
255 {
256 found = f; sv_idx = 1; mismatch = midx;
257 }
258 continue;
259 }
260 // both plains are not NULL
261 //
262 bool f = bv1->find_first_mismatch(*bv2, midx, mismatch);
263 if (f && (midx < mismatch)) // better mismatch found
264 {
265 found = f; mismatch = midx;
266 // which vector has '1' at mismatch position?
268 sv_idx = (bv1->test(mismatch)) ? 1 : 2;
269 }
270
271 } // for i
272
273 // RSC address translation here
274 //
276 {
277 if (found) // RSC address translation
278 {
279 BM_ASSERT(sv1.is_compressed());
280 BM_ASSERT(sv2.is_compressed());
281
282 switch (sv_idx)
283 {
284 case 1:
285 found = sv1.find_rank(midx + 1, mismatch);
286 break;
287 case 2:
288 found = sv2.find_rank(midx + 1, mismatch);
289 break;
290 default: // unknown, try both
291 BM_ASSERT(0);
292 }
293 BM_ASSERT(found);
294 }
295 else // search for mismatch in the NOT NULL vectors
296 {
297 if (null_proc == bm::use_null)
298 {
299 // no need for address translation in this case
300 typename SV::bvector_type_const_ptr bv_null1 = sv1.get_null_bvector();
301 typename SV::bvector_type_const_ptr bv_null2 = sv2.get_null_bvector();
302 found = bv_null1->find_first_mismatch(*bv_null2, mismatch);
303 }
304 }
305 }
306
307 midx = mismatch; // minimal mismatch
308 return found;
309}
310
311/**
312 Find mismatch vector, indicating positions of mismatch between two sparse vectors
313 (uses linear scan in bit-vector plains)
314
315 Function works with both NULL and NOT NULL vectors
316
317 @param bv - [out] - bit-ector with mismatch positions indicated as 1s
318 @param sv1 - vector 1
319 @param sv2 - vector 2
320 @param null_proc - rules of processing for (not) NULL plain
321 bm::no_null - NULLs from both vectors are treated as uncertainty
322 and NOT included into final result
323 bm::use_null - difference in NULLs accounted into the result
324
325 @sa sparse_vector_find_first_mismatch
326
327 \ingroup svalgo
328*/
329template<typename SV1, typename SV2>
330void sparse_vector_find_mismatch(typename SV1::bvector_type& bv,
331 const SV1& sv1,
332 const SV2& sv2,
333 bm::null_support null_proc)
334{
335 typedef typename SV1::bvector_type bvector_type;
336 typedef typename bvector_type::allocator_type allocator_type;
337 typedef typename allocator_type::allocator_pool_type allocator_pool_type;
338
339 allocator_pool_type pool; // local pool for blocks
340 typename bvector_type::mem_pool_guard mp_guard_bv;
341 mp_guard_bv.assign_if_not_set(pool, bv);
342
343
345 {
346 BM_ASSERT(0); // TODO: fixme
347 }
349 {
350 BM_ASSERT(0); // TODO: fixme
351 }
352
353 bv.clear();
354
355 unsigned plains = sv1.plains();
356 if (plains < sv2.plains())
357 plains = sv2.plains();
358
359 for (unsigned i = 0; i < plains; ++i)
360 {
361 typename SV1::bvector_type_const_ptr bv1 = sv1.get_plain(i);
362 typename SV2::bvector_type_const_ptr bv2 = sv2.get_plain(i);
363
364 if (!bv1)
365 {
366 if (!bv2)
367 continue;
368 bv |= *bv2;
369 continue;
370 }
371 if (!bv2)
372 {
373 BM_ASSERT(bv1);
374 bv |= *bv1;
375 continue;
376 }
377
378 // both plains are not NULL, compute XOR diff
379 //
380 bvector_type bv_xor;
381 typename bvector_type::mem_pool_guard mp_guard;
382 mp_guard.assign_if_not_set(pool, bv_xor);
383
384 bv_xor.bit_xor(*bv1, *bv2, SV1::bvector_type::opt_none);
385 bv |= bv_xor;
386
387 } // for i
388
389 // size mismatch check
390 {
391 typename SV1::size_type sz1 = sv1.size();
392 typename SV2::size_type sz2 = sv2.size();
393 if (sz1 != sz2)
394 {
395 if (sz1 < sz2)
396 {
397 }
398 else
399 {
400 bm::xor_swap(sz1, sz2);
401 }
402 bv.set_range(sz1, sz2-1);
403 }
404 }
405
406 // NULL processings
407 //
408 typename SV1::bvector_type_const_ptr bv_null1 = sv1.get_null_bvector();
409 typename SV2::bvector_type_const_ptr bv_null2 = sv2.get_null_bvector();
410
411 switch (null_proc)
412 {
413 case bm::no_null:
414 // NULL correction to exclude all NULL (unknown) values from the result set
415 // (AND with NOT NULL vector)
416 if (bv_null1 && bv_null2)
417 {
418 bvector_type bv_or;
419 typename bvector_type::mem_pool_guard mp_guard;
420 mp_guard.assign_if_not_set(pool, bv_or);
421
422 bv_or.bit_or(*bv_null1, *bv_null2, bvector_type::opt_none);
423 bv &= bv_or;
424 }
425 else
426 {
427 if (bv_null1)
428 bv &= *bv_null1;
429 if (bv_null2)
430 bv &= *bv_null2;
431 }
432 break;
433 case bm::use_null:
434 if (bv_null1 && bv_null2)
435 {
436 bvector_type bv_xor;
437 typename bvector_type::mem_pool_guard mp_guard;
438 mp_guard.assign_if_not_set(pool, bv_xor);
439
440 bv_xor.bit_xor(*bv_null1, *bv_null2, bvector_type::opt_none);
441 bv |= bv_xor;
442 }
443 else
444 {
445 bvector_type bv_null;
446 typename bvector_type::mem_pool_guard mp_guard;
447 mp_guard.assign_if_not_set(pool, bv_null);
448 if (bv_null1)
449 {
450 bv_null = *bv_null1;
451 bv_null.resize(sv1.size());
452 }
453 if (bv_null2)
454 {
455 bv_null = *bv_null2;
456 bv_null.resize(sv2.size());
457 }
458 bv_null.invert();
459 bv |= bv_null;
460 }
461 break;
462 default:
463 BM_ASSERT(0);
464 }
465}
466
467
468/**
469 \brief algorithms for sparse_vector scan/search
470
471 Scanner uses properties of bit-vector plains to answer questions
472 like "find all sparse vector elements equivalent to XYZ".
473
474 Class uses fast algorithms based on properties of bit-plains.
475 This is NOT a brute force, direct scan.
476
477 @ingroup svalgo
478 @ingroup setalgo
479*/
480template<typename SV>
482{
483public:
484 typedef typename SV::bvector_type bvector_type;
487 typedef typename SV::value_type value_type;
488 typedef typename SV::size_type size_type;
489
491 typedef typename allocator_type::allocator_pool_type allocator_pool_type;
492
493public:
495
496 /**
497 \brief bind sparse vector for all searches
498
499 \param sv - input sparse vector to bind for searches
500 \param sorted - source index is sorted, build index for binary search
501 */
502 void bind(const SV& sv, bool sorted);
503
504 /**
505 \brief reset sparse vector binding
506 */
508
509 /**
510 \brief find all sparse vector elements EQ to search value
511
512 Find all sparse vector elements equivalent to specified value
513
514 \param sv - input sparse vector
515 \param value - value to search for
516 \param bv_out - output bit-vector (search result masks 1 elements)
517 */
518 void find_eq(const SV& sv,
519 typename SV::value_type value,
520 typename SV::bvector_type& bv_out);
521
522 /**
523 \brief find first sparse vector element
524
525 Find all sparse vector elements equivalent to specified value.
526 Works well if sperse vector represents unordered set
527
528 \param sv - input sparse vector
529 \param value - value to search for
530 \param pos - output found sparse vector element index
531
532 \return true if found
533 */
534 bool find_eq(const SV& sv,
535 typename SV::value_type value,
536 typename SV::size_type& pos);
537
538 /**
539 \brief find first sparse vector element (string)
540 */
541 bool find_eq_str(const SV& sv,
542 const typename SV::value_type* str,
543 typename SV::size_type& pos);
544
545 /**
546 \brief binary find first sparse vector element (string)
547 Sparse vector must be attached (bind())
548 @sa bind
549 */
550 bool find_eq_str(const typename SV::value_type* str,
551 typename SV::size_type& pos);
552
553 /**
554 \brief binary find first sparse vector element (string)
555 Sparse vector must be sorted.
556 */
557 bool bfind_eq_str(const SV& sv,
558 const typename SV::value_type* str,
559 typename SV::size_type& pos);
560
561 /**
562 \brief lower bound search for an array position
563
564 Method assumes the sparse array is sorted
565
566 \param sv - input sparse vector
567 \param val - value to search for
568 \param pos - output sparse vector element index
569
570 \return true if value found
571 */
572 bool lower_bound(const SV& sv,
573 const typename SV::value_type val,
574 typename SV::size_type& pos);
575 /**
576 \brief lower bound search for an array position
577
578 Method assumes the sparse array is sorted
579
580 \param sv - input sparse vector
581 \param str - value to search for
582 \param pos - output sparse vector element index
583
584 \return true if value found
585 */
586 bool lower_bound_str(const SV& sv,
587 const typename SV::value_type* str,
588 typename SV::size_type& pos);
589
590 /**
591 \brief binary find first sparse vector element (string)
592 Sparse vector must be sorted and attached
593 @sa bind
594 */
595 bool bfind_eq_str(const typename SV::value_type* str,
596 typename SV::size_type& pos);
597
598 /**
599 \brief find all sparse vector elements EQ to 0
600 \param sv - input sparse vector
601 \param bv_out - output bit-vector (search result masks 1 elements)
602 */
603 void find_zero(const SV& sv,
604 typename SV::bvector_type& bv_out);
605
606 /*!
607 \brief Find non-zero elements
608 Output vector is computed as a logical OR (join) of all plains
609
610 \param sv - input sparse vector
611 \param bv_out - output bit-bector of non-zero elements
612 */
613 void find_nonzero(const SV& sv, typename SV::bvector_type& bv_out);
614
615 /**
616 \brief invert search result ("EQ" to "not EQ")
617
618 \param sv - input sparse vector
619 \param bv_out - output bit-bector of non-zero elements
620 */
621 void invert(const SV& sv, typename SV::bvector_type& bv_out);
622
623 /**
624 \brief find all values A IN (C, D, E, F)
625 \param sv - input sparse vector
626 \param start - start iterator (set to search)
627 \param end - end iterator (set to search)
628 \param bv_out - output bit-bector of non-zero elements
629 */
630 template<typename It>
631 void find_eq(const SV& sv,
632 It start,
633 It end,
634 typename SV::bvector_type& bv_out)
635 {
636 typename bvector_type::mem_pool_guard mp_guard;
637 mp_guard.assign_if_not_set(pool_, bv_out); // set algorithm-local memory pool to avoid heap contention
638
639 bvector_type bv1;
640 typename bvector_type::mem_pool_guard mp_guard1(pool_, bv1);
641 bool any_zero = false;
642 for (; start < end; ++start)
643 {
644 value_type v = *start;
645 any_zero |= (v == 0);
646 bool found = find_eq_with_nulls(sv, v, bv1);
647 if (found)
648 bv_out.bit_or(bv1);
649 else
650 {
651 BM_ASSERT(!bv1.any());
652 }
653 } // for
654 if (any_zero)
655 correct_nulls(sv, bv_out);
656 }
657
658 /// For testing purposes only
659 ///
660 /// @internal
661 void find_eq_with_nulls_horizontal(const SV& sv,
662 typename SV::value_type value,
663 typename SV::bvector_type& bv_out);
664
665 /** Exclude possible NULL values from the result vector
666 \param sv - input sparse vector
667 \param bv_out - output bit-bector of non-zero elements
668 */
669 void correct_nulls(const SV& sv, typename SV::bvector_type& bv_out);
670
671protected:
672
673 /// set search boundaries (hint for the aggregator)
674 void set_search_range(size_type from, size_type to);
675
676 /// reset (disable) search range
677 void reset_search_range();
678
679 /// find value (may include NULL indexes)
680 bool find_eq_with_nulls(const SV& sv,
681 typename SV::value_type value,
682 typename SV::bvector_type& bv_out,
683 typename SV::size_type search_limit = 0);
684
685 /// find first value (may include NULL indexes)
686 bool find_first_eq(const SV& sv,
687 typename SV::value_type value,
688 size_type& idx);
689
690 /// find first string value (may include NULL indexes)
691 bool find_first_eq(const SV& sv,
692 const typename SV::value_type* str,
693 size_type& idx,
694 bool remaped);
695
696
697 /// Prepare aggregator for AND-SUB (EQ) search
698 bool prepare_and_sub_aggregator(const SV& sv,
699 typename SV::value_type value);
700
701 /// Prepare aggregator for AND-SUB (EQ) search (string)
702 bool prepare_and_sub_aggregator(const SV& sv,
703 const typename SV::value_type* str,
704 unsigned octet_start);
705
706 /// Rank-Select decompression for RSC vectors
707 void decompress(const SV& sv, typename SV::bvector_type& bv_out);
708
709 /// compare sv[idx] with input str
710 int compare_str(const SV& sv, size_type idx, const value_type* str);
711
712 /// compare sv[idx] with input value
713 int compare(const SV& sv, size_type idx, const value_type val) BMNOEXCEPT;
714
715protected:
717 void operator=(const sparse_vector_scanner&) = delete;
718
719protected:
720
722 {
723 max_columns = SV::max_vector_size
724 };
725
731
732 typedef bm::dynamic_heap_matrix<value_type, allocator_type> heap_matrix_type;
733 typedef bm::heap_matrix<typename SV::value_type,
735 SV::sv_octet_plains,
737
738
739private:
741 bvector_type bv_tmp_;
744
745 size_type mask_from_;
746 size_type mask_to_;
747 bool mask_set_;
748
749 const SV* bound_sv_;
750 heap_matrix_type block0_elements_cache_; ///< cache for elements[0] of each block
751 heap_matrix_type block3_elements_cache_; ///< cache for elements[16384x] of each block
752 size_type effective_str_max_;
753
754 value_type remap_value_vect_[SV::max_vector_size];
755 /// masks of allocated bit-plains (1 - means there is a bit-plain)
756 bm::id64_t vector_plain_masks_[SV::max_vector_size];
757 matrix_search_buf_type hmatr_; ///< heap matrix for string search linear stage
758};
759
760
761/*!
762 \brief Integer set to set transformation (functional image in groups theory)
763 https://en.wikipedia.org/wiki/Image_(mathematics)
764
765 Input sets gets translated through the function, which is defined as
766 "one to one (or NULL)" binary relation object (sparse_vector).
767 Also works for M:1 relationships.
768
769 \ingroup svalgo
770 \ingroup setalgo
771*/
772template<typename SV>
774{
775public:
776 typedef typename SV::bvector_type bvector_type;
777 typedef typename SV::value_type value_type;
778 typedef typename SV::size_type size_type;
780public:
781
784
785
786 /** Get read access to zero-elements vector
787 Zero vector gets populated after attach_sv() is called
788 or as a side-effect of remap() with immediate sv param
789 */
790 const bvector_type& get_bv_zero() const { return bv_zero_; }
791
792 /** Perform remapping (Image function) (based on attached translation table)
793
794 \param bv_in - input set, defined as a bit-vector
795 \param bv_out - output set as a bit-vector
796
797 @sa attach_sv
798 */
799 void remap(const bvector_type& bv_in,
800 bvector_type& bv_out);
801
802 /** Perform remapping (Image function)
803
804 \param bv_in - input set, defined as a bit-vector
805 \param sv_brel - binary relation (translation table) sparse vector
806 \param bv_out - output set as a bit-vector
807 */
808 void remap(const bvector_type& bv_in,
809 const SV& sv_brel,
810 bvector_type& bv_out);
811
812 /** Remap single set element
813
814 \param id_from - input value
815 \param sv_brel - translation table sparse vector
816 \param id_to - out value
817
818 \return - true if value was found and remapped
819 */
820 bool remap(size_type id_from, const SV& sv_brel, size_type& id_to);
821
822
823 /** Run remap transformation
824
825 \param bv_in - input set, defined as a bit-vector
826 \param sv_brel - translation table sparse vector
827 \param bv_out - output set as a bit-vector
828
829 @sa remap
830 */
831 void run(const bvector_type& bv_in,
832 const SV& sv_brel,
833 bvector_type& bv_out)
834 {
835 remap(bv_in, sv_brel, bv_out);
836 }
837
838 /** Attach a translation table vector for remapping (Image function)
839
840 \param sv_brel - binary relation sparse vector pointer
841 (pass NULL to detach)
842 \param compute_stats - flag to perform computation of some statistics
843 later used in remapping. This only make sense
844 for series of remappings on the same translation
845 vector.
846 @sa remap
847 */
848 void attach_sv(const SV* sv_brel, bool compute_stats = false);
849
850
851protected:
852 void one_pass_run(const bvector_type& bv_in,
853 const SV& sv_brel,
854 bvector_type& bv_out);
855
856 /// @internal
857 template<unsigned BSIZE>
863
865 {
866 sv_g_size = 1024 * 8
867 };
869
870
871protected:
873 void operator=(const set2set_11_transform&) = delete;
874
875protected:
876 const SV* sv_ptr_; ///< current translation table vector
877 gather_buffer_type* gb_; ///< intermediate buffers
878 bvector_type bv_product_;///< temp vector
879
880 bool have_stats_; ///< flag of statistics presense
881 bvector_type bv_zero_; ///< bit-vector for zero elements
882
884};
885
886
887
888//----------------------------------------------------------------------------
889//
890//----------------------------------------------------------------------------
891
892template<typename SV>
894: sv_ptr_(0), gb_(0), have_stats_(false)
895{
896 gb_ = (gather_buffer_type*)::malloc(sizeof(gather_buffer_type));
897 if (!gb_)
898 {
899 SV::throw_bad_alloc();
900 }
901}
902
903//----------------------------------------------------------------------------
904
905template<typename SV>
907{
908 if (gb_)
909 ::free(gb_);
910}
911
912
913//----------------------------------------------------------------------------
914
915template<typename SV>
916void set2set_11_transform<SV>::attach_sv(const SV* sv_brel, bool compute_stats)
917{
918 sv_ptr_ = sv_brel;
919 if (!sv_ptr_)
920 {
921 have_stats_ = false;
922 }
923 else
924 {
925 if (sv_brel->empty() || !compute_stats)
926 return; // nothing to do
927 const bvector_type* bv_non_null = sv_brel->get_null_bvector();
928 if (bv_non_null)
929 return; // already have 0 elements vector
930
931
932 typename bvector_type::mem_pool_guard mp_g_z;
933 mp_g_z.assign_if_not_set(pool_, bv_zero_);
934
936 scanner.find_zero(*sv_brel, bv_zero_);
937 have_stats_ = true;
938 }
939}
940
941//----------------------------------------------------------------------------
942
943template<typename SV>
945 const SV& sv_brel,
946 size_type& id_to)
947{
948 if (sv_brel.empty())
949 return false; // nothing to do
950
951 const bvector_type* bv_non_null = sv_brel.get_null_bvector();
952 if (bv_non_null)
953 {
954 if (!bv_non_null->test(id_from))
955 return false;
956 }
957 size_type idx = sv_brel.translate_address(id_from);
958 id_to = sv_brel.get(idx);
959 return true;
960}
961
962//----------------------------------------------------------------------------
963
964template<typename SV>
966 const SV& sv_brel,
967 bvector_type& bv_out)
968{
969 typename bvector_type::mem_pool_guard mp_g_out, mp_g_p, mp_g_z;
970 mp_g_out.assign_if_not_set(pool_, bv_out);
971 mp_g_p.assign_if_not_set(pool_, bv_product_);
972 mp_g_z.assign_if_not_set(pool_, bv_zero_);
973
974 attach_sv(&sv_brel);
975
976 remap(bv_in, bv_out);
977
978 attach_sv(0); // detach translation table
979}
980
981template<typename SV>
983 bvector_type& bv_out)
984{
985 BM_ASSERT(sv_ptr_);
986
987 bv_out.clear();
988
989 if (sv_ptr_ == 0 || sv_ptr_->empty())
990 return; // nothing to do
991
992 bv_out.init(); // just in case to "fast set" later
993
994 typename bvector_type::mem_pool_guard mp_g_out, mp_g_p, mp_g_z;
995 mp_g_out.assign_if_not_set(pool_, bv_out);
996 mp_g_p.assign_if_not_set(pool_, bv_product_);
997 mp_g_z.assign_if_not_set(pool_, bv_zero_);
998
999
1000 const bvector_type* enum_bv;
1001
1002 const bvector_type * bv_non_null = sv_ptr_->get_null_bvector();
1003 if (bv_non_null)
1004 {
1005 // TODO: optimize with 2-way ops
1006 bv_product_ = bv_in;
1007 bv_product_.bit_and(*bv_non_null);
1008 enum_bv = &bv_product_;
1009 }
1010 else // non-NULL vector is not available
1011 {
1012 enum_bv = &bv_in;
1013 // if we have any elements mapping into "0" on the other end
1014 // we map it once (chances are there are many duplicates)
1015 //
1016
1017 if (have_stats_) // pre-attached translation statistics
1018 {
1019 bv_product_ = bv_in;
1020 size_type cnt1 = bv_product_.count();
1021 bv_product_.bit_sub(bv_zero_);
1022 size_type cnt2 = bv_product_.count();
1023
1024 BM_ASSERT(cnt2 <= cnt1);
1025
1026 if (cnt1 != cnt2) // mapping included 0 elements
1027 bv_out.set_bit_no_check(0);
1028
1029 enum_bv = &bv_product_;
1030 }
1031 }
1032
1033
1034
1035 size_type buf_cnt, nb_old, nb;
1036 buf_cnt = nb_old = 0;
1037
1038 typename bvector_type::enumerator en(enum_bv->first());
1039 for (; en.valid(); ++en)
1040 {
1041 typename SV::size_type idx = *en;
1042 idx = sv_ptr_->translate_address(idx);
1043
1044 nb = unsigned(idx >> bm::set_block_shift);
1045 if (nb != nb_old) // new blocks
1046 {
1047 if (buf_cnt)
1048 {
1049 sv_ptr_->gather(&gb_->buffer_[0], &gb_->gather_idx_[0], buf_cnt, BM_SORTED_UNIFORM);
1050 bv_out.set(&gb_->buffer_[0], buf_cnt, BM_SORTED);
1051 buf_cnt = 0;
1052 }
1053 nb_old = nb;
1054 gb_->gather_idx_[buf_cnt++] = idx;
1055 }
1056 else
1057 {
1058 gb_->gather_idx_[buf_cnt++] = idx;
1059 }
1060
1061 if (buf_cnt == sv_g_size)
1062 {
1063 sv_ptr_->gather(&gb_->buffer_[0], &gb_->gather_idx_[0], buf_cnt, BM_SORTED_UNIFORM);
1064 bv_out.set(&gb_->buffer_[0], buf_cnt, bm::BM_SORTED);
1065 buf_cnt = 0;
1066 }
1067 } // for en
1068 if (buf_cnt)
1069 {
1070 sv_ptr_->gather(&gb_->buffer_[0], &gb_->gather_idx_[0], buf_cnt, BM_SORTED_UNIFORM);
1071 bv_out.set(&gb_->buffer_[0], buf_cnt, bm::BM_SORTED);
1072 }
1073}
1074
1075
1076//----------------------------------------------------------------------------
1077
1078template<typename SV>
1080 const SV& sv_brel,
1081 bvector_type& bv_out)
1082{
1083 if (sv_brel.empty())
1084 return; // nothing to do
1085
1086 bv_out.init();
1087
1088 typename SV::const_iterator it = sv_brel.begin();
1089 for (; it.valid(); ++it)
1090 {
1091 typename SV::value_type t_id = *it;
1092 size_type idx = it.pos();
1093 if (bv_in.test(idx))
1094 {
1095 bv_out.set_bit_no_check(t_id);
1096 }
1097 } // for
1098}
1099
1100
1101//----------------------------------------------------------------------------
1102//
1103//----------------------------------------------------------------------------
1104
1105template<typename SV>
1107{
1108 mask_set_ = false;
1109 mask_from_ = mask_to_ = bm::id_max;
1110
1111 bound_sv_ = 0;
1112 effective_str_max_ = 0;
1113}
1114
1115//----------------------------------------------------------------------------
1116
1117template<typename SV>
1118void sparse_vector_scanner<SV>::bind(const SV& sv, bool sorted)
1119{
1120 bound_sv_ = &sv;
1121 if (sorted)
1122 {
1123 size_type sv_sz = sv.size();
1124 BM_ASSERT(sv_sz);
1125 size_type total_nb = sv_sz / bm::gap_max_bits + 1;
1126 effective_str_max_ = sv.effective_vector_max();
1127
1128 block0_elements_cache_.resize(total_nb, effective_str_max_+1);
1129 block0_elements_cache_.set_zero();
1130
1131 block3_elements_cache_.resize(total_nb * 3, effective_str_max_+1);
1132 block3_elements_cache_.set_zero();
1133
1134 // fill in elements cache
1135 for (size_type i = 0; i < sv_sz; i+= bm::gap_max_bits)
1136 {
1137 size_type nb = (i >> bm::set_block_shift);
1138 value_type* s0 = block0_elements_cache_.row(nb);
1139 sv.get(i, s0, size_type(block0_elements_cache_.cols()));
1140
1141 for (size_type k = 0; k < 3; ++k)
1142 {
1143 value_type* s1 = block3_elements_cache_.row(nb * 3 + k);
1144 size_type idx = i + (k+1) * bm::sub_block3_size;
1145 sv.get(idx, s1, size_type(block3_elements_cache_.cols()));
1146 } // for k
1147 } // for i
1148 }
1149 // pre-calculate vector plain masks
1150 //
1151 for (unsigned i = 0; i < SV::max_vector_size; ++i)
1152 {
1153 vector_plain_masks_[i] = sv.get_plains_mask(i);
1154 } // for i
1155
1156}
1157
1158//----------------------------------------------------------------------------
1159
1160template<typename SV>
1162{
1163 bound_sv_ = 0;
1164 effective_str_max_ = 0;
1165}
1166
1167//----------------------------------------------------------------------------
1168
1169template<typename SV>
1171 typename SV::bvector_type& bv_out)
1172{
1173 if (sv.size() == 0)
1174 {
1175 bv_out.clear();
1176 return;
1177 }
1178 find_nonzero(sv, bv_out);
1179 if (sv.is_compressed())
1180 {
1181 bv_out.invert();
1182 bv_out.set_range(sv.effective_size(), bm::id_max - 1, false);
1183 decompress(sv, bv_out);
1184 }
1185 else
1186 {
1187 invert(sv, bv_out);
1188 }
1189 correct_nulls(sv, bv_out);
1190}
1191
1192//----------------------------------------------------------------------------
1193
1194template<typename SV>
1195void sparse_vector_scanner<SV>::invert(const SV& sv, typename SV::bvector_type& bv_out)
1196{
1197 if (sv.size() == 0)
1198 {
1199 bv_out.clear();
1200 return;
1201 }
1202 // TODO: find a better algorithm (NAND?)
1203 bv_out.invert();
1204 const bvector_type* bv_null = sv.get_null_bvector();
1205 if (bv_null) // correct result to only use not NULL elements
1206 bv_out &= *bv_null;
1207 else
1208 {
1209 // TODO: use the shorter range to clear the tail
1210 bv_out.set_range(sv.size(), bm::id_max - 1, false);
1211 }
1212}
1213
1214//----------------------------------------------------------------------------
1215
1216template<typename SV>
1218 typename SV::bvector_type& bv_out)
1219{
1220 const bvector_type* bv_null = sv.get_null_bvector();
1221 if (bv_null) // correct result to only use not NULL elements
1222 bv_out.bit_and(*bv_null);
1223}
1224
1225//----------------------------------------------------------------------------
1226
1227template<typename SV>
1229 typename SV::value_type value,
1230 typename SV::bvector_type& bv_out,
1231 typename SV::size_type search_limit)
1232{
1233 if (sv.empty())
1234 return false; // nothing to do
1235
1236 if (!value)
1237 {
1238 find_zero(sv, bv_out);
1239 return bv_out.any();
1240 }
1241 agg_.reset();
1242
1243 bool found = prepare_and_sub_aggregator(sv, value);
1244 if (!found)
1245 {
1246 bv_out.clear();
1247 return found;
1248 }
1249
1250 bool any = (search_limit == 1);
1251 found = agg_.combine_and_sub(bv_out, any);
1252 agg_.reset();
1253 return found;
1254}
1255
1256//----------------------------------------------------------------------------
1257
1258template<typename SV>
1260 typename SV::value_type value,
1261 size_type& idx)
1262{
1263 if (sv.empty())
1264 return false; // nothing to do
1265
1266 BM_ASSERT(value); // cannot handle zero values
1267 if (!value)
1268 return false;
1269
1270 agg_.reset();
1271 bool found = prepare_and_sub_aggregator(sv, value);
1272 if (!found)
1273 return found;
1274 found = agg_.find_first_and_sub(idx);
1275 agg_.reset();
1276 return found;
1277}
1278
1279
1280//----------------------------------------------------------------------------
1281
1282template<typename SV>
1284 const typename SV::value_type* str,
1285 size_type& idx,
1286 bool remaped)
1287{
1288 if (sv.empty())
1289 return false; // nothing to do
1290 BM_ASSERT(*str);
1291
1292 if (!*str)
1293 return false;
1294
1295 agg_.reset();
1296 unsigned common_prefix_len = 0;
1297
1298 if (mask_set_)
1299 {
1300 agg_.set_range_hint(mask_from_, mask_to_);
1301 common_prefix_len = sv.common_prefix_length(mask_from_, mask_to_);
1302 }
1303
1304 if (remaped)
1305 {
1306 str = remap_value_vect_;
1307 }
1308 else
1309 {
1310 if (sv.is_remap() && str != remap_value_vect_)
1311 {
1312 bool r = sv.remap_tosv(remap_value_vect_, SV::max_vector_size, str);
1313 if (!r)
1314 return r;
1315 str = remap_value_vect_;
1316 }
1317 }
1318
1319 bool found = prepare_and_sub_aggregator(sv, str, common_prefix_len);
1320 if (!found)
1321 return found;
1322
1323 found = agg_.find_first_and_sub(idx);
1324 agg_.reset();
1325 return found;
1326}
1327
1328//----------------------------------------------------------------------------
1329
1330template<typename SV>
1332 const typename SV::value_type* str,
1333 unsigned octet_start)
1334{
1335 unsigned char bits[64];
1336
1337 int len = 0;
1338 for (; str[len] != 0; ++len)
1339 {}
1340 BM_ASSERT(len);
1341
1342 // use reverse order (faster for sorted arrays)
1343 for (int octet_idx = len-1; octet_idx >= 0; --octet_idx)
1344 {
1345 if (unsigned(octet_idx) < octet_start) // common prefix
1346 break;
1347
1348 unsigned value = unsigned(str[octet_idx]) & 0xFF;
1349 BM_ASSERT(value != 0);
1350
1351 bm::id64_t plains_mask;
1352 if (&sv == bound_sv_)
1353 plains_mask = vector_plain_masks_[octet_idx];
1354 else
1355 plains_mask = sv.get_plains_mask(unsigned(octet_idx));
1356
1357 if ((value & plains_mask) != value) // pre-screen for impossible values
1358 return false; // found non-existing plain
1359
1360 // prep the lists for combined AND-SUB aggregator
1361 //
1362 unsigned short bit_count_v = bm::bitscan(value, bits);
1363 for (unsigned i = 0; i < bit_count_v; ++i)
1364 {
1365 unsigned bit_idx = bits[i];
1366 BM_ASSERT(value & (value_type(1) << bit_idx));
1367 unsigned plain_idx = (unsigned(octet_idx) * 8) + bit_idx;
1368 const bvector_type* bv = sv.get_plain(plain_idx);
1369 agg_.add(bv);
1370 } // for i
1371
1372 unsigned vinv = unsigned(value);
1373 if (bm::conditional<sizeof(value_type) == 1>::test())
1374 {
1375 vinv ^= 0xFF;
1376 }
1377 else // 2-byte char
1378 {
1379 BM_ASSERT(sizeof(value_type) == 2);
1380 vinv ^= 0xFFFF;
1381 }
1382 // exclude the octet bits which are all not set (no vectors)
1383 vinv &= unsigned(plains_mask);
1384 for (unsigned octet_plain = (unsigned(octet_idx) * 8); vinv; vinv &= vinv-1)
1385 {
1386 bm::id_t t = vinv & -vinv;
1387 unsigned bit_idx = bm::word_bitcount(t - 1);
1388 unsigned plain_idx = octet_plain + bit_idx;
1389 const bvector_type* bv = sv.get_plain(plain_idx);
1390 BM_ASSERT(bv);
1391 agg_.add(bv, 1); // agg to SUB group
1392 } // for
1393 } // for octet_idx
1394
1395 // add all vectors above string len to the SUB operation group
1396 //
1397 unsigned plain_idx = unsigned(len * 8) + 1;
1398 typename SV::size_type plains;
1399 if (&sv == bound_sv_)
1400 plains = effective_str_max_ * unsigned(sizeof(value_type)) * 8;
1401 else
1402 plains = sv.plains();
1403 for (; plain_idx < plains; ++plain_idx)
1404 {
1405 bvector_type_const_ptr bv = sv.get_plain(plain_idx);
1406 if (bv)
1407 agg_.add(bv, 1); // agg to SUB group
1408 } // for
1409 return true;
1410}
1411
1412//----------------------------------------------------------------------------
1413
1414template<typename SV>
1416 typename SV::value_type value)
1417{
1418 unsigned char bits[sizeof(value) * 8];
1419 unsigned short bit_count_v = bm::bitscan(value, bits);
1420 BM_ASSERT(bit_count_v);
1421 const value_type mask1 = 1;
1422
1423 // prep the lists for combined AND-SUB aggregator
1424 // (backward order has better chance for bit reduction on AND)
1425 //
1426 for (unsigned i = bit_count_v; i > 0; --i)
1427 {
1428 unsigned bit_idx = bits[i-1];
1429 BM_ASSERT(value & (mask1 << bit_idx));
1430 const bvector_type* bv = sv.get_plain(bit_idx);
1431 if (bv)
1432 agg_.add(bv);
1433 else
1434 return false;
1435 }
1436
1437 unsigned sv_plains = sv.effective_plains();
1438 for (unsigned i = 0; i < sv_plains; ++i)
1439 {
1440 bvector_type_const_ptr bv = sv.get_plain(i);
1441 value_type mask = mask1 << i;
1442 if (bv && !(value & mask))
1443 agg_.add(bv, 1); // agg to SUB group
1444 } // for i
1445 return true;
1446}
1447
1448
1449//----------------------------------------------------------------------------
1450
1451template<typename SV>
1453 typename SV::value_type value,
1454 typename SV::bvector_type& bv_out)
1455{
1456 if (sv.empty())
1457 return; // nothing to do
1458
1459 if (!value)
1460 {
1461 find_zero(sv, bv_out);
1462 return;
1463 }
1464
1465 unsigned char bits[sizeof(value) * 8];
1466 unsigned short bit_count_v = bm::bitscan(value, bits);
1467 BM_ASSERT(bit_count_v);
1468
1469 // aggregate AND all matching vectors
1470 {
1471 const bvector_type* bv_plain = sv.get_plain(bits[--bit_count_v]);
1472 if (bv_plain)
1473 bv_out = *bv_plain;
1474 else // plain not found
1475 {
1476 bv_out.clear();
1477 return;
1478 }
1479 }
1480 for (unsigned i = 0; i < bit_count_v; ++i)
1481 {
1482 const bvector_type* bv_plain = sv.get_plain(bits[i]);
1483 if (bv_plain)
1484 bv_out &= *bv_plain;
1485 else // mandatory plain not found - empty result!
1486 {
1487 bv_out.clear();
1488 return;
1489 }
1490 } // for i
1491
1492 // SUB all other plains
1493 unsigned sv_plains = sv.effective_plains();
1494 for (unsigned i = 0; (i < sv_plains) && value; ++i)
1495 {
1496 const bvector_type* bv_plain = sv.get_plain(i);
1497 if (bv_plain && !(value & (value_type(1) << i)))
1498 bv_out -= *bv_plain;
1499 }
1500}
1501
1502//----------------------------------------------------------------------------
1503
1504template<typename SV>
1505bool sparse_vector_scanner<SV>::find_eq_str(const typename SV::value_type* str,
1506 typename SV::size_type& pos)
1507{
1508 BM_ASSERT(bound_sv_);
1509 return this->find_eq_str(*bound_sv_, str, pos);
1510}
1511
1512//----------------------------------------------------------------------------
1513
1514template<typename SV>
1516 const typename SV::value_type* str,
1517 typename SV::size_type& pos)
1518{
1519 bool found = false;
1520 if (sv.empty())
1521 return found;
1522 if (*str)
1523 {
1524 bool remaped = false;
1525 if (bm::conditional<SV::is_remap_support::value>::test()) // test remapping trait
1526 {
1527 if (sv.is_remap() && str != remap_value_vect_)
1528 {
1529 remaped = sv.remap_tosv(remap_value_vect_, SV::max_vector_size, str);
1530 if (!remaped)
1531 return remaped;
1532 str = remap_value_vect_;
1533 }
1534 }
1535
1536 size_type found_pos;
1537 found = find_first_eq(sv, str, found_pos, remaped);
1538 if (found)
1539 {
1540 pos = found_pos;
1541 if (bm::conditional<SV::is_rsc_support::value>::test()) // test rank/select trait
1542 {
1543 if (sv.is_compressed()) // if compressed vector - need rank translation
1544 found = sv.find_rank(found_pos + 1, pos);
1545 }
1546 }
1547 }
1548 else // search for zero value
1549 {
1550 // TODO: implement optimized version which would work witout temp vector
1551 typename SV::bvector_type bv_zero;
1552 find_zero(sv, bv_zero);
1553 found = bv_zero.find(pos);
1554 }
1555 return found;
1556}
1557
1558//----------------------------------------------------------------------------
1559
1560template<typename SV>
1562 const typename SV::value_type* str,
1563 typename SV::size_type& pos)
1564{
1565 bool found = false;
1566 if (sv.empty())
1567 return found;
1568
1569 if (*str)
1570 {
1571 bool remaped = false;
1572 // test search pre-condition based on remap tables
1574 {
1575 if (sv.is_remap() && str != remap_value_vect_)
1576 {
1577 remaped = sv.remap_tosv(
1578 remap_value_vect_, SV::max_vector_size, str);
1579 if (!remaped)
1580 return remaped;
1581 }
1582 }
1583
1584 reset_search_range();
1585
1586 // narrow down the search
1587 const unsigned min_distance_cutoff = bm::gap_max_bits + bm::gap_max_bits / 2;
1588 size_type l, r, dist;
1589 l = 0; r = sv.size()-1;
1590 size_type found_pos;
1591
1592 // binary search to narrow down the search window
1593 while (l <= r)
1594 {
1595 dist = r - l;
1596 if (dist < min_distance_cutoff)
1597 {
1598 // we are in an narrow <2 blocks window, but still may be in two
1599 // different neighboring blocks, lets try to narrow
1600 // to exactly one block
1601
1602 size_type nb_l = (l >> bm::set_block_shift);
1603 size_type nb_r = (r >> bm::set_block_shift);
1604 if (nb_l != nb_r)
1605 {
1606 size_type mid = nb_r * bm::gap_max_bits;
1607 if (mid < r)
1608 {
1609 int cmp = this->compare_str(sv, mid, str);
1610 if (cmp < 0) // mid < str
1611 l = mid;
1612 else
1613 r = mid-(cmp!=0); // branchless if (cmp==0) r=mid;
1614 BM_ASSERT(l < r);
1615 }
1616 nb_l = unsigned(l >> bm::set_block_shift);
1617 nb_r = unsigned(r >> bm::set_block_shift);
1618 }
1619
1620 if (nb_l == nb_r)
1621 {
1622 size_type max_nb = sv.size() >> bm::set_block_shift;
1623 if (nb_l != max_nb)
1624 {
1625 // linear in-place fixed depth scan to identify the sub-range
1627 int cmp = this->compare_str(sv, mid, str);
1628 if (cmp < 0)
1629 {
1630 l = mid;
1631 mid = nb_r * bm::gap_max_bits + bm::sub_block3_size * 2;
1632 cmp = this->compare_str(sv, mid, str);
1633 if (cmp < 0)
1634 {
1635 l = mid;
1636 mid = nb_r * bm::gap_max_bits + bm::sub_block3_size * 3;
1637 cmp = this->compare_str(sv, mid, str);
1638 if (cmp < 0)
1639 l = mid;
1640 else
1641 r = mid;
1642 }
1643 else
1644 {
1645 r = mid;
1646 }
1647 }
1648 else
1649 {
1650 r = mid;
1651 }
1652 }
1653 }
1654
1655 set_search_range(l, r);
1656 break;
1657 }
1658
1659 typename SV::size_type mid = dist/2+l;
1660 size_type nb = (mid >> bm::set_block_shift);
1661 mid = nb * bm::gap_max_bits;
1662 if (mid <= l)
1663 {
1664 if (nb == 0 && r > bm::gap_max_bits)
1665 mid = bm::gap_max_bits;
1666 else
1667 mid = dist / 2 + l;
1668 }
1669 BM_ASSERT(mid > l);
1670
1671 int cmp = this->compare_str(sv, mid, str);
1672 if (cmp == 0)
1673 {
1674 found_pos = mid;
1675 found = true;
1676 set_search_range(l, mid);
1677 break;
1678 }
1679 if (cmp < 0)
1680 l = mid+1;
1681 else
1682 r = mid-1;
1683 } // while
1684
1685 // use linear search (range is set)
1686 found = find_first_eq(sv, str, found_pos, remaped);
1687 if (found)
1688 {
1689 pos = found_pos;
1690 if (bm::conditional<SV::is_rsc_support::value>::test()) // test rank/select trait
1691 {
1692 if (sv.is_compressed()) // if compressed vector - need rank translation
1693 found = sv.find_rank(found_pos + 1, pos);
1694 }
1695 }
1696 reset_search_range();
1697 }
1698 else // search for zero value
1699 {
1700 // TODO: implement optimized version which would work without temp vector
1701 typename SV::bvector_type bv_zero;
1702 find_zero(sv, bv_zero);
1703 found = bv_zero.find(pos);
1704 }
1705 return found;
1706}
1707
1708//----------------------------------------------------------------------------
1709
1710template<typename SV>
1711bool sparse_vector_scanner<SV>::bfind_eq_str(const typename SV::value_type* str,
1712 typename SV::size_type& pos)
1713{
1714 BM_ASSERT(bound_sv_);
1715 return bfind_eq_str(*bound_sv_, str, pos);
1716}
1717
1718//----------------------------------------------------------------------------
1719
1720template<typename SV>
1722 const typename SV::value_type val,
1723 typename SV::size_type& pos)
1724{
1725 int cmp;
1726 size_type l = 0, r = sv.size();
1727 if (l == r) // empty vector
1728 {
1729 pos = 0;
1730 return false;
1731 }
1732 --r;
1733
1734 // check initial boundary conditions if insert point is at tail/head
1735 cmp = this->compare(sv, l, val); // left (0) boundary check
1736 if (cmp > 0) // vect[x] > str
1737 {
1738 pos = 0;
1739 return false;
1740 }
1741 if (cmp == 0)
1742 {
1743 pos = 0;
1744 return true;
1745 }
1746 cmp = this->compare(sv, r, val); // right(size-1) boundary check
1747 if (cmp == 0)
1748 {
1749 pos = r;
1750 // back-scan to rewind all duplicates
1751 // TODO: adapt one-sided binary search to traverse large platos
1752 for (; r >= 0; --r)
1753 {
1754 cmp = this->compare(sv, r, val);
1755 if (cmp != 0)
1756 return true;
1757 pos = r;
1758 } // for i
1759 return true;
1760 }
1761 if (cmp < 0) // vect[x] < str
1762 {
1763 pos = r+1;
1764 return false;
1765 }
1766
1767 size_type dist = r - l;
1768 if (dist < linear_cutoff1)
1769 {
1770 for (; l <= r; ++l)
1771 {
1772 cmp = this->compare(sv, l, val);
1773 if (cmp == 0)
1774 {
1775 pos = l;
1776 return true;
1777 }
1778 if (cmp > 0)
1779 {
1780 pos = l;
1781 return false;
1782 }
1783 } // for
1784 }
1785
1786 while (l <= r)
1787 {
1788 size_type mid = (r-l)/2+l;
1789 cmp = this->compare(sv, mid, val);
1790 if (cmp == 0)
1791 {
1792 pos = mid;
1793 // back-scan to rewind all duplicates
1794 for (size_type i = mid-1; i >= 0; --i)
1795 {
1796 cmp = this->compare(sv, i, val);
1797 if (cmp != 0)
1798 return true;
1799 pos = i;
1800 } // for i
1801 pos = 0;
1802 return true;
1803 }
1804 if (cmp < 0)
1805 l = mid+1;
1806 else
1807 r = mid-1;
1808
1809 dist = r - l;
1810 if (dist < linear_cutoff2) // do linear scan here
1811 {
1812 typename SV::const_iterator it(&sv, l);
1813 for (; it.valid(); ++it, ++l)
1814 {
1815 typename SV::value_type sv_value = it.value();
1816 if (sv_value == val)
1817 {
1818 pos = l;
1819 return true;
1820 }
1821 if (sv_value > val) // vect[x] > val
1822 {
1823 pos = l;
1824 return false;
1825 }
1826 } // for it
1827 BM_ASSERT(0);
1828 pos = l;
1829 return false;
1830 }
1831 } // while
1832
1833 BM_ASSERT(0);
1834 return false;
1835}
1836
1837
1838//----------------------------------------------------------------------------
1839
1840template<typename SV>
1842 const SV& sv,
1843 const typename SV::value_type* str,
1844 typename SV::size_type& pos)
1845{
1846 int cmp;
1847 size_type l = 0, r = sv.size();
1848
1849 if (l == r) // empty vector
1850 {
1851 pos = 0;
1852 return false;
1853 }
1854 --r;
1855
1856 // check initial boundary conditions if insert point is at tail/head
1857 cmp = this->compare_str(sv, l, str); // left (0) boundary check
1858 if (cmp > 0) // vect[x] > str
1859 {
1860 pos = 0;
1861 return false;
1862 }
1863 if (cmp == 0)
1864 {
1865 pos = 0;
1866 return true;
1867 }
1868 cmp = this->compare_str(sv, r, str); // right(size-1) boundary check
1869 if (cmp == 0)
1870 {
1871 pos = r;
1872 // back-scan to rewind all duplicates
1873 // TODO: adapt one-sided binary search to traverse large platos
1874 for (; r >= 0; --r)
1875 {
1876 cmp = this->compare_str(sv, r, str);
1877 if (cmp != 0)
1878 return true;
1879 pos = r;
1880 } // for i
1881 return true;
1882 }
1883 if (cmp < 0) // vect[x] < str
1884 {
1885 pos = r+1;
1886 return false;
1887 }
1888
1889 size_type dist = r - l;
1890 if (dist < linear_cutoff1)
1891 {
1892 for (; l <= r; ++l)
1893 {
1894 cmp = this->compare_str(sv, l, str);
1895 if (cmp == 0)
1896 {
1897 pos = l;
1898 return true;
1899 }
1900 if (cmp > 0)
1901 {
1902 pos = l;
1903 return false;
1904 }
1905 } // for
1906 }
1907 while (l <= r)
1908 {
1909 size_type mid = (r-l)/2+l;
1910 cmp = this->compare_str(sv, mid, str);
1911 if (cmp == 0)
1912 {
1913 pos = mid;
1914 // back-scan to rewind all duplicates
1915 for (size_type i = mid-1; i >= 0; --i)
1916 {
1917 cmp = this->compare_str(sv, i, str);
1918 if (cmp != 0)
1919 return true;
1920 pos = i;
1921 } // for i
1922 pos = 0;
1923 return true;
1924 }
1925 if (cmp < 0)
1926 l = mid+1;
1927 else
1928 r = mid-1;
1929
1930 dist = r - l;
1931 if (dist < linear_cutoff2) // do linear scan here
1932 {
1933 hmatr_.init();
1934
1935 dist = sv.decode(hmatr_, l, dist+1);
1936 for (unsigned i = 0; i < dist; ++i, ++l)
1937 {
1938 const typename SV::value_type* hm_str = hmatr_.row(i);
1939 cmp = ::strcmp(hm_str, str);
1940 if (cmp == 0)
1941 {
1942 pos = l;
1943 return true;
1944 }
1945 if (cmp > 0) // vect[x] > str
1946 {
1947 pos = l;
1948 return false;
1949 }
1950 }
1951 cmp = this->compare_str(sv, l, str);
1952 if (cmp > 0) // vect[x] > str
1953 {
1954 pos = l;
1955 return false;
1956 }
1957 BM_ASSERT(0);
1958 pos = l;
1959 return false;
1960 }
1961 } // while
1962
1963 BM_ASSERT(0);
1964 return false;
1965}
1966
1967
1968//----------------------------------------------------------------------------
1969
1970template<typename SV>
1972 size_type idx,
1973 const value_type* str)
1974{
1975 if (bound_sv_ == &sv)
1976 {
1977 size_type nb = (idx >> bm::set_block_shift);
1978 size_type nbit = (idx & bm::set_block_mask);
1979 if (nbit == 0) // access to sentinel, first block element
1980 {
1981 value_type* s0 = block0_elements_cache_.row(nb);
1982 if (*s0 == 0) // uninitialized element
1983 {
1984 sv.get(idx, s0, size_type(block0_elements_cache_.cols()));
1985 }
1986 int res = 0;
1987 for (unsigned i = 0; i < block0_elements_cache_.cols(); ++i)
1988 {
1989 char octet = str[i]; char value = s0[i];
1990 res = (value > octet) - (value < octet);
1991 if (res || !octet)
1992 break;
1993 } // for i
1994 BM_ASSERT(res == sv.compare(idx, str));
1995 return res;
1996 }
1997 else
1998 {
1999 if (nbit % bm::sub_block3_size == 0) // TODO: use AND mask here
2000 {
2001 size_type k = nbit / bm::sub_block3_size - 1;
2002 value_type* s1 = block3_elements_cache_.row(nb * 3 + k);
2003 int res = 0;
2004 for (unsigned i = 0; i < block3_elements_cache_.cols(); ++i)
2005 {
2006 char octet = str[i]; char value = s1[i];
2007 res = (value > octet) - (value < octet);
2008 if (res || !octet)
2009 break;
2010 } // for i
2011 BM_ASSERT(res == sv.compare(idx, str));
2012 return res;
2013 }
2014 }
2015 }
2016 return sv.compare(idx, str);
2017}
2018
2019//----------------------------------------------------------------------------
2020
2021template<typename SV>
2023 size_type idx,
2024 const value_type val) BMNOEXCEPT
2025{
2026 // TODO: implement sentinel elements cache (similar to compare_str())
2027 return sv.compare(idx, val);
2028}
2029
2030//----------------------------------------------------------------------------
2031
2032template<typename SV>
2034 typename SV::value_type value,
2035 typename SV::bvector_type& bv_out)
2036{
2037 if (sv.empty())
2038 {
2039 bv_out.clear();
2040 return; // nothing to do
2041 }
2042 if (!value)
2043 {
2044 find_zero(sv, bv_out);
2045 return;
2046 }
2047
2048 find_eq_with_nulls(sv, value, bv_out, 0);
2049
2050 decompress(sv, bv_out);
2051 correct_nulls(sv, bv_out);
2052}
2053
2054//----------------------------------------------------------------------------
2055
2056template<typename SV>
2058 typename SV::value_type value,
2059 typename SV::size_type& pos)
2060{
2061 if (!value) // zero value - special case
2062 {
2063 bvector_type bv_zero;
2064 find_eq(sv, value, bv_zero);
2065 bool found = bv_zero.find(pos);
2066 return found;
2067 }
2068
2069 size_type found_pos;
2070 bool found = find_first_eq(sv, value, found_pos);
2071 if (found)
2072 {
2073 pos = found_pos;
2074 if (bm::conditional<SV::is_rsc_support::value>::test()) // test rank/select trait
2075 {
2076 if (sv.is_compressed()) // if compressed vector - need rank translation
2077 found = sv.find_rank(found_pos + 1, pos);
2078 }
2079 }
2080 return found;
2081}
2082
2083//----------------------------------------------------------------------------
2084
2085template<typename SV>
2087 typename SV::bvector_type& bv_out)
2088{
2089 agg_.reset(); // in case if previous scan was interrupted
2090 for (unsigned i = 0; i < sv.plains(); ++i)
2091 agg_.add(sv.get_plain(i));
2092 agg_.combine_or(bv_out);
2093 agg_.reset();
2094}
2095
2096//----------------------------------------------------------------------------
2097
2098template<typename SV>
2100 typename SV::bvector_type& bv_out)
2101{
2102 if (!sv.is_compressed())
2103 return; // nothing to do
2104 const bvector_type* bv_non_null = sv.get_null_bvector();
2105 BM_ASSERT(bv_non_null);
2106
2107 // TODO: implement faster decompressor for small result-sets
2108 rank_compr_.decompress(bv_tmp_, *bv_non_null, bv_out);
2109 bv_out.swap(bv_tmp_);
2110}
2111
2112//----------------------------------------------------------------------------
2113
2114template<typename SV>
2116{
2117 BM_ASSERT(from < to);
2118 mask_from_ = from;
2119 mask_to_ = to;
2120 mask_set_ = true;
2121}
2122
2123//----------------------------------------------------------------------------
2124
2125template<typename SV>
2127{
2128 mask_set_ = false;
2129 mask_from_ = mask_to_ = bm::id_max;
2130}
2131
2132
2133//----------------------------------------------------------------------------
2134//
2135//----------------------------------------------------------------------------
2136
2137
2138} // namespace bm
2139
2140#include "bmundef.h"
2141
2142#endif
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include)
Definitions(internal)
#define BMNOEXCEPT
Definition bmdef.h:79
#define BM_ASSERT
Definition bmdef.h:130
#define BM_VECT_ALIGN
Definition bmdef.h:359
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
pre-processor un-defines to avoid global space pollution (internal)
Algorithms for fast aggregation of a group of bit-vectors.
Constant iterator designed to enumerate "ON" bits.
Definition bm.h:600
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Definition bm.h:280
void assign_if_not_set(allocator_pool_type &pool, bvector< Alloc > &bv) BMNOEXCEPT
check if vector has no assigned allocator and set one
Definition bm.h:789
Bitvector Bit-vector container with runtime compression of bits.
Definition bm.h:108
@ opt_none
no optimization
Definition bm.h:131
bvector< Alloc > & invert()
Invert/NEG all bits It should be noted, invert is affected by size() if size is set - it only inverts...
Definition bm.h:2987
void resize(size_type new_size)
Change size of the bvector.
Definition bm.h:2256
bm::bvector< Alloc > & bit_or(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode)
3-operand OR : this := bv1 OR bv2
Definition bm.h:4809
allocator_type::allocator_pool_type allocator_pool_type
Definition bm.h:111
Alloc allocator_type
Definition bm.h:110
bm::bvector< Alloc > & bit_xor(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode)
3-operand XOR : this := bv1 XOR bv2
Definition bm.h:4906
Algorithms for rank compression of bit-vector.
Definition bmalgo.h:408
Integer set to set transformation (functional image in groups theory) https://en.wikipedia....
bvector_type::allocator_type::allocator_pool_type allocator_pool_type
bvector_type bv_zero_
bit-vector for zero elements
bvector_type bv_product_
temp vector
void attach_sv(const SV *sv_brel, bool compute_stats=false)
Attach a translation table vector for remapping (Image function)
allocator_pool_type pool_
const SV * sv_ptr_
current translation table vector
void remap(const bvector_type &bv_in, bvector_type &bv_out)
Perform remapping (Image function) (based on attached translation table)
bool have_stats_
flag of statistics presense
gather_buffer_type * gb_
intermediate buffers
void operator=(const set2set_11_transform &)=delete
void one_pass_run(const bvector_type &bv_in, const SV &sv_brel, bvector_type &bv_out)
SV::bvector_type bvector_type
void run(const bvector_type &bv_in, const SV &sv_brel, bvector_type &bv_out)
Run remap transformation.
gather_buffer< sv_g_size > gather_buffer_type
set2set_11_transform(const set2set_11_transform &)=delete
const bvector_type & get_bv_zero() const
Get read access to zero-elements vector Zero vector gets populated after attach_sv() is called or as ...
algorithms for sparse_vector scan/search
bm::heap_matrix< typename SV::value_type, linear_cutoff2, SV::sv_octet_plains, allocator_type > matrix_search_buf_type
void find_nonzero(const SV &sv, typename SV::bvector_type &bv_out)
Find non-zero elements Output vector is computed as a logical OR (join) of all plains.
bool prepare_and_sub_aggregator(const SV &sv, typename SV::value_type value)
Prepare aggregator for AND-SUB (EQ) search.
bool find_eq_with_nulls(const SV &sv, typename SV::value_type value, typename SV::bvector_type &bv_out, typename SV::size_type search_limit=0)
find value (may include NULL indexes)
allocator_type::allocator_pool_type allocator_pool_type
bool find_first_eq(const SV &sv, typename SV::value_type value, size_type &idx)
find first value (may include NULL indexes)
bvector_type::allocator_type allocator_type
const bvector_type * bvector_type_const_ptr
bool lower_bound_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
lower bound search for an array position
void find_eq_with_nulls_horizontal(const SV &sv, typename SV::value_type value, typename SV::bvector_type &bv_out)
For testing purposes only.
sparse_vector_scanner(const sparse_vector_scanner &)=delete
int compare(const SV &sv, size_type idx, const value_type val) BMNOEXCEPT
compare sv[idx] with input value
void reset_binding() BMNOEXCEPT
reset sparse vector binding
void invert(const SV &sv, typename SV::bvector_type &bv_out)
invert search result ("EQ" to "not EQ")
bool lower_bound(const SV &sv, const typename SV::value_type val, typename SV::size_type &pos)
lower bound search for an array position
void operator=(const sparse_vector_scanner &)=delete
void find_zero(const SV &sv, typename SV::bvector_type &bv_out)
find all sparse vector elements EQ to 0
void decompress(const SV &sv, typename SV::bvector_type &bv_out)
Rank-Select decompression for RSC vectors.
void find_eq(const SV &sv, typename SV::value_type value, typename SV::bvector_type &bv_out)
find all sparse vector elements EQ to search value
bm::dynamic_heap_matrix< value_type, allocator_type > heap_matrix_type
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
void reset_search_range()
reset (disable) search range
bool bfind_eq_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
binary find first sparse vector element (string) Sparse vector must be sorted.
int compare_str(const SV &sv, size_type idx, const value_type *str)
compare sv[idx] with input str
void set_search_range(size_type from, size_type to)
set search boundaries (hint for the aggregator)
void correct_nulls(const SV &sv, typename SV::bvector_type &bv_out)
Exclude possible NULL values from the result vector.
bool find_eq_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
find first sparse vector element (string)
BMFORCEINLINE bm::id_t word_bitcount(bm::id_t w) BMNOEXCEPT
Definition bmfunc.h:197
null_support
NULL-able value support.
Definition bmconst.h:214
@ BM_SORTED
input set is sorted (ascending order)
Definition bmconst.h:191
@ BM_SORTED_UNIFORM
sorted and in one block (internal!)
Definition bmconst.h:192
@ use_null
support "non-assigned" or "NULL" logic
Definition bmconst.h:215
@ no_null
do not support NULL values
Definition bmconst.h:216
bool sparse_vector_find_first_mismatch(const SV &sv1, const SV &sv2, typename SV::size_type &midx, bm::null_support null_proc=bm::use_null)
Find first mismatch (element which is different) between two sparse vectors (uses linear scan in bit-...
void dynamic_range_clip_low(SV &svect, unsigned low_bit)
Clip dynamic range for signal lower than specified (boost)
void dynamic_range_clip_high(SV &svect, unsigned high_bit)
Clip dynamic range for signal higher than specified.
void sparse_vector_find_mismatch(typename SV1::bvector_type &bv, const SV1 &sv1, const SV2 &sv2, bm::null_support null_proc)
Find mismatch vector, indicating positions of mismatch between two sparse vectors (uses linear scan i...
Definition bm.h:77
void xor_swap(W &x, W &y) BMNOEXCEPT
XOR swap two scalar variables.
Definition bmfunc.h:909
const unsigned id_max
Definition bmconst.h:108
const unsigned set_block_mask
Definition bmconst.h:56
unsigned short bitscan(V w, B *bits) BMNOEXCEPT
Definition bmfunc.h:554
const unsigned sub_block3_size
Definition bmconst.h:123
unsigned long long int id64_t
Definition bmconst.h:34
unsigned int id_t
Definition bmconst.h:37
const unsigned gap_max_bits
Definition bmconst.h:80
const unsigned set_block_shift
Definition bmconst.h:55
bm::bvector bvector_type
ad-hoc conditional expressions
Definition bmutil.h:111
size_type BM_VECT_ALIGN gather_idx_[BSIZE] BM_VECT_ALIGN_ATTR
value_type BM_VECT_ALIGN buffer_[BSIZE] BM_VECT_ALIGN_ATTR