BitMagic-C++
xsample05.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example xsample05.cpp
20*/
21
22/*! \file xsample05.cpp
23 \brief Example: Example on how to use bit-transposed string sparse vector
24
25 Illustrates how to build a sparse vector, serialize it to disk,
26 load back and do search or binary hybrid search.
27
28 \sa bm::str_sparse_vector<>
29
30*/
31
32#include <iostream>
33#include <sstream>
34#include <chrono>
35#include <regex>
36#include <time.h>
37#include <stdio.h>
38
39
40#include <vector>
41#include <map>
42#include <chrono>
43#include <map>
44#include <utility>
45#include <algorithm>
46#include <random>
47
48using namespace std;
49
50//#define BMAVX2OPT
51
52#include "bm.h"
53#include "bmalgo.h"
54#include "bmserial.h"
55#include "bmrandom.h"
56#include "bmstrsparsevec.h"
57#include "bmsparsevec_algo.h"
58#include "bmsparsevec_serial.h"
59#include "bmalgo_similarity.h"
60#include "bmsparsevec_util.h"
61
62
63#include "bmdbg.h"
64#include "bmtimer.h"
65
66/// Print help
67static
69{
70 std::cerr
71 << "BitMagic Dictionary Search Sample (c) 2018" << std::endl
72 << "-idict file-name -- input set file to parse" << std::endl
73 << "-svout spase vector output -- sparse vector name to save" << std::endl
74 << "-svin sparse vector input -- sparse vector file name to load " << std::endl
75 << "-diag -- run diagnostics" << std::endl
76 << "-bench -- run benchmarks" << std::endl
77 << "-timing -- collect timings" << std::endl
78 ;
79}
80
81
82// Arguments
83//
84std::string sv_out_name;
85std::string sv_in_name;
86std::string i_name;
87bool is_diag = false;
88bool is_timing = false;
89bool is_bench = false;
90
91// Parse command line arguments
92static
93int parse_args(int argc, char *argv[])
94{
95 for (int i = 1; i < argc; ++i)
96 {
97 std::string arg = argv[i];
98 if ((arg == "-h") || (arg == "--help"))
99 {
100 show_help();
101 return 0;
102 }
103
104 if (arg == "-svout" || arg == "--svout")
105 {
106 if (i + 1 < argc)
107 {
108 sv_out_name = argv[++i];
109 }
110 else
111 {
112 std::cerr << "Error: -svout requires file name" << std::endl;
113 return 1;
114 }
115 continue;
116 }
117
118 if (arg == "-svin" || arg == "--svin")
119 {
120 if (i + 1 < argc)
121 {
122 sv_in_name = argv[++i];
123 }
124 else
125 {
126 std::cerr << "Error: -svin requires file name" << std::endl;
127 return 1;
128 }
129 continue;
130 }
131
132 if (arg == "-idict" || arg == "--idict" )
133 {
134 if (i + 1 < argc)
135 {
136 i_name = argv[++i];
137 }
138 else
139 {
140 std::cerr << "Error: -idict requires file name" << std::endl;
141 return 1;
142 }
143 continue;
144 }
145
146 if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
147 {
148 is_diag = true;
149 continue;
150 }
151 if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
152 {
153 is_timing = true;
154 continue;
155 }
156 if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
157 {
158 is_bench = true;
159 continue;
160 }
161
162 std::cerr << "Error: unknown argument: " << arg << std::endl;
163 return 1;
164
165
166 } // for i
167 return 0;
168}
169
170
171// Global types
172//
174typedef vector<string> string_vector;
175
176
177
179
180/// Parse the input file and extract dictionary values.
181///
182static
183int load_dict_report(const std::string& fname, string_vector& str_vec)
184{
185 bm::chrono_taker tt1("1. parse input data ", 1, &timing_map);
186
187 std::ifstream fin(fname.c_str(), std::ios::in);
188 if (!fin.good())
189 return -1;
190
191 std::string line;
192 std::regex reg("[|]");
193 std::sregex_token_iterator it_end;
194
195 string trim_chars("\" ");
196
197 for (unsigned i = 0; std::getline(fin, line); ++i)
198 {
199 if (line.empty() || !isdigit(line.front()))
200 continue;
201
202 // regex based tokenizer
203 std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
204 std::vector<std::string> line_vec(it, it_end);
205 if (line_vec.empty())
206 continue;
207 try
208 {
209 // trip the extra chars
210 string& col13 = line_vec.at(13);
211 col13.erase(0, col13.find_first_not_of(trim_chars));
212 col13.erase(col13.find_last_not_of(trim_chars) + 1);
213
214 if (!col13.empty())
215 str_vec.emplace_back(col13);
216 }
217 catch (std::exception&)
218 {
219 // just ignore (not ideal, but ok for a sketch ETL)
220 }
221 if (i % 10000 == 0)
222 {
223 cout << "\rReading input file: " << i << flush;
224 }
225 } // for
226 cout << endl;
227 return 0;
228}
229
230/// Compare STL vector with bit-transposed container to check correcness
231///
232static
233void check_sparse(const str_sparse_vect& str_sv, const string_vector& str_vec)
234{
235 if (str_vec.size() != str_sv.size())
236 throw runtime_error("Error. size() comparison failed!");
237 string s;
238 for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
239 {
240 str_sv.get(i, s);
241 const string& s_control = str_vec[i];
242 if (s != s_control)
243 throw runtime_error("Error. element comparison failed!");
244 } // for
245 std::cout << "Check ok. Dictionary size = " << str_sv.size() << std:: endl;
246}
247
248const unsigned benchmark_max = 15000; // benchmark sampling size
249
250/// Sample a few random terms out of collection
251static
252void pick_benchmark_set(string_vector& bench_vec, string_vector& bench_vec_not_found, const string_vector& str_vec)
253{
254 std::random_device rand_dev;
255 std::mt19937 gen(rand_dev()); // mersenne_twister_engine
256 std::uniform_int_distribution<unsigned> rand_dis(0, unsigned(str_vec.size()-1)); // generate uniform numebrs for [1, vector_max]
257
258 bm::bvector<> bv;
259
260 bench_vec.resize(0);
261 for (unsigned i = 0; i < benchmark_max; ++i)
262 {
263 unsigned idx;
264 while (true)
265 {
266 idx = unsigned(rand_dis(gen));
267 if (bv.test(idx)) // make sure benchmark example is not repeated
268 continue;
269 if (idx < str_vec.size())
270 bench_vec.push_back(str_vec[idx]);
271 break;
272 }
273 bv.set(idx); // mark as set
274
275 // generate not-found case by modifying a letter in an existing sample
276 {
277 string str_nf = str_vec[idx];
278 string::reverse_iterator rit = str_nf.rbegin();
279 string::reverse_iterator rit_end = str_nf.rend();
280 for (; rit != rit_end; ++rit)
281 {
282 char ch = *rit;
283 int a = rand() % 26 + int('A'); // generate random letter
284 ch = char(a);
285 *rit = ch;
286 auto it = std::lower_bound(str_vec.begin(), str_vec.end(), str_nf);
287 if (it == str_vec.end() || *it != str_nf) // check if not found
288 {
289 bench_vec_not_found.push_back(str_nf);
290 break;
291 }
292 } // for rit
293 }
294
295 } // for
296 cout << endl;
297}
298
299static
300void run_benchmark(const str_sparse_vect& str_sv, const string_vector& str_vec)
301{
302 string_vector bench_vec;
303 string_vector bench_vec_not_found; // vector for impossible dictionary items
304
305 pick_benchmark_set(bench_vec, bench_vec_not_found, str_vec);
306
307 bm::bvector<> bv1, bv2, bv3, bv4;
308
309 cout << "Picked " << bench_vec.size() << " / "
310 << bench_vec_not_found.size() << " samples. Running benchmarks."
311 << endl;
312
313 unsigned bench_size = unsigned(bench_vec.size());
314 {
315 {
316 bm::chrono_taker tt1("3. std::lower_bound() search", bench_size, &timing_map);
317 for (const string& term : bench_vec)
318 {
319 auto it = std::lower_bound(str_vec.begin(), str_vec.end(), term);
320 if (it != str_vec.end())
321 {
322 string_vector::size_type idx =
323 string_vector::size_type(std::distance(str_vec.begin(), it));
324 bv1.set(unsigned(idx));
325 }
326 } // for
327 }
328 {
329 bm::chrono_taker tt2("3a. std::lower_bound() search (empty)", bench_size, &timing_map);
330 for (const string& term : bench_vec_not_found)
331 {
332 std::lower_bound(str_vec.begin(), str_vec.end(), term);
333 } // for
334 }
335 }
336
337 {
338 // construct std::map<> (RB-tree)
339 std::map<string, unsigned> str_map;
340 for (string_vector::size_type i = 0; i < str_vec.size(); ++i)
341 {
342 const string& s = str_vec[i];
343 str_map[s] = unsigned(i);
344 } // for
345 {
346 bm::chrono_taker tt1("4. std::map<> search", bench_size, &timing_map);
347 for (const string& term : bench_vec)
348 {
349 auto it = str_map.find(term);
350 if (it != str_map.end())
351 {
352 bv2.set(unsigned(it->second));
353 }
354 } // for
355 }
356 {
357 bm::chrono_taker tt2("4a. std::map<> search (empty)", bench_size, &timing_map);
358 for (const string& term : bench_vec_not_found)
359 {
360 auto it = str_map.find(term);
361 if (it != str_map.end())
362 {
363 cerr << "empty search returned value..." << endl;
364 }
365 } // for
366 }
367 }
368
369 {
371 {
372 bm::chrono_taker tt1("5. bm::sparse_vector_scanner<> search", bench_size, &timing_map);
373 for (const string& term : bench_vec)
374 {
375 unsigned pos;
376 bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
377 if (found)
378 {
379 bv3.set(pos);
380 }
381 } // for
382 }
383 {
384 bm::chrono_taker tt1("5a. bm::sparse_vector_scanner<> search (empty)", bench_size, &timing_map);
385 for (const string& term : bench_vec_not_found)
386 {
387 unsigned pos;
388 bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
389 if (found)
390 {
391 cerr << "scanner empty search returned value..." << endl;
392 }
393 } // for
394 }
395 }
396
397 {
399 scanner.bind(str_sv, true); // attach SV as permanent search parameter to share cached values
400
401 {
402 bm::chrono_taker tt1("6. bm::sparse_vector_scanner<> binary search", bench_size, &timing_map);
403 for (const string& term : bench_vec)
404 {
405 unsigned pos;
406 bool found = scanner.bfind_eq_str(term.c_str(), pos);
407 if (found)
408 {
409 bv4.set(pos);
410 }
411 } // for
412 }
413 {
414 bm::chrono_taker tt2("6a. bm::sparse_vector_scanner<> binary search (empty)", bench_size, &timing_map);
415 for (const string& term : bench_vec_not_found)
416 {
417 unsigned pos;
418 bool found = scanner.bfind_eq_str(term.c_str(), pos);
419 if (found)
420 {
421 cerr << "scanner empty search returned value..." << endl;
422 }
423 } // for
424 }
425 }
426
427 // various integrity checks
428 //
429 int cmp = bv1.compare(bv2);
430 if (cmp != 0)
431 throw runtime_error("Error. RB-search mismatch!");
432 cmp = bv1.compare(bv3);
433 if (cmp != 0)
434 throw runtime_error("Error. scanner mismatch!");
435
436 cmp = bv1.compare(bv4);
437 if (cmp != 0)
438 throw runtime_error("Error. binary scanner mismatch!");
439
440 if (bv1.count() != bench_size)
441 throw runtime_error("Error. Search result missing elements!");
442
443
444}
445
446
447int main(int argc, char *argv[])
448{
449 if (argc < 3)
450 {
451 show_help();
452 return 1;
453 }
454
455 string_vector str_vec; // dictionary vector (STL)
456 str_sparse_vect str_sv; // bit-transposed sparse vector
457
458 try
459 {
460 auto ret = parse_args(argc, argv);
461 if (ret != 0)
462 {
463 return ret;
464 }
465 if (!i_name.empty())
466 {
467 auto res = load_dict_report(i_name, str_vec);
468 if (res != 0)
469 {
470 return res;
471 }
472 cout << "Loaded " << str_vec.size() << " dictionary names." << endl;
473
474 std::sort(str_vec.begin(), str_vec.end());
475 }
476
477 if (str_vec.size()) // load the sparse vector
478 {
479 bm::chrono_taker tt1("2. build sparse vector", 1, &timing_map);
480
481 for (const string& term : str_vec)
482 str_sv.push_back(term);
483
484 // build remapped (dense) sparse vector
485 // (this should be final), no more edits in this form
486 {
487 str_sparse_vect str_sv_remap;
488 str_sv_remap.remap_from(str_sv);
489 str_sv.swap(str_sv_remap);
490 }
491
493 str_sv.optimize(tb); // memory optimization after load
494 }
495
496 // save SV vector to file
497 if (!sv_out_name.empty() && !str_sv.empty())
498 {
499 bm::chrono_taker tt1("7. Save sparse vector", 1, &timing_map);
500 file_save_svector(str_sv, sv_out_name);
501 }
502
503 if (!sv_in_name.empty())
504 {
505 {
506 bm::chrono_taker tt1("8. Load sparse vector", 1, &timing_map);
507 file_load_svector(str_sv, sv_in_name);
508 }
509 if (str_vec.empty())
510 {
511 for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
512 {
513 string s;
514 str_sv.get(i, s);
515 str_vec.emplace_back(std::move(s));
516 } // for
517 }
518 }
519
520
521 if (is_diag)
522 {
523 if (!str_sv.empty())
524 {
525 print_svector_stat(str_sv, true);
526 }
527
528 if (str_vec.size())
529 {
530 size_t total_size = 0;
531 for (const string& term : str_vec)
532 {
533 total_size += term.size();
534 }
535 cout << "String dictionary size: "
536 << total_size / 1024 << "KB (" << total_size / (1024*1024) << "MB)"
537 << endl;
538 }
539
540 if (str_sv.size() && str_vec.size())
541 {
542 cout << "Run full comparison check..." << endl;
543 check_sparse(str_sv, str_vec); // run a paranoiya check
544 cout << "Ok" << endl;
545 }
546 }
547
548 if (is_bench) // run set of benchmarks
549 {
550 run_benchmark(str_sv, str_vec);
551 }
552
553 if (is_timing) // print all collected timings
554 {
555 std::cout << std::endl << "Performance:" << std::endl;
557 }
558 }
559 catch (std::exception& ex)
560 {
561 std::cerr << "Error:" << ex.what() << std::endl;
562 return 1;
563 }
564
565 return 0;
566}
567
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition bm.h:47
Algorithms for bvector<> (main include)
Generation of random subset.
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Serialization for sparse_vector<>
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal)
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false is bit n is 0.
Definition bm.h:1440
size_type count() const BMNOEXCEPT
population cout (count of ON bits)
Definition bm.h:2194
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
Definition bm.h:3581
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition bm.h:3159
Utility class to collect performance measurements and statistics.
Definition bmtimer.h:40
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition bmtimer.h:65
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition bmtimer.h:127
algorithms for sparse_vector scan/search
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
bool bfind_eq_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
binary find first sparse vector element (string) Sparse vector must be sorted.
bool find_eq_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
find first sparse vector element (string)
sparse vector for strings with compression using bit transposition method
void remap_from(const str_sparse_vector &str_sv)
Build remapping profile and load content from another sparse vector.
bool empty() const
return true if vector is empty
void swap(str_sparse_vector &str_sv) BMNOEXCEPT
void push_back(const StrType &str)
push back a string
bvector_type::size_type size_type
size_type size() const
return size of the vector
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, MAX_STR_SIZE >::statistics *stat=0)
run memory optimization for all vector plains
size_type get(size_type idx, value_type *str, size_type buf_size) const BMNOEXCEPT
get specified element
std::random_device rand_dev
Definition sample11.cpp:49
std::uniform_int_distribution rand_dis(1, int(vector_max))
std::mt19937 gen(rand_dev())
int main(void)
Definition sample1.cpp:38
static int parse_args(int argc, char *argv[])
Definition xsample05.cpp:93
bool is_bench
Definition xsample05.cpp:89
std::string sv_in_name
Definition xsample05.cpp:85
bm::chrono_taker::duration_map_type timing_map
static void show_help()
Print help.
Definition xsample05.cpp:68
vector< string > string_vector
bool is_diag
Definition xsample05.cpp:87
static int load_dict_report(const std::string &fname, string_vector &str_vec)
Parse the input file and extract dictionary values.
std::string sv_out_name
Definition xsample05.cpp:84
static void run_benchmark(const str_sparse_vect &str_sv, const string_vector &str_vec)
static void pick_benchmark_set(string_vector &bench_vec, string_vector &bench_vec_not_found, const string_vector &str_vec)
Sample a few random terms out of collection.
static void check_sparse(const str_sparse_vect &str_sv, const string_vector &str_vec)
Compare STL vector with bit-transposed container to check correcness.
std::string i_name
Definition xsample05.cpp:86
const unsigned benchmark_max
bool is_timing
Definition xsample05.cpp:88
bm::str_sparse_vector< char, bm::bvector<>, 64 > str_sparse_vect