BitMagic-C++
bmalgo_similarity.h
Go to the documentation of this file.
1#ifndef BMALGO_SIMILARITY__H__INCLUDED__
2#define BMALGO_SIMILARITY__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21#include <algorithm>
22#include <functional>
23#include <vector>
24
25#ifndef BM__H__INCLUDED__
26// BitMagic utility headers do not include main "bm.h" declaration
27// #include "bm.h" or "bm64.h" explicitly
28# error missing include (bm.h or bm64.h)
29#endif
30
31
32#include "bmfunc.h"
33#include "bmdef.h"
34
35#include "bmalgo_impl.h"
36
37namespace bm
38{
39
40/*! Similarity descriptor between two objects (bit vectors, blocks, etc)
41 \internal
42*/
43template <typename SO, unsigned DMD_SZ, typename IDX_VALUE, typename SValue, typename SFunc>
45{
46public:
48 typedef SValue similarity_value_type;
49 typedef SFunc similarity_functor;
50public:
52 : so1_(0), so2_(0), so1_idx_(0), so2_idx_(0)
53 {
54 similarity_ = 0;
55 }
56
57 similarity_descriptor(const SO* so1, const SO* so2,
58 const distance_metric_descriptor* dmd_ptr)
59 :so1_(so1),
60 so2_(so2),
61 so1_idx_(0), so2_idx_(0)
62 {
63 for (size_t i = 0; i < DMD_SZ; ++i)
64 dmd_[i] = dmd_ptr[i];
65 similarity_ = 0;
66 }
67
68 similarity_descriptor(const SO* so1, IDX_VALUE i1,
69 const SO* so2, IDX_VALUE i2,
70 const distance_metric_descriptor* dmd_ptr)
71 :so1_(so1), so2_(so2), so1_idx_(i1), so2_idx_(i2)
72 {
73 for (size_t i = 0; i < DMD_SZ; ++i)
74 dmd_[i] = dmd_ptr[i];
75 similarity_ = 0;
76 }
77
80 so1_(sd.so1_),
81 so2_(sd.so2_),
84 {
85 for (size_t i = 0; i < DMD_SZ; ++i)
86 dmd_[i] = sd.dmd_[i];
87 }
89 {
91 so1_ = sd.so1_; so2_ = sd.so2_;
93 for (size_t i = 0; i < DMD_SZ; ++i)
94 dmd_[i] = sd.dmd_[i];
95 return *this;
96 }
97
98 bool operator > (const similarity_descriptor& sd) const
99 {
100 return similarity_ > sd.similarity_;
101 }
102
103 SValue similarity() const { return similarity_; }
104 void set_similarity(SValue s) { similarity_ = s;}
105
106 const SO* get_first() const { return so1_; }
107 const SO* get_second() const { return so2_; }
108
109 IDX_VALUE get_first_idx() const { return so1_idx_; }
110 IDX_VALUE get_second_idx() const { return so2_idx_; }
111
114
115 void set_metric(size_t i, distance_metric metric)
116 {
117 BM_ASSERT(i < DMD_SZ);
118 dmd_[i].metric = metric;
119 }
120
121
122protected:
123 SValue similarity_; //< final similarity product
124 const SO* so1_; //< object 1 for similarity comparison
125 const SO* so2_; //< object 2 for similarity comparison
126 IDX_VALUE so1_idx_; //< index of object 1
127 IDX_VALUE so2_idx_; //< index of object 2
128 distance_metric_descriptor dmd_[DMD_SZ]; //< array of distance operations defined on objects
129};
130
131/*!
132 Batch of objects for similarity measurement
133 \internal
134*/
135template<class SDESCR>
137{
139 typedef typename SDESCR::similarity_object_type similarity_object_type;
140 typedef typename SDESCR::similarity_value_type similarity_value_type;
141 typedef typename SDESCR::similarity_functor similarity_functor;
142 typedef std::vector<SDESCR> vector_type;
143
144 /// run the similarity calculation using distance metrics engine
146 {
147 for( size_t i = 0; i < descr_vect_.size(); ++i)
148 {
150
151 const similarity_object_type* so1 = sdescr.get_first();
152 const similarity_object_type* so2 = sdescr.get_second();
153
154 distance_metric_descriptor* dit = sdescr.distance_begin();
155 distance_metric_descriptor* dit_end = sdescr.distance_end();
156 bm::distance_operation(*so1, *so2, dit, dit_end);
157
158 // reduce: use functor to compute final similarity metric
160 similarity_value_type d = func(dit, dit_end);
161 sdescr.set_similarity(d);
162 } // for i
163 }
164
165 void sort()
166 {
167 std::sort(descr_vect_.begin(), descr_vect_.end(), std::greater<SDESCR>());
168 }
169
170 void reserve(size_t cap)
171 {
172 descr_vect_.reserve(cap);
173 }
174
176 {
177 descr_vect_.push_back(sdt);
178 }
179
180public:
181 std::vector<SDESCR> descr_vect_;
182};
183
184
185/**
186 Utility function to build jaccard similarity batch for sparse_vector<>
187 \internal
188*/
189template<class SIMBATCH, class SV>
190void build_jaccard_similarity_batch(SIMBATCH& sbatch, const SV& sv)
191{
192
193 size_t plains = sv.plains();
194 sbatch.reserve((plains * plains) / 2);
195
197 dmd[0].metric = bm::COUNT_AND;
198 dmd[1].metric = bm::COUNT_OR;
199
200 // build a batch for triangular distance matrix
201 //
202 for (unsigned i = 0; i < plains; ++i)
203 {
204 const typename SV::bvector_type* bv1 = sv.get_plain(i);
205 if (bv1)
206 {
207 for (unsigned j = i+1; j < plains; ++j)
208 {
209 const typename SV::bvector_type* bv2 = sv.get_plain(j);
210 if (bv2 && bv1 != bv2)
211 {
212 sbatch.push_back(typename SIMBATCH::similaruty_descriptor_type(bv1, i, bv2, j, &dmd[0]));
213 }
214 } // for j
215 }
216
217 } // for i
218}
219
220
221
222} // namespace bm
223
224#include "bmundef.h"
225
226#endif
Algorithms for bvector<>
Definitions(internal)
#define BM_ASSERT
Definition bmdef.h:130
Bit manipulation primitives (internal)
pre-processor un-defines to avoid global space pollution (internal)
distance_metric_descriptor dmd_[DMD_SZ]
IDX_VALUE get_second_idx() const
similarity_descriptor(const SO *so1, IDX_VALUE i1, const SO *so2, IDX_VALUE i2, const distance_metric_descriptor *dmd_ptr)
similarity_descriptor(const similarity_descriptor &sd)
similarity_descriptor(const SO *so1, const SO *so2, const distance_metric_descriptor *dmd_ptr)
void set_metric(size_t i, distance_metric metric)
similarity_descriptor & operator=(const similarity_descriptor &sd)
distance_metric_descriptor * distance_end()
distance_metric_descriptor * distance_begin()
const SO * get_second() const
IDX_VALUE get_first_idx() const
bool operator>(const similarity_descriptor &sd) const
void distance_operation(const BV &bv1, const BV &bv2, distance_metric_descriptor *dmit, distance_metric_descriptor *dmit_end) BMNOEXCEPT
Distance computing template function.
distance_metric
Distance metrics codes defined for vectors A and B.
Definition bmalgo_impl.h:58
@ COUNT_AND
(A & B).count()
Definition bmalgo_impl.h:59
@ COUNT_OR
(A | B).count()
Definition bmalgo_impl.h:61
Definition bm.h:77
void build_jaccard_similarity_batch(SIMBATCH &sbatch, const SV &sv)
Utility function to build jaccard similarity batch for sparse_vector<>
Distance metric descriptor, holds metric code and result.
Definition bmalgo_impl.h:88
std::vector< SDESCR > vector_type
std::vector< SDESCR > descr_vect_
SDESCR::similarity_value_type similarity_value_type
SDESCR::similarity_object_type similarity_object_type
void reserve(size_t cap)
void push_back(const similaruty_descriptor_type &sdt)
SDESCR::similarity_functor similarity_functor
void calculate()
run the similarity calculation using distance metrics engine