btllib
mi_bloom_filter.hpp
1#ifndef BTLLIB_MI_BLOOM_FILTER_HPP
2#define BTLLIB_MI_BLOOM_FILTER_HPP
3
4#include "btllib/nthash.hpp"
5#include "btllib/status.hpp"
6
7#include "sdsl/bit_vector_il.hpp"
8#include "sdsl/rank_support.hpp"
9
10#include <algorithm> // std::random_shuffle
11#include <cassert>
12#include <cmath>
13#include <cstdint>
14#include <cstdio>
15#include <cstdlib>
16#include <cstring>
17#include <fstream>
18#include <iostream>
19#include <limits>
20#include <string>
21#include <sys/stat.h> // NOLINT
22#include <vector>
23
24namespace btllib {
25
26template<typename T>
28{
29public:
30 static const T MASK = 1 << (sizeof(T) * 8 - 1);
31 static const T ANTI_MASK = (T)~MASK;
32
33 static const T STRAND = 1 << (sizeof(T) * 8 - 2);
34 static const T ANTI_STRAND = (T)~STRAND;
35
36 static const T ID_MASK = ANTI_STRAND & ANTI_MASK;
37
38 static const unsigned BLOCKSIZE = 512;
39
40 // Calculates the per frame probability of a random match for single value
41 static inline double calc_prob_single_frame(double occupancy,
42 unsigned hash_num,
43 double freq,
44 unsigned allowed_misses)
45 {
46 double prob_total = 0.0;
47 for (unsigned i = hash_num - allowed_misses; i <= hash_num; i++) {
48 double prob = n_choose_k(hash_num, i);
49 prob *= pow(occupancy, i);
50 prob *= pow(1.0 - occupancy, hash_num - i);
51 prob *= (1.0 - pow(1.0 - freq, i));
52 prob_total += prob;
53 }
54 return prob_total;
55 }
56
// Product of the filter occupancy and the frequency of a single value.
static inline double calc_prob_single(double occupancy, double freq)
{
  const double joint = occupancy * freq;
  return joint;
}
61
/*
 * Returns a filter size large enough to maintain the occupancy specified.
 * The estimate -entries * hash_num / ln(1 - occupancy) is truncated and then
 * moved up to the next multiple of 64 (a value that is already a multiple
 * of 64 still grows by 64).
 */
static size_t calc_optimal_size(size_t entries,
                                unsigned hash_num,
                                double occupancy)
{
  const size_t word_bits = 64;
  const double estimate =
    -double(entries) * double(hash_num) / std::log(1.0 - occupancy);
  const size_t truncated = size_t(estimate);
  return (truncated / word_bits + 1) * word_bits;
}
74
75 /*
76 * Inserts a set of hash values into an sdsl bitvector and returns the number
77 * of collisions Thread safe on the bv, though return values will not be the
78 * same run to run
79 */
80 static unsigned insert(sdsl::bit_vector& bv,
81 const uint64_t* hash_values,
82 unsigned hash_num)
83 {
84 unsigned colli_count = 0;
85 for (unsigned i = 0; i < hash_num; ++i) {
86 const int magic = 0x3f;
87 uint64_t pos = hash_values[i] % bv.size();
88 uint64_t* data_index = bv.data() + (pos >> 6); // NOLINT
89 uint64_t bit_mask_value = (uint64_t)1 << (pos & magic);
90 colli_count +=
91 __sync_fetch_and_or(data_index, bit_mask_value) >> (pos & magic) & 1;
92 }
93 return colli_count;
94 }
95
96 // TODO: include allowed miss in header
#pragma pack(push, 1) // to maintain consistent values across platforms
/*
 * Fixed-size header written at the start of a stored filter file.
 * It is followed by the spaced seed strings (if any) and then by the
 * value array.
 */
struct FileHeader
{
  char magic[8];    // NOLINT -- file type tag, "MIBLOOMF"
  uint32_t hlen;    // header length (including spaced seeds)
  uint64_t size;    // number of elements of type T in the value array
  uint32_t nhash;   // number of hash functions
  uint32_t kmer;    // k-mer size; also the length of every stored seed
  uint32_t version; // compared against MI_BLOOM_FILTER_VERSION on load
  // uint8_t allowed_miss;
};
#pragma pack(pop) // restore the packing in effect before this header
110
111 /*
112 * Constructor using a prebuilt bitvector
113 */
115 unsigned hash_num,
116 unsigned kmer_size,
117 sdsl::bit_vector& bv,
118 const std::vector<std::string>& seeds = std::vector<std::string>(0))
119 : m_d_size(0)
120 , m_hash_num(hash_num)
121 , m_kmer_size(kmer_size)
122 , m_sseeds(seeds)
123 , m_prob_saturated(0)
124 {
125 m_bv = sdsl::bit_vector_il<BLOCKSIZE>(bv);
126 bv = sdsl::bit_vector();
127 if (!seeds.empty()) {
128 m_ss_val = parse_seeds(m_sseeds);
129 assert(m_sseeds[0].size() == kmer_size);
130 for (auto itr = m_sseeds.begin(); itr != m_sseeds.end(); ++itr) {
131 // check if spaced seeds are all the same length
132 assert(m_kmer_size == itr->size());
133 }
134 }
135 m_rank_support = sdsl::rank_support_il<1>(&m_bv);
136 m_d_size = get_pop();
137 m_data = new T[m_d_size]();
138 }
139
140 MIBloomFilter<T>(const std::string& filter_file_path)
141 : m_prob_saturated(pow(double(get_pop_saturated()) / double(get_pop()),
142 m_hash_num)) // TODO: make more streamlined
143 {
144#pragma omp parallel for default(none) shared(filter_file_path)
145 for (unsigned i = 0; i < 2; ++i) {
146 if (i == 0) {
147 FILE* file = fopen(filter_file_path.c_str(), "rbe");
148 check_error(file == nullptr,
149 "MIBloomFilter: File " + filter_file_path +
150 " could not be read.");
151
152 FileHeader header;
153 check_error(fread(&header, sizeof(struct FileHeader), 1, file) != 1,
154 "MIBloomFilter: Failed to load header.");
155 log_info("MIBloomFilter: Loading header...");
156
157 const int magic_nine = 9;
158 char magic[magic_nine];
159 const int magic_eight = 8;
160 memcpy(magic, header.magic, magic_eight);
161 magic[magic_eight] = '\0';
162
163 log_info("MIBloomFilter: Loaded header\nmagic: " + std::string(magic) +
164 "\nhlen: " + std::to_string(header.hlen) +
165 "\nsize: " + std::to_string(header.size) +
166 "\nnhash: " + std::to_string(header.nhash) +
167 "\nkmer: " + std::to_string(header.kmer));
168
169 m_hash_num = header.nhash;
170 m_kmer_size = header.kmer;
171 m_d_size = header.size;
172 m_data = new T[m_d_size]();
173
174 if (header.hlen > sizeof(struct FileHeader)) {
175 // load seeds
176 for (unsigned i = 0; i < header.nhash; ++i) {
177 char temp[header.kmer];
178
179 check_error(fread(temp, header.kmer, 1, file) != 1,
180 "MIBloomFilter: Failed to load spaced seed string.");
181 log_info("MIBloomFilter: Spaced seed " + std::to_string(i) + ": " +
182 std::string(temp, header.kmer));
183 m_sseeds.push_back(std::string(temp, header.kmer));
184 }
185
186 m_ss_val = parse_seeds(m_sseeds);
187 assert(m_sseeds[0].size() == m_kmer_size);
188 for (auto itr = m_sseeds.begin(); itr != m_sseeds.end(); ++itr) {
189 // check if spaced seeds are all the same length
190 assert(m_kmer_size == itr->size());
191 }
192 }
193
195 header.hlen != (sizeof(FileHeader) + m_kmer_size * m_sseeds.size()),
196 "MIBloomFilter: header length: " + std::to_string(header.hlen) +
197 " does not match expected length: " +
198 std::to_string(sizeof(FileHeader) + m_kmer_size * m_sseeds.size()) +
199 " (likely version mismatch).");
200
201 check_error(strcmp(magic, "MIBLOOMF") != 0,
202 "MIBloomFilter: Bloom filter type does not matc.");
203
204 check_error(header.version != MI_BLOOM_FILTER_VERSION,
205 "MIBloomFilter: Bloom filter version does not match: " +
206 std::to_string(header.version) + " expected " +
207 std::to_string(MI_BLOOM_FILTER_VERSION) + ".");
208
209 log_info("MIBloomFilter: Loading data vector");
210
211 long int l_cur_pos = ftell(file);
212 fseek(file, 0, 2);
213 size_t file_size = ftell(file) - header.hlen;
214 fseek(file, l_cur_pos, 0);
215
216 check_error(file_size != m_d_size * sizeof(T),
217 "MIBloomFilter: " + filter_file_path +
218 " does not match size given by its header. Size: " +
219 std::to_string(file_size) + " vs " +
220 std::to_string(m_d_size * sizeof(T)) + " bytes.");
221
222 size_t count_read = fread(m_data, file_size, 1, file);
223
224 check_error(count_read != 1 && fclose(file) != 0,
225 "MIBloomFilter: File " + filter_file_path +
226 " could not be read.");
227 }
228
229 else {
230 std::string bv_filename = filter_file_path + ".sdsl";
231 log_info("MIBloomFilter: Loading sdsl interleaved bit vector from: " +
232 bv_filename);
233 load_from_file(m_bv, bv_filename);
234 m_rank_support = sdsl::rank_support_il<1>(&m_bv);
235 }
236 }
237
238 log_info("MIBloomFilter: Bit vector size: " + std::to_string(m_bv.size()) +
239 "\nPopcount: " + std::to_string(get_pop()));
240 }
241
242 /*
243 * Stores the filter as a binary file to the path specified
244 * Stores uncompressed because the random data tends to
245 * compress poorly anyway
246 */
247 void store(std::string const& filter_file_path) const
248 {
249
250#pragma omp parallel for default(none) shared(filter_file_path)
251 for (unsigned i = 0; i < 2; ++i) {
252 if (i == 0) {
253 std::ofstream my_file(filter_file_path.c_str(),
254 std::ios::out | std::ios::binary);
255
256 assert(my_file);
257 write_header(my_file);
258
259 // std::cerr << "Storing filter. Filter is
260 //"
261 //<<
262 // m_d_size * sizeof(T)
263 // << " bytes." <<
264 // std::endl;
265
266 // write out each block
267 my_file.write(reinterpret_cast<char*>(m_data), m_d_size * sizeof(T));
268
269 my_file.close();
270 assert(my_file);
271
272 FILE* file = fopen(filter_file_path.c_str(), "rbe");
273 check_error(file == nullptr,
274 "MIBloomFilter: " + filter_file_path +
275 " could not be read.");
276 } else {
277 std::string bv_filename = filter_file_path + ".sdsl";
278 // std::cerr << "Storing sdsl interleaved
279 // bit
280 // vector to: " << bv_filename
281 // << std::endl;
282 store_to_file(m_bv, bv_filename);
283 // std::cerr << "Number of bit vector
284 // buckets is
285 //"
286 //<< m_bv.size()
287 // << std::endl;
288 // std::cerr << "Uncompressed bit vector
289 // size is
290 //"
291 // << (m_bv.size() +
292 // m_bv.size()
293 //* 64
295 // << " bytes" <<
296 // std::endl;
297 }
298 }
299 }
300
301 /*
302 * Returns false if unable to insert hashes values
303 * Contains strand information
304 * Inserts hash functions in random order
305 */
306 bool insert(const uint64_t* hashes, const bool* strand, T val, unsigned max)
307 {
308 unsigned count = 0;
309 std::vector<unsigned> hash_order;
310 bool saturated = true;
311 // for random number generator seed
312 uint64_t rand_value = val;
313 bool strand_dir = true;
314 if (max % 2 == 0) {
315 strand_dir = false;
316 }
317
318 // check values and if value set
319 for (unsigned i = 0; i < m_hash_num; ++i) {
320 // check if values are already set
321 uint64_t pos = m_rank_support(hashes[i] % m_bv.size());
322 T value = strand_dir ^ strand[i] ? val | STRAND : val;
323 // check for saturation
324 T old_val = m_data[pos];
325
326 if (old_val > MASK) {
327 old_val = old_val & ANTI_MASK;
328 } else {
329 saturated = false;
330 }
331
332 if (old_val == value) {
333 ++count;
334 } else {
335 hash_order.push_back(i);
336 }
337
338 if (count >= max) {
339 return true;
340 }
341 rand_value ^= hashes[i];
342 }
343 std::minstd_rand g(rand_value);
344 std::shuffle(hash_order.begin(), hash_order.end(), g);
345
346 // insert seeds in random order
347 for (const auto& o : hash_order) {
348 uint64_t pos = m_rank_support(hashes[o] % m_bv.size());
349 T value = strand_dir ^ strand[o] ? val | STRAND : val;
350 // check for saturation
351 T old_val = set_val(&m_data[pos], value);
352
353 if (old_val > MASK) {
354 old_val = old_val & ANTI_MASK;
355 } else {
356 saturated = false;
357 }
358
359 if (old_val == 0) {
360 ++count;
361 }
362
363 if (count >= max) {
364 return true;
365 }
366 }
367
368 if (count == 0) {
369 if (!saturated) {
370 assert(
371 max ==
372 1); // if this triggers then spaced seed is probably not symmetric
373 saturate(hashes);
374 }
375 return false;
376 }
377 return true;
378 }
379
380 /*
381 * Returns false if unable to insert hashes values
382 * Inserts hash functions in random order
383 */
384 bool insert(const uint64_t* hashes, T value, unsigned max)
385 {
386 unsigned count = 0;
387 std::vector<unsigned> hash_order;
388 // for random number generator seed
389 uint64_t rand_value = value;
390
391 bool saturated = true;
392
393 // check values and if value set
394 for (unsigned i = 0; i < m_hash_num; ++i) {
395 // check if values are already set
396 uint64_t pos = m_rank_support(hashes[i] % m_bv.size());
397 // check for saturation
398 T old_val = m_data[pos];
399
400 if (old_val > MASK) {
401 old_val = old_val & ANTI_MASK;
402 } else {
403 saturated = false;
404 }
405
406 if (old_val == value) {
407 ++count;
408 } else {
409 hash_order.push_back(i);
410 }
411
412 if (count >= max) {
413 return true;
414 }
415
416 rand_value ^= hashes[i];
417 }
418 std::minstd_rand g(rand_value);
419 std::shuffle(hash_order.begin(), hash_order.end(), g);
420
421 // insert seeds in random order
422 for (const auto& o : hash_order) {
423 uint64_t pos = m_rank_support(hashes[o] % m_bv.size());
424 // check for saturation
425 T old_val = set_val(&m_data[pos], value);
426
427 if (old_val > MASK) {
428 old_val = old_val & ANTI_MASK;
429 } else {
430 saturated = false;
431 }
432
433 if (old_val == 0) {
434 ++count;
435 }
436
437 if (count >= max) {
438 return true;
439 }
440 }
441
442 if (count == 0) {
443 if (!saturated) {
444 assert(
445 max ==
446 1); // if this triggers then spaced seed is probably not symmetric
447 saturate(hashes);
448 }
449 return false;
450 }
451 return true;
452 }
453
454 void saturate(const uint64_t* hashes)
455 {
456 for (unsigned i = 0; i < m_hash_num; ++i) {
457 uint64_t pos = m_rank_support(hashes[i] % m_bv.size());
458 __sync_or_and_fetch(&m_data[pos], MASK);
459 }
460 }
461
462 inline std::vector<T> at(const uint64_t* hashes,
463 bool& saturated,
464 unsigned max_miss = 0)
465 {
466 std::vector<T> results(m_hash_num);
467 unsigned misses = 0;
468 for (unsigned i = 0; i < m_hash_num; ++i) {
469 uint64_t pos = hashes[i] % m_bv.size();
470 if (m_bv[pos] == 0) {
471 ++misses;
472 saturated = false;
473 if (misses > max_miss) {
474 return std::vector<T>();
475 }
476 } else {
477 uint64_t rank_pos = m_rank_support(pos);
478 T temp_result = m_data[rank_pos];
479 if (temp_result > MASK) {
480 results[i] = m_data[rank_pos] & ANTI_MASK;
481 } else {
482 results[i] = m_data[rank_pos];
483 saturated = false;
484 }
485 }
486 }
487 return results;
488 }
489
490 /*
491 * Populates rank pos vector. Boolean vector is use to confirm if hits are
492 * good Returns total number of misses found
493 */
494 unsigned at_rank(const uint64_t* hashes,
495 std::vector<uint64_t>& rank_pos,
496 std::vector<bool>& hits,
497 unsigned max_miss) const
498 {
499 unsigned misses = 0;
500 for (unsigned i = 0; i < m_hash_num; ++i) {
501 uint64_t pos = hashes[i] % m_bv.size();
502 if (bool(m_bv[pos])) {
503 rank_pos[i] = m_rank_support(pos);
504 hits[i] = true;
505 } else {
506 if (++misses > max_miss) {
507 return misses;
508 }
509
510 hits[i] = false;
511 }
512 }
513 return misses;
514 }
515
516 /*
517 * For k-mers
518 * Returns if match succeeded
519 */
520 bool at_rank(const uint64_t* hashes, std::vector<uint64_t>& rank_pos) const
521 {
522 for (unsigned i = 0; i < m_hash_num; ++i) {
523 uint64_t pos = hashes[i] % m_bv.size();
524 if (bool(m_bv[pos])) {
525 rank_pos[i] = m_rank_support(pos);
526 } else {
527 return false;
528 }
529 }
530 return true;
531 }
532
533 std::vector<uint64_t> get_rank_pos(const uint64_t* hashes) const
534 {
535 std::vector<uint64_t> rank_pos(m_hash_num);
536 for (unsigned i = 0; i < m_hash_num; ++i) {
537 uint64_t pos = hashes[i] % m_bv.size();
538 rank_pos[i] = m_rank_support(pos);
539 }
540 return rank_pos;
541 }
542
543 uint64_t get_rank_pos(const uint64_t hash) const
544 {
545 return m_rank_support(hash % m_bv.size());
546 }
547
548 const std::vector<std::vector<unsigned>>& get_seed_values() const
549 {
550 return m_ss_val;
551 }
552
553 unsigned get_kmer_size() const { return m_kmer_size; }
554
555 unsigned get_hash_num() const { return m_hash_num; }
556
557 /*
558 * Computes id frequency based on data vector contents
559 * Returns counts of repetitive sequence
560 */
561 size_t get_id_counts(std::vector<size_t>& counts) const
562 {
563 size_t saturated_counts = 0;
564 for (size_t i = 0; i < m_d_size; ++i) {
565 if (m_data[i] > MASK) {
566 ++counts[m_data[i] & ANTI_MASK];
567 ++saturated_counts;
568 } else {
569 ++counts[m_data[i]];
570 }
571 }
572 return saturated_counts;
573 }
574
575 /*
576 * computes id frequency based on datavector
577 * Returns counts of repetitive sequence
578 */
579 size_t get_id_counts_strand(std::vector<size_t>& counts) const
580 {
581 size_t saturated_counts = 0;
582 for (size_t i = 0; i < m_d_size; ++i) {
583 if (m_data[i] > MASK) {
584 ++counts[m_data[i] & ID_MASK];
585 ++saturated_counts;
586 } else {
587 ++counts[m_data[i] & ANTI_STRAND];
588 }
589 }
590 return saturated_counts;
591 }
592
593 size_t get_pop() const
594 {
595 size_t index = m_bv.size() - 1;
596 while (m_bv[index] == 0) {
597 --index;
598 }
599 return m_rank_support(index) + 1;
600 }
601
602 /*
603 * Mostly for debugging
604 * should equal get_pop if fully populated
605 */
606 size_t get_pop_non_zero() const
607 {
608 size_t count = 0;
609 for (size_t i = 0; i < m_d_size; ++i) {
610 if (m_data[i] != 0) {
611 ++count;
612 }
613 }
614 return count;
615 }
616
617 /*
618 * Checks data array for abnormal IDs
619 * (i.e. values greater than what is possible)
620 * Returns first abnormal ID or value of max_val if no abnormal IDs are found
621 * For debugging
622 */
623 T check_values(T max_val) const
624 {
625 for (size_t i = 0; i < m_d_size; ++i) {
626 if ((m_data[i] & ANTI_MASK) > max_val) {
627 return m_data[i];
628 }
629 }
630 return max_val;
631 }
632
633 size_t get_pop_saturated() const
634 {
635 size_t count = 0;
636 for (size_t i = 0; i < m_d_size; ++i) {
637 if (m_data[i] > MASK) {
638 ++count;
639 }
640 }
641 return count;
642 }
643
644 size_t size() const { return m_bv.size(); }
645
646 // overwrites existing value CAS
647 void set_data(uint64_t pos, T id)
648 {
649 T old_value;
650 do {
651 old_value = m_data[pos];
652 if (old_value > MASK) {
653 id |= MASK;
654 }
655 } while (!__sync_bool_compare_and_swap(&m_data[pos], old_value, id));
656 }
657
658 // saturates values
659 void saturate_data(uint64_t pos)
660 {
661#pragma omp critical
662 m_data[pos] |= MASK;
663 }
664
665 // Does not overwrite
666 void set_data_if_empty(uint64_t pos, T id) { set_val(&m_data[pos], id); }
667
668 std::vector<T> get_data(const std::vector<uint64_t>& rank_pos) const
669 {
670 std::vector<T> results(rank_pos.size());
671 for (unsigned i = 0; i < m_hash_num; ++i) {
672 results[i] = m_data[rank_pos[i]];
673 }
674 return results;
675 }
676
677 T get_data(uint64_t rank) const { return m_data[rank]; }
678
679 /*
680 * Preconditions:
681 * frame_probs but be equal in size to multiMatchProbs
682 * frame_probs must be preallocated to correct size (number of ids + 1)
683 * Max value is the largest value seen in your set of possible values
684 * Returns proportion of saturated elements relative to all elements
685 */
686 double calc_frame_probs(std::vector<double>& frame_probs,
687 unsigned allowed_miss)
688 {
689 double occupancy = double(get_pop()) / double(size());
690 std::vector<size_t> count_table =
691 std::vector<size_t>(frame_probs.size(), 0);
692 double sat_prop = double(get_id_counts(count_table));
693 size_t sum = 0;
694 for (size_t i = 1; i < count_table.size(); ++i) {
695 sum += count_table[i];
696 }
697 sat_prop /= double(sum);
698 for (size_t i = 1; i < count_table.size(); ++i) {
699 frame_probs[i] =
700 calc_prob_single_frame(occupancy,
701 m_hash_num,
702 double(count_table[i]) / double(sum),
703 allowed_miss);
704 }
705 return sat_prop;
706 }
707
708 /*
709 * Preconditions:
710 * frame_probs but be equal in size to multiMatchProbs
711 * frame_probs must be preallocated to correct size (number of ids + 1)
712 * Max value is the largest value seen in your set of possible values
713 * Returns proportion of saturated elements relative to all elements
714 */
715 double calc_frame_probs_strand(std::vector<double>& frame_probs,
716 unsigned allowed_miss)
717 {
718 double occupancy = double(get_pop()) / double(size());
719 std::vector<size_t> count_table =
720 std::vector<size_t>(frame_probs.size(), 0);
721 double sat_prop = double(get_id_counts_strand(count_table));
722 size_t sum = 0;
723 for (const auto& c : count_table) {
724 sum += c;
725 }
726 sat_prop /= double(sum);
727#pragma omp parallel for default(none) shared(count_table)
728 for (size_t i = 1; i < count_table.size(); ++i) {
729 frame_probs[i] =
730 calc_prob_single_frame(occupancy,
731 m_hash_num,
732 double(count_table[i]) / double(sum),
733 allowed_miss);
734 // frame_probs[i] = calc_prob_single(occupancy,
735 // double(count_table[i]) /
736 // double(sum));
737 }
738 return sat_prop;
739 }
740
741 ~MIBloomFilter() { delete[] m_data; }
742
743private:
// Comparator for sorting std::vector elements:
// orders pairs by their second element only (strictly ascending).
static bool sort_by_sec(const std::pair<int, int>& a,
                        const std::pair<int, int>& b)
{
  const int lhs_key = a.second;
  const int rhs_key = b.second;
  return lhs_key < rhs_key;
}
751
752 /*
753 * Helper function for header storage
754 */
755 void write_header(std::ofstream& out) const
756 {
757 FileHeader header;
758 const int magic_num = 8;
759 memcpy(header.magic, "MIBLOOMF", magic_num);
760
761 header.hlen = sizeof(struct FileHeader) + m_kmer_size * m_sseeds.size();
762 header.kmer = m_kmer_size;
763 header.size = m_d_size;
764 header.nhash = m_hash_num;
765 header.version = MI_BLOOM_FILTER_VERSION;
766
767 // std::cerr << "Writing header... magic: " << magic << " hlen: "
768 //<<
769 // header.hlen
770 // << " nhash: " << header.nhash << " size: " <<
771 // header.size
772 // << std::endl;
773
774 out.write(reinterpret_cast<char*>(&header), sizeof(struct FileHeader));
775
776 for (const auto& s : m_sseeds) {
777 out.write(s.c_str(), m_kmer_size);
778 }
779 }
780
/*
 * Calculates the optimal number of hash function to use:
 * -ln(fpr) / ln(2), truncated towards zero.
 * Calculation assumes optimal ratio of bytes per entry given a fpr.
 */
inline static unsigned calc_opti_hash_num(double fpr)
{
  const double numerator = -std::log(fpr);
  const double denominator = std::log(2.0);
  return unsigned(numerator / denominator);
}
789
790 /*
791 * Calculate FPR based on hash functions, size and number of entries
792 * see http://en.wikipedia.org/wiki/Bloom_filter
793 */
794 double calc_fpr_num_inserted(size_t num_entr) const
795 {
796 return pow(1.0 - pow(1.0 - 1.0 / double(m_bv.size()),
797 double(num_entr) * double(m_hash_num)),
798 double(m_hash_num));
799 }
800
801 /*
802 * Calculates the optimal FPR to use based on hash functions
803 */
804 double calc_fpr_hash_num(int hash_funct_num) const
805 {
806 const double magic = 2.0;
807 return pow(magic, -hash_funct_num);
808 }
809
810 /*
811 * Returns old value that was inside
812 * Does not overwrite if non-zero value already exists
813 */
814 T set_val(T* val, T new_val)
815 {
816 T old_value;
817 do {
818 old_value = *val;
819 if (old_value != 0) {
820 break;
821 }
822 } while (!__sync_bool_compare_and_swap(val, old_value, new_val));
823 return old_value;
824 }
825
/*
 * Binomial coefficient C(n, k); 0 when k > n.
 * The running product is held in 64 bits: C(n, i-1) * (n - i + 1) can exceed
 * 32 bits even when the final coefficient fits in unsigned (e.g. C(32, 16)).
 * The result is truncated to unsigned if the coefficient itself does not fit.
 */
static inline unsigned n_choose_k(unsigned n, unsigned k)
{
  if (k > n) {
    return 0;
  }
  // use the smaller of k and n - k; written so that it cannot overflow
  if (k > n - k) {
    k = n - k;
  }
  if (k == 0) {
    return 1;
  }
  uint64_t result = n;
  for (unsigned i = 2; i <= k; ++i) {
    // after this step result == C(n, i); the division is always exact
    result *= (n - i + 1);
    result /= i;
  }
  return unsigned(result);
}
844
// number of elements in m_data (set from get_pop() or the file header)
846 size_t m_d_size;
847
848 sdsl::bit_vector_il<BLOCKSIZE> m_bv;
849 T* m_data;
850 sdsl::rank_support_il<1> m_rank_support;
851
852 unsigned m_hash_num;
853 unsigned m_kmer_size;
854
855 using seed_val = std::vector<std::vector<unsigned>>;
856 std::vector<std::string> m_sseeds;
857
858 double m_prob_saturated;
859 seed_val m_ss_val;
860
861 static const uint32_t MI_BLOOM_FILTER_VERSION = 1;
862};
863
864} // namespace btllib
865
866#endif
Definition: mi_bloom_filter.hpp:28
void store(std::string const &filter_file_path) const
Definition: mi_bloom_filter.hpp:247
Definition: bloom_filter.hpp:16
void check_error(bool condition, const std::string &msg)
void log_info(const std::string &msg)