btllib
bloom_filter.hpp
1#ifndef BTLLIB_BLOOM_FILTER_HPP
2#define BTLLIB_BLOOM_FILTER_HPP
3
4#include "btllib/nthash.hpp"
5
6#include "cpptoml.h"
7
8#include <atomic>
9#include <climits>
10#include <cstdint>
11#include <fstream>
12#include <memory>
13#include <string>
14#include <vector>
15
16namespace btllib {
17
/// Single-bit masks for one byte: for i in 0..7, BIT_MASKS[i] has only bit i
/// set (0x01 for bit 0 up to 0x80 for bit 7).
18static const uint8_t BIT_MASKS[CHAR_BIT] = {
19 // NOLINT
20 0x01, 0x02, 0x04, 0x08, // NOLINT
21 0x10, 0x20, 0x40, 0x80 // NOLINT
22};
23
/// Signature strings, one per filter type. The is_bloom_file() helpers in this
/// header pass them to BloomFilter::check_file_signature().
24static const char* const BLOOM_FILTER_SIGNATURE = "[BTLBloomFilter_v6]";
25static const char* const KMER_BLOOM_FILTER_SIGNATURE =
26 "[BTLKmerBloomFilter_v6]";
27static const char* const SEED_BLOOM_FILTER_SIGNATURE =
28 "[BTLSeedBloomFilter_v6]";
/// Hash function name, aliased from NTHASH_FN_NAME.
/// NOTE(review): NTHASH_FN_NAME is presumably provided by btllib/nthash.hpp -
/// confirm.
29static const char* const HASH_FN = NTHASH_FN_NAME;
30
// NOTE(review): neither constant below is referenced in this header. The names
// suggest an upper bound on hash values per element and a count of newline
// characters reserved in the saved file header - confirm against the
// implementation before relying on that.
31static const unsigned MAX_HASH_VALUES = 1024;
32static const unsigned PLACEHOLDER_NEWLINES = 50;
33
/// Helper that bundles what is needed to load a filter from a file: the file
/// path, an input stream opened on that path, and the TOML table returned by
/// parse_header(). The type is move-only (copying is deleted, moving is
/// defaulted).
35class BloomFilterInitializer
36{
37
38public:
 /// Opens `path` and parses its header.
 ///
 /// @param path Path of the file to open; stored in `path` and used to open
 /// `ifs`.
 /// @param signature Forwarded to parse_header().
 ///
 /// Members are initialized in declaration order (path, ifs, table), so `ifs`
 /// is already constructed when parse_header() runs to produce `table`.
 /// NOTE(review): parse_header() presumably reads from `ifs`, which would make
 /// this ordering mandatory - confirm before reordering the members.
39 BloomFilterInitializer(const std::string& path, const std::string& signature)
40 : path(path)
41 , ifs(path)
42 , table(parse_header(signature))
43 {}
44
 /// Signature check on an already opened stream (declared here, defined
 /// elsewhere).
 ///
 /// @param ifs Stream to read from.
 /// @param expected_signature Signature the caller expects.
 /// @param file_signature Output parameter.
 /// NOTE(review): presumably receives the signature actually found in the
 /// file, with the return value telling whether it matched - confirm.
45 static bool check_file_signature(std::ifstream& ifs,
46 const std::string& expected_signature,
47 std::string& file_signature);
48
 // Public data members. Their declaration order is also their initialization
 // order in the constructor above.
49 std::string path;
50 std::ifstream ifs;
51 std::shared_ptr<cpptoml::table> table;
52
 // Move-only: the class owns a std::ifstream, which cannot be copied.
53 BloomFilterInitializer(const BloomFilterInitializer&) = delete;
54 BloomFilterInitializer(BloomFilterInitializer&&) = default;
55
56 BloomFilterInitializer& operator=(const BloomFilterInitializer&) = delete;
57 BloomFilterInitializer& operator=(BloomFilterInitializer&&) = default;
58
59private:
 /// Produces the TOML table stored in `table`; called once from the
 /// constructor's initializer list with the caller's `signature`.
62 std::shared_ptr<cpptoml::table> parse_header(const std::string& signature);
63};
65
67{
 // Bloom filter that works on precomputed 64-bit hash values and keeps its
 // state in an array of std::atomic<uint8_t> (see `array`). Instances can be
 // neither copied nor moved.
68
69public:
72
 /// Constructs a filter from explicit parameters.
 ///
 /// @param bytes Filter size in bytes.
 /// @param hash_num Number of hash values.
 /// NOTE(review): presumably the number of hash values per element - confirm.
 /// @param hash_fn Optional hash function name; defaults to an empty string.
80 BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn = "");
81
 /// Constructs a filter from the file at `path`.
 /// NOTE(review): presumably a file previously written by save() - confirm.
87 explicit BloomFilter(const std::string& path);
88
 // Copying and moving are both disabled.
89 BloomFilter(const BloomFilter&) = delete;
90 BloomFilter(BloomFilter&&) = delete;
91
92 BloomFilter& operator=(const BloomFilter&) = delete;
93 BloomFilter& operator=(BloomFilter&&) = delete;
94
 /// Inserts an element given a pointer to its hash values.
 ///
 /// @param hashes Pointer to the hash values. No element count is passed.
 /// NOTE(review): presumably get_hash_num() values are read - confirm.
101 void insert(const uint64_t* hashes);
102
 /// Vector convenience overload: forwards hashes.data() to the pointer
 /// overload. The vector's size is not forwarded.
108 void insert(const std::vector<uint64_t>& hashes) { insert(hashes.data()); }
109
 /// Queries the filter for an element given a pointer to its hash values.
 ///
 /// @param hashes Pointer to the hash values (no count is passed; see insert()).
 /// @return Result of the membership query.
118 bool contains(const uint64_t* hashes) const;
119
 /// Vector convenience overload: forwards hashes.data() to the pointer
 /// overload.
127 bool contains(const std::vector<uint64_t>& hashes) const
128 {
129 return contains(hashes.data());
130 }
131
 /// Combined query and insert.
 /// NOTE(review): presumably returns whether the element was already present
 /// before this call inserted it - confirm.
 ///
 /// @param hashes Pointer to the hash values (no count is passed; see insert()).
140 bool contains_insert(const uint64_t* hashes);
141
 /// Vector convenience overload: forwards hashes.data() to the pointer
 /// overload.
149 bool contains_insert(const std::vector<uint64_t>& hashes)
150 {
151 return contains_insert(hashes.data());
152 }
153
 /// Returns the stored `bytes` value.
155 size_t get_bytes() const { return bytes; }
 /// NOTE(review): presumably the number of set bits in the filter - confirm.
157 uint64_t get_pop_cnt() const;
 /// NOTE(review): presumably the fraction of set bits - confirm.
159 double get_occupancy() const;
 /// Returns the stored `hash_num` value.
161 unsigned get_hash_num() const { return hash_num; }
 /// NOTE(review): presumably an estimate of the false positive rate - confirm.
163 double get_fpr() const;
 /// Returns a reference to the stored hash function name.
165 const std::string& get_hash_fn() const { return hash_fn; }
166
 /// Saves this filter to the file at `path`.
172 void save(const std::string& path);
173
 /// Static save helper taking a TOML table and a raw byte buffer.
 ///
 /// @param path Output file path.
 /// @param table TOML table.
 /// NOTE(review): presumably written as the file header - confirm.
 /// @param data Pointer to the bytes to write.
 /// @param n Size associated with `data`.
 /// NOTE(review): presumably the byte count of `data` - confirm.
174 static void save(const std::string& path,
175 const cpptoml::table& table,
176 const char* data,
177 size_t n);
178
 /// Returns check_file_signature(path, BLOOM_FILTER_SIGNATURE).
184 static bool is_bloom_file(const std::string& path)
185 {
186 return check_file_signature(path, BLOOM_FILTER_SIGNATURE);
187 }
188
 /// Signature check for the file at `path` against `signature` (declared here,
 /// defined elsewhere).
189 static bool check_file_signature(const std::string& path,
190 const std::string& signature);
191
192private:
 /// Private constructor taking an already prepared BloomFilterInitializer.
193 BloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
194
 // The wrapper classes are friends, so they can reach the private members
 // above and below.
195 friend class KmerBloomFilter;
196 friend class SeedBloomFilter;
197
198 size_t bytes = 0;
199 size_t array_size =
200 0; // Should be equal to bytes, but not guaranteed by standard
 // NOTE(review): presumably array_size * CHAR_BIT - confirm.
201 size_t array_bits = 0;
202 unsigned hash_num = 0;
203 std::string hash_fn;
 // Owning pointer to the filter's storage, one atomic byte per element.
204 std::unique_ptr<std::atomic<uint8_t>[]> array;
205};
206
211{
 // Wrapper that pairs a BloomFilter (`bloom_filter`) with a k-mer size `k`.
 // All hash-based overloads and most getters below forward directly to the
 // wrapped BloomFilter. The sequence-based overloads are only declared here.
212
213public:
216
 /// Constructs a filter from explicit parameters.
 ///
 /// @param bytes Filter size in bytes.
 /// @param hash_num Number of hash values.
 /// NOTE(review): presumably the number of hash values per k-mer - confirm.
 /// @param k K-mer size.
224 KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k);
225
 /// Constructs a filter from the file at `path`.
231 explicit KmerBloomFilter(const std::string& path);
232
233 KmerBloomFilter(const KmerBloomFilter&) = delete;
235
236 KmerBloomFilter& operator=(const KmerBloomFilter&) = delete;
237 KmerBloomFilter& operator=(KmerBloomFilter&&) = delete;
238
 /// Inserts from a character sequence.
 /// NOTE(review): presumably inserts every k-mer of the sequence - confirm.
 ///
 /// @param seq Pointer to the sequence characters.
 /// @param seq_len Length of `seq`.
245 void insert(const char* seq, size_t seq_len);
246
 /// String convenience overload: forwards seq.c_str() and seq.size().
252 void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
253
 /// Forwards to BloomFilter::insert(const uint64_t*).
260 void insert(const uint64_t* hashes) { bloom_filter.insert(hashes); }
261
 /// Forwards to BloomFilter::insert(const std::vector<uint64_t>&).
267 void insert(const std::vector<uint64_t>& hashes)
268 {
269 bloom_filter.insert(hashes);
270 }
271
 /// Queries the filter with a character sequence.
 ///
 /// @param seq Pointer to the sequence characters.
 /// @param seq_len Length of `seq`.
 /// @return An unsigned count.
 /// NOTE(review): presumably the number of k-mers of `seq` found in the
 /// filter - confirm.
280 unsigned contains(const char* seq, size_t seq_len) const;
281
 /// String convenience overload: forwards seq.c_str() and seq.size().
289 unsigned contains(const std::string& seq) const
290 {
291 return contains(seq.c_str(), seq.size());
292 }
293
 /// Forwards to BloomFilter::contains(const uint64_t*).
300 bool contains(const uint64_t* hashes) const
301 {
302 return bloom_filter.contains(hashes);
303 }
304
 /// Forwards to BloomFilter::contains(const std::vector<uint64_t>&).
310 bool contains(const std::vector<uint64_t>& hashes) const
311 {
312 return bloom_filter.contains(hashes);
313 }
314
 /// Combined query and insert on a character sequence.
 ///
 /// @param seq Pointer to the sequence characters.
 /// @param seq_len Length of `seq`.
 /// @return An unsigned count.
 /// NOTE(review): presumably the number of k-mers that were already present
 /// before insertion - confirm.
323 unsigned contains_insert(const char* seq, size_t seq_len);
324
 /// String convenience overload: forwards seq.c_str() and seq.size().
332 unsigned contains_insert(const std::string& seq)
333 {
334 return contains_insert(seq.c_str(), seq.size());
335 }
336
 /// Forwards to BloomFilter::contains_insert(const uint64_t*).
345 bool contains_insert(const uint64_t* hashes)
346 {
347 return bloom_filter.contains_insert(hashes);
348 }
349
 /// Forwards to BloomFilter::contains_insert(const std::vector<uint64_t>&).
357 bool contains_insert(const std::vector<uint64_t>& hashes)
358 {
359 return bloom_filter.contains_insert(hashes);
360 }
361
 // Getters. All but get_k() and get_bloom_filter() delegate to the
 // same-named getter of the wrapped BloomFilter.
363 size_t get_bytes() const { return bloom_filter.get_bytes(); }
365 uint64_t get_pop_cnt() const { return bloom_filter.get_pop_cnt(); }
367 double get_occupancy() const { return bloom_filter.get_occupancy(); }
369 unsigned get_hash_num() const { return bloom_filter.get_hash_num(); }
371 double get_fpr() const { return bloom_filter.get_fpr(); }
 /// Returns the stored k-mer size.
373 unsigned get_k() const { return k; }
375 const std::string& get_hash_fn() const { return bloom_filter.get_hash_fn(); }
 /// Returns a mutable reference to the wrapped BloomFilter. There is no const
 /// overload.
377 BloomFilter& get_bloom_filter() { return bloom_filter; }
378
 /// Saves this filter to the file at `path`.
384 void save(const std::string& path);
385
 /// Returns BloomFilter::check_file_signature(path,
 /// KMER_BLOOM_FILTER_SIGNATURE).
391 static bool is_bloom_file(const std::string& path)
392 {
393 return btllib::BloomFilter::check_file_signature(
394 path, KMER_BLOOM_FILTER_SIGNATURE);
395 }
396
397private:
 /// Private constructor taking an already prepared BloomFilterInitializer.
398 KmerBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
399
400 friend class SeedBloomFilter;
401
402 unsigned k = 0;
403 BloomFilter bloom_filter;
404};
405
410{
 // Wrapper that pairs a KmerBloomFilter (`kmer_bloom_filter`) with a list of
 // seed strings (`seeds`) and their parsed form (`parsed_seeds`). Hash-based
 // overloads and most getters forward to the wrapped KmerBloomFilter. The
 // sequence-based overloads are only declared here.
411
412public:
415
 /// Constructs a filter from explicit parameters.
 ///
 /// @param bytes Filter size in bytes.
 /// @param k K-mer size.
 /// @param seeds Seed strings.
 /// NOTE(review): presumably spaced-seed patterns of length k - confirm.
 /// @param hash_num_per_seed Number of hash values per seed.
424 SeedBloomFilter(size_t bytes,
425 unsigned k,
426 const std::vector<std::string>& seeds,
427 unsigned hash_num_per_seed);
428
 /// Constructs a filter from the file at `path`.
434 explicit SeedBloomFilter(const std::string& path);
435
436 SeedBloomFilter(const SeedBloomFilter&) = delete;
438
439 SeedBloomFilter& operator=(const SeedBloomFilter&) = delete;
440 SeedBloomFilter& operator=(SeedBloomFilter&&) = delete;
441
 /// Inserts from a character sequence.
 ///
 /// @param seq Pointer to the sequence characters.
 /// @param seq_len Length of `seq`.
448 void insert(const char* seq, size_t seq_len);
449
 /// String convenience overload: forwards seq.c_str() and seq.size().
455 void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
456
 /// Forwards to KmerBloomFilter::insert(const uint64_t*).
463 void insert(const uint64_t* hashes) { kmer_bloom_filter.insert(hashes); }
464
 /// Forwards to KmerBloomFilter::insert(const std::vector<uint64_t>&).
470 void insert(const std::vector<uint64_t>& hashes)
471 {
472 kmer_bloom_filter.insert(hashes);
473 }
474
 /// Queries the filter with a character sequence.
 ///
 /// @param seq Pointer to the sequence characters.
 /// @param seq_len Length of `seq`.
 /// @return A vector of vectors of unsigned values.
 /// NOTE(review): presumably one inner vector per k-mer position, listing the
 /// indices of the seeds that hit - confirm.
485 std::vector<std::vector<unsigned>> contains(const char* seq,
486 size_t seq_len) const;
487
 /// String convenience overload: forwards seq.c_str() and seq.size().
497 std::vector<std::vector<unsigned>> contains(const std::string& seq) const
498 {
499 return contains(seq.c_str(), seq.size());
500 }
501
 /// Forwards to KmerBloomFilter::contains(const uint64_t*).
509 bool contains(const uint64_t* hashes) const
510 {
511 return kmer_bloom_filter.contains(hashes);
512 }
513
 /// Forwards to KmerBloomFilter::contains(const std::vector<uint64_t>&).
520 bool contains(const std::vector<uint64_t>& hashes) const
521 {
522 return kmer_bloom_filter.contains(hashes);
523 }
524
 /// Combined query and insert on a character sequence. The return type is the
 /// same as the sequence-based contains().
 ///
 /// @param seq Pointer to the sequence characters.
 /// @param seq_len Length of `seq`.
536 std::vector<std::vector<unsigned>> contains_insert(const char* seq,
537 size_t seq_len);
538
 /// String convenience overload: forwards seq.c_str() and seq.size().
549 std::vector<std::vector<unsigned>> contains_insert(const std::string& seq)
550 {
551 return contains_insert(seq.c_str(), seq.size());
552 }
553
 /// Forwards to KmerBloomFilter::contains_insert(const uint64_t*).
563 bool contains_insert(const uint64_t* hashes)
564 {
565 return kmer_bloom_filter.contains_insert(hashes);
566 }
567
 /// Forwards to KmerBloomFilter::contains_insert(const
 /// std::vector<uint64_t>&).
576 bool contains_insert(const std::vector<uint64_t>& hashes)
577 {
578 return kmer_bloom_filter.contains_insert(hashes);
579 }
580
 // Getters delegating to the wrapped KmerBloomFilter.
582 size_t get_bytes() const { return kmer_bloom_filter.get_bytes(); }
584 uint64_t get_pop_cnt() const { return kmer_bloom_filter.get_pop_cnt(); }
586 double get_occupancy() const { return kmer_bloom_filter.get_occupancy(); }
 /// Returns get_hash_num_per_seed() multiplied by the number of seeds.
 /// NOTE(review): the product is computed as size_t (seeds.size()) and is
 /// implicitly narrowed to unsigned on return.
589 unsigned get_total_hash_num() const
590 {
591 return get_hash_num_per_seed() * get_seeds().size();
592 }
 /// Declared here, defined elsewhere. Unlike the getters above, this one is
 /// not a plain forward to the wrapped filter's get_fpr().
595 double get_fpr() const;
597 unsigned get_k() const { return kmer_bloom_filter.get_k(); }
 /// Returns the seed strings as stored.
599 const std::vector<std::string>& get_seeds() const { return seeds; }
 /// Returns the parsed seeds as stored.
602 const std::vector<SpacedSeed>& get_parsed_seeds() const
603 {
604 return parsed_seeds;
605 }
 /// Returns the wrapped filter's get_hash_num(), i.e. the hash number held by
 /// the underlying filter is the per-seed value.
607 unsigned get_hash_num_per_seed() const
608 {
609 return kmer_bloom_filter.get_hash_num();
610 }
 /// Same value as get_hash_num_per_seed().
612 unsigned get_hash_num() const { return get_hash_num_per_seed(); }
614 const std::string& get_hash_fn() const
615 {
616 return kmer_bloom_filter.get_hash_fn();
617 }
 /// Returns a mutable reference to the wrapped KmerBloomFilter. There is no
 /// const overload.
619 KmerBloomFilter& get_kmer_bloom_filter() { return kmer_bloom_filter; }
620
 /// Saves this filter to the file at `path`.
626 void save(const std::string& path);
627
 /// Returns BloomFilter::check_file_signature(path,
 /// SEED_BLOOM_FILTER_SIGNATURE).
633 static bool is_bloom_file(const std::string& path)
634 {
635 return btllib::BloomFilter::check_file_signature(
636 path, SEED_BLOOM_FILTER_SIGNATURE);
637 }
638
639private:
 /// Private constructor taking an already prepared BloomFilterInitializer.
640 SeedBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
641
642 std::vector<std::string> seeds;
 // NOTE(review): SpacedSeed is not declared in this header; presumably it
 // comes from btllib/nthash.hpp - confirm.
643 std::vector<SpacedSeed> parsed_seeds;
644 KmerBloomFilter kmer_bloom_filter;
645};
646
647} // namespace btllib
648
649#endif
Definition: bloom_filter.hpp:67
bool contains(const uint64_t *hashes) const
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:127
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:108
void insert(const uint64_t *hashes)
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:184
double get_fpr() const
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:165
unsigned get_hash_num() const
Definition: bloom_filter.hpp:161
BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn="")
void save(const std::string &path)
size_t get_bytes() const
Definition: bloom_filter.hpp:155
double get_occupancy() const
bool contains_insert(const uint64_t *hashes)
BloomFilter()
Definition: bloom_filter.hpp:71
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:149
uint64_t get_pop_cnt() const
BloomFilter(const std::string &path)
Definition: bloom_filter.hpp:211
void insert(const char *seq, size_t seq_len)
unsigned contains_insert(const char *seq, size_t seq_len)
double get_fpr() const
Definition: bloom_filter.hpp:371
BloomFilter & get_bloom_filter()
Definition: bloom_filter.hpp:377
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:267
void insert(const std::string &seq)
Definition: bloom_filter.hpp:252
unsigned get_hash_num() const
Definition: bloom_filter.hpp:369
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:391
unsigned contains(const char *seq, size_t seq_len) const
unsigned contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:332
KmerBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:365
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:345
KmerBloomFilter()
Definition: bloom_filter.hpp:215
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:300
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:260
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:375
void save(const std::string &path)
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:357
size_t get_bytes() const
Definition: bloom_filter.hpp:363
double get_occupancy() const
Definition: bloom_filter.hpp:367
unsigned contains(const std::string &seq) const
Definition: bloom_filter.hpp:289
unsigned get_k() const
Definition: bloom_filter.hpp:373
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:310
KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k)
Definition: bloom_filter.hpp:410
unsigned get_total_hash_num() const
Definition: bloom_filter.hpp:589
double get_occupancy() const
Definition: bloom_filter.hpp:586
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:509
std::vector< std::vector< unsigned > > contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:549
void insert(const char *seq, size_t seq_len)
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:520
std::vector< std::vector< unsigned > > contains_insert(const char *seq, size_t seq_len)
void save(const std::string &path)
SeedBloomFilter(size_t bytes, unsigned k, const std::vector< std::string > &seeds, unsigned hash_num_per_seed)
const std::vector< SpacedSeed > & get_parsed_seeds() const
Definition: bloom_filter.hpp:602
KmerBloomFilter & get_kmer_bloom_filter()
Definition: bloom_filter.hpp:619
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:470
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:576
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:633
unsigned get_hash_num_per_seed() const
Definition: bloom_filter.hpp:607
SeedBloomFilter(const std::string &path)
double get_fpr() const
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:584
size_t get_bytes() const
Definition: bloom_filter.hpp:582
unsigned get_k() const
Definition: bloom_filter.hpp:597
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:463
const std::vector< std::string > & get_seeds() const
Definition: bloom_filter.hpp:599
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:614
std::vector< std::vector< unsigned > > contains(const char *seq, size_t seq_len) const
SeedBloomFilter()
Definition: bloom_filter.hpp:414
unsigned get_hash_num() const
Definition: bloom_filter.hpp:612
std::vector< std::vector< unsigned > > contains(const std::string &seq) const
Definition: bloom_filter.hpp:497
void insert(const std::string &seq)
Definition: bloom_filter.hpp:455
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:563
Definition: bloom_filter.hpp:16