1#ifndef BTLLIB_BLOOM_FILTER_HPP
2#define BTLLIB_BLOOM_FILTER_HPP
4#include "btllib/nthash.hpp"
18static const uint8_t BIT_MASKS[CHAR_BIT] = {
20 0x01, 0x02, 0x04, 0x08,
21 0x10, 0x20, 0x40, 0x80
24static const char*
const BLOOM_FILTER_SIGNATURE =
"[BTLBloomFilter_v6]";
25static const char*
const KMER_BLOOM_FILTER_SIGNATURE =
26 "[BTLKmerBloomFilter_v6]";
27static const char*
const SEED_BLOOM_FILTER_SIGNATURE =
28 "[BTLSeedBloomFilter_v6]";
29static const char*
const HASH_FN = NTHASH_FN_NAME;
31static const unsigned MAX_HASH_VALUES = 1024;
32static const unsigned PLACEHOLDER_NEWLINES = 50;
35class BloomFilterInitializer
39 BloomFilterInitializer(
const std::string& path,
const std::string& signature)
42 , table(parse_header(signature))
45 static bool check_file_signature(std::ifstream& ifs,
46 const std::string& expected_signature,
47 std::string& file_signature);
51 std::shared_ptr<cpptoml::table> table;
53 BloomFilterInitializer(
const BloomFilterInitializer&) =
delete;
54 BloomFilterInitializer(BloomFilterInitializer&&) =
default;
56 BloomFilterInitializer& operator=(
const BloomFilterInitializer&) =
delete;
57 BloomFilterInitializer& operator=(BloomFilterInitializer&&) =
default;
62 std::shared_ptr<cpptoml::table> parse_header(
const std::string& signature);
80 BloomFilter(
size_t bytes,
unsigned hash_num, std::string hash_fn =
"");
108 void insert(
const std::vector<uint64_t>& hashes) {
insert(hashes.data()); }
127 bool contains(
const std::vector<uint64_t>& hashes)
const
172 void save(
const std::string& path);
174 static void save(
const std::string& path,
175 const cpptoml::table& table,
186 return check_file_signature(path, BLOOM_FILTER_SIGNATURE);
189 static bool check_file_signature(
const std::string& path,
190 const std::string& signature);
193 BloomFilter(
const std::shared_ptr<BloomFilterInitializer>& bfi);
201 size_t array_bits = 0;
202 unsigned hash_num = 0;
204 std::unique_ptr<std::atomic<uint8_t>[]> array;
245 void insert(
const char* seq,
size_t seq_len);
252 void insert(
const std::string& seq) {
insert(seq.c_str(), seq.size()); }
267 void insert(
const std::vector<uint64_t>& hashes)
269 bloom_filter.
insert(hashes);
280 unsigned contains(
const char* seq,
size_t seq_len)
const;
291 return contains(seq.c_str(), seq.size());
302 return bloom_filter.
contains(hashes);
310 bool contains(
const std::vector<uint64_t>& hashes)
const
312 return bloom_filter.
contains(hashes);
373 unsigned get_k()
const {
return k; }
384 void save(
const std::string& path);
393 return btllib::BloomFilter::check_file_signature(
394 path, KMER_BLOOM_FILTER_SIGNATURE);
426 const std::vector<std::string>& seeds,
427 unsigned hash_num_per_seed);
448 void insert(
const char* seq,
size_t seq_len);
455 void insert(
const std::string& seq) {
insert(seq.c_str(), seq.size()); }
463 void insert(
const uint64_t* hashes) { kmer_bloom_filter.
insert(hashes); }
470 void insert(
const std::vector<uint64_t>& hashes)
472 kmer_bloom_filter.
insert(hashes);
485 std::vector<std::vector<unsigned>>
contains(
const char* seq,
486 size_t seq_len)
const;
497 std::vector<std::vector<unsigned>>
contains(
const std::string& seq)
const
499 return contains(seq.c_str(), seq.size());
511 return kmer_bloom_filter.
contains(hashes);
520 bool contains(
const std::vector<uint64_t>& hashes)
const
522 return kmer_bloom_filter.
contains(hashes);
599 const std::vector<std::string>&
get_seeds()
const {
return seeds; }
626 void save(
const std::string& path);
635 return btllib::BloomFilter::check_file_signature(
636 path, SEED_BLOOM_FILTER_SIGNATURE);
642 std::vector<std::string> seeds;
643 std::vector<SpacedSeed> parsed_seeds;
Definition: bloom_filter.hpp:67
bool contains(const uint64_t *hashes) const
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:127
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:108
void insert(const uint64_t *hashes)
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:184
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:165
unsigned get_hash_num() const
Definition: bloom_filter.hpp:161
BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn="")
void save(const std::string &path)
size_t get_bytes() const
Definition: bloom_filter.hpp:155
double get_occupancy() const
bool contains_insert(const uint64_t *hashes)
BloomFilter()
Definition: bloom_filter.hpp:71
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:149
uint64_t get_pop_cnt() const
BloomFilter(const std::string &path)
Definition: bloom_filter.hpp:211
void insert(const char *seq, size_t seq_len)
unsigned contains_insert(const char *seq, size_t seq_len)
double get_fpr() const
Definition: bloom_filter.hpp:371
BloomFilter & get_bloom_filter()
Definition: bloom_filter.hpp:377
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:267
void insert(const std::string &seq)
Definition: bloom_filter.hpp:252
unsigned get_hash_num() const
Definition: bloom_filter.hpp:369
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:391
unsigned contains(const char *seq, size_t seq_len) const
unsigned contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:332
KmerBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:365
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:345
KmerBloomFilter()
Definition: bloom_filter.hpp:215
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:300
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:260
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:375
void save(const std::string &path)
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:357
size_t get_bytes() const
Definition: bloom_filter.hpp:363
double get_occupancy() const
Definition: bloom_filter.hpp:367
unsigned contains(const std::string &seq) const
Definition: bloom_filter.hpp:289
unsigned get_k() const
Definition: bloom_filter.hpp:373
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:310
KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k)
Definition: bloom_filter.hpp:410
unsigned get_total_hash_num() const
Definition: bloom_filter.hpp:589
double get_occupancy() const
Definition: bloom_filter.hpp:586
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:509
std::vector< std::vector< unsigned > > contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:549
void insert(const char *seq, size_t seq_len)
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:520
std::vector< std::vector< unsigned > > contains_insert(const char *seq, size_t seq_len)
void save(const std::string &path)
SeedBloomFilter(size_t bytes, unsigned k, const std::vector< std::string > &seeds, unsigned hash_num_per_seed)
const std::vector< SpacedSeed > & get_parsed_seeds() const
Definition: bloom_filter.hpp:602
KmerBloomFilter & get_kmer_bloom_filter()
Definition: bloom_filter.hpp:619
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:470
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:576
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:633
unsigned get_hash_num_per_seed() const
Definition: bloom_filter.hpp:607
SeedBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:584
size_t get_bytes() const
Definition: bloom_filter.hpp:582
unsigned get_k() const
Definition: bloom_filter.hpp:597
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:463
const std::vector< std::string > & get_seeds() const
Definition: bloom_filter.hpp:599
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:614
std::vector< std::vector< unsigned > > contains(const char *seq, size_t seq_len) const
SeedBloomFilter()
Definition: bloom_filter.hpp:414
unsigned get_hash_num() const
Definition: bloom_filter.hpp:612
std::vector< std::vector< unsigned > > contains(const std::string &seq) const
Definition: bloom_filter.hpp:497
void insert(const std::string &seq)
Definition: bloom_filter.hpp:455
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:563
Definition: bloom_filter.hpp:16