1#ifndef BTLLIB_COUNTING_BLOOM_FILTER_HPP
2#define BTLLIB_COUNTING_BLOOM_FILTER_HPP
4#include "btllib/bloom_filter.hpp"
5#include "btllib/counting_bloom_filter.hpp"
6#include "btllib/nthash.hpp"
7#include "btllib/status.hpp"
/// Signature string for serialized CountingBloomFilter data. Elsewhere in this
/// file it is passed, together with a path, to
/// btllib::BloomFilter::check_file_signature and to the BloomFilterInitializer
/// constructor, and save() copies it into the header string it builds.
23static const char*
const COUNTING_BLOOM_FILTER_SIGNATURE =
24 "[BTLCountingBloomFilter_v5]";
/// Signature string for serialized KmerCountingBloomFilter data. Elsewhere in
/// this file it is passed, together with a path, to
/// btllib::BloomFilter::check_file_signature and to the BloomFilterInitializer
/// constructor, and save() copies it into the header string it builds.
26static const char*
const KMER_COUNTING_BLOOM_FILTER_SIGNATURE =
27 "[BTLKmerCountingBloomFilter_v5]";
30class KmerCountingBloomFilter;
54 std::string hash_fn =
"");
75 void insert(
const uint64_t* hashes);
/// Convenience overload: forwards the vector's underlying buffer to the
/// pointer-based insert(const uint64_t*) overload.
///
/// @param hashes Hash values to insert. The vector's size is not checked
///               here. NOTE(review): presumably it must hold at least
///               hash_num values; confirm against the pointer overload.
82 void insert(
const std::vector<uint64_t>& hashes) {
insert(hashes.data()); }
92 T
contains(
const uint64_t* hashes)
const;
101 T
contains(
const std::vector<uint64_t>& hashes)
const
224 double get_fpr(T threshold = 1)
const;
233 void save(
const std::string& path);
242 return btllib::BloomFilter::check_file_signature(
243 path, COUNTING_BLOOM_FILTER_SIGNATURE);
249 void insert(
const uint64_t* hashes, T min_val);
254 size_t array_size = 0;
255 unsigned hash_num = 0;
257 std::unique_ptr<std::atomic<T>[]> array;
301 void insert(
const char* seq,
size_t seq_len);
/// Convenience overload: forwards the string's character buffer and length to
/// the insert(const char*, size_t) overload.
///
/// @param seq Sequence to insert; passed on as seq.c_str() and seq.size().
308 void insert(
const std::string& seq) {
insert(seq.c_str(), seq.size()); }
/// Forwards precomputed hash values unchanged to the wrapped
/// counting_bloom_filter member's insert().
///
/// @param hashes Pointer to the hash values to insert.
316 void insert(
const uint64_t* hashes) { counting_bloom_filter.insert(hashes); }
323 void insert(
const std::vector<uint64_t>& hashes)
325 counting_bloom_filter.insert(hashes);
336 uint64_t
contains(
const char* seq,
size_t seq_len)
const;
347 return contains(seq.c_str(), seq.size());
360 return counting_bloom_filter.contains(hashes);
370 T
contains(
const std::vector<uint64_t>& hashes)
const
372 return counting_bloom_filter.contains(hashes);
407 return counting_bloom_filter.contains_insert(hashes);
419 return counting_bloom_filter.contains_insert(hashes);
455 return counting_bloom_filter.insert_contains(hashes);
467 return counting_bloom_filter.insert_contains(hashes);
509 return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
526 return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
568 return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
583 return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
/// @return The value reported by the wrapped counting_bloom_filter member's
///         get_bytes().
587 size_t get_bytes()
const {
return counting_bloom_filter.get_bytes(); }
591 return counting_bloom_filter.get_pop_cnt(threshold);
596 return counting_bloom_filter.get_occupancy(threshold);
/// @return The value reported by the wrapped counting_bloom_filter member's
///         get_hash_num().
599 unsigned get_hash_num()
const {
return counting_bloom_filter.get_hash_num(); }
608 return counting_bloom_filter.get_fpr(threshold);
/// @return The k member of this object. When the filter is loaded from a
///         file, k is read from the "k" entry of the file's header table.
611 unsigned get_k()
const {
return k; }
615 return counting_bloom_filter.get_hash_fn();
620 return counting_bloom_filter;
628 void save(
const std::string& path);
638 return btllib::BloomFilter::check_file_signature(
639 path, KMER_COUNTING_BLOOM_FILTER_SIGNATURE);
// Convenience aliases that fix the template argument T to uint8_t, uint16_t,
// or uint32_t for CountingBloomFilter and KmerCountingBloomFilter.
649using CountingBloomFilter8 = CountingBloomFilter<uint8_t>;
650using CountingBloomFilter16 = CountingBloomFilter<uint16_t>;
651using CountingBloomFilter32 = CountingBloomFilter<uint32_t>;
653using KmerCountingBloomFilter8 = KmerCountingBloomFilter<uint8_t>;
654using KmerCountingBloomFilter16 = KmerCountingBloomFilter<uint16_t>;
655using KmerCountingBloomFilter32 = KmerCountingBloomFilter<uint32_t>;
662 size_t(std::ceil(double(bytes) / sizeof(uint64_t)) * sizeof(uint64_t)))
663 , array_size(get_bytes() / sizeof(array[0]))
665 , hash_fn(std::move(hash_fn))
666 , array(new std::atomic<T>[array_size])
668 check_error(bytes == 0,
"CountingBloomFilter: memory budget must be >0!");
670 "CountingBloomFilter: number of hash values must be >0!");
672 hash_num > MAX_HASH_VALUES,
673 "CountingBloomFilter: number of hash values cannot be over 1024!");
674 check_warning(
sizeof(uint8_t) !=
sizeof(std::atomic<uint8_t>),
675 "Atomic primitives take extra memory. CountingBloomFilter will "
677 std::to_string(bytes) +
" for bit array.");
678 std::memset((
void*)array.get(), 0, array_size *
sizeof(array[0]));
689 bool update_done =
false;
690 T new_val, tmp_min_val;
692 new_val = min_val + 1;
693 for (
size_t i = 0; i < hash_num; ++i) {
694 tmp_min_val = min_val;
695 update_done = array[hashes[i] % array_size].compare_exchange_strong(
696 tmp_min_val, new_val);
699 (min_val = contains(hashes)) == std::numeric_limits<T>::max()) {
709 contains_insert(hashes);
716 T min = array[hashes[0] % array_size];
717 for (
size_t i = 1; i < hash_num; ++i) {
718 const size_t idx = hashes[i] % array_size;
719 if (array[idx] < min) {
730 const auto count = contains(hashes);
731 if (count < std::numeric_limits<T>::max()) {
732 insert(hashes, count);
741 const auto count = contains(hashes);
742 if (count < std::numeric_limits<T>::max()) {
743 insert(hashes, count);
746 return std::numeric_limits<T>::max();
754 const auto count = contains(hashes);
755 if (count < threshold) {
756 insert(hashes, count);
767 const auto count = contains(hashes);
768 if (count < threshold) {
769 insert(hashes, count);
778 uint64_t pop_cnt = 0;
782#pragma omp parallel for reduction(+ : pop_cnt)
783 for (
size_t i = 0; i < array_size; ++i) {
784 if (array[i] >= threshold) {
795 return double(get_pop_cnt(threshold)) / double(array_size);
802 return std::pow(get_occupancy(threshold),
double(hash_num));
808 std::make_shared<BloomFilterInitializer>(path,
809 COUNTING_BLOOM_FILTER_SIGNATURE))
814 const std::shared_ptr<BloomFilterInitializer>& bfi)
815 : bytes(*bfi->table->get_as<decltype(bytes)>(
"bytes"))
816 , array_size(bytes / sizeof(array[0]))
817 , hash_num(*(bfi->table->get_as<decltype(hash_num)>(
"hash_num")))
818 , hash_fn(bfi->table->contains(
"hash_fn")
819 ? *(bfi->table->get_as<decltype(hash_fn)>(
"hash_fn"))
821 , array(new std::atomic<T>[array_size])
823 check_warning(
sizeof(uint8_t) !=
sizeof(std::atomic<uint8_t>),
824 "Atomic primitives take extra memory. CountingBloomFilter will "
826 std::to_string(bytes) +
" for bit array.");
827 const auto loaded_counter_bits =
828 *(bfi->table->get_as<
size_t>(
"counter_bits"));
829 check_error(
sizeof(array[0]) * CHAR_BIT != loaded_counter_bits,
830 "CountingBloomFilter" +
831 std::to_string(
sizeof(array[0]) * CHAR_BIT) +
832 " tried to load a file of CountingBloomFilter" +
833 std::to_string(loaded_counter_bits));
834 bfi->ifs.read((
char*)array.get(),
835 std::streamsize(array_size *
sizeof(array[0])));
846 auto root = cpptoml::make_table();
850 auto header = cpptoml::make_table();
851 header->insert(
"bytes", get_bytes());
852 header->insert(
"hash_num", get_hash_num());
853 if (!hash_fn.empty()) {
854 header->insert(
"hash_fn", hash_fn);
856 header->insert(
"counter_bits",
size_t(
sizeof(array[0]) * CHAR_BIT));
857 std::string header_string = COUNTING_BLOOM_FILTER_SIGNATURE;
859 header_string.substr(1, header_string.size() - 2);
860 root->insert(header_string, header);
863 path, *root, (
char*)array.get(), array_size *
sizeof(array[0]));
871 , counting_bloom_filter(bytes, hash_num, HASH_FN)
878 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
879 while (nthash.
roll()) {
880 counting_bloom_filter.insert(nthash.hashes());
889 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
890 while (nthash.
roll()) {
891 sum += counting_bloom_filter.contains(nthash.hashes());
901 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
902 while (nthash.
roll()) {
903 sum += counting_bloom_filter.contains_insert(nthash.hashes());
913 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
914 while (nthash.
roll()) {
915 sum += counting_bloom_filter.insert_contains(nthash.hashes());
927 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
928 while (nthash.
roll()) {
930 counting_bloom_filter.insert_thresh_contains(nthash.hashes(), threshold);
942 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
943 while (nthash.
roll()) {
945 counting_bloom_filter.contains_insert_thresh(nthash.hashes(), threshold);
952 const std::string& path)
954 std::make_shared<BloomFilterInitializer>(
956 KMER_COUNTING_BLOOM_FILTER_SIGNATURE))
961 const std::shared_ptr<BloomFilterInitializer>& bfi)
962 : k(*(bfi->table->get_as<decltype(k)>(
"k")))
963 , counting_bloom_filter(bfi)
965 check_error(counting_bloom_filter.hash_fn != HASH_FN,
966 "KmerCountingBloomFilter: loaded hash function (" +
967 counting_bloom_filter.hash_fn +
968 ") is different from the one used by default (" + HASH_FN +
980 auto root = cpptoml::make_table();
984 auto header = cpptoml::make_table();
985 header->insert(
"bytes", get_bytes());
986 header->insert(
"hash_num", get_hash_num());
987 header->insert(
"hash_fn", get_hash_fn());
988 header->insert(
"counter_bits",
989 size_t(
sizeof(counting_bloom_filter.array[0]) * CHAR_BIT));
990 header->insert(
"k", k);
991 std::string header_string = KMER_COUNTING_BLOOM_FILTER_SIGNATURE;
993 header_string.substr(1, header_string.size() - 2);
994 root->insert(header_string, header);
998 (
char*)counting_bloom_filter.array.get(),
999 counting_bloom_filter.array_size *
1000 sizeof(counting_bloom_filter.array[0]));
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:39
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:175
void insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:82
double get_occupancy(T threshold=1) const
Definition: counting_bloom_filter.hpp:793
const std::string & get_hash_fn() const
Definition: counting_bloom_filter.hpp:226
T contains_insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:728
static bool is_bloom_file(const std::string &path)
Definition: counting_bloom_filter.hpp:240
double get_fpr(T threshold=1) const
Definition: counting_bloom_filter.hpp:800
uint64_t get_pop_cnt(T threshold=1) const
Definition: counting_bloom_filter.hpp:776
T insert_contains(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:146
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:840
T contains_insert_thresh(const uint64_t *hashes, T threshold)
Definition: counting_bloom_filter.hpp:764
T contains(const std::vector< uint64_t > &hashes) const
Definition: counting_bloom_filter.hpp:101
size_t get_bytes() const
Definition: counting_bloom_filter.hpp:210
unsigned get_hash_num() const
Definition: counting_bloom_filter.hpp:217
T contains(const uint64_t *hashes) const
Definition: counting_bloom_filter.hpp:714
CountingBloomFilter()
Definition: counting_bloom_filter.hpp:43
T insert_contains(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:739
void insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:707
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:203
T insert_thresh_contains(const uint64_t *hashes, T threshold)
Definition: counting_bloom_filter.hpp:751
T contains_insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:123
Definition: counting_bloom_filter.hpp:267
uint64_t contains(const char *seq, size_t seq_len) const
Definition: counting_bloom_filter.hpp:886
T insert_contains(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:465
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:523
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:974
CountingBloomFilter< T > & get_counting_bloom_filter()
Definition: counting_bloom_filter.hpp:618
T insert_contains(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:910
T insert_thresh_contains(const std::string &seq, const T threshold)
Definition: counting_bloom_filter.hpp:491
T contains_insert_thresh(const uint64_t *hashes, const T threshold)
Definition: counting_bloom_filter.hpp:566
T insert_thresh_contains(const char *seq, size_t seq_len, T threshold)
Definition: counting_bloom_filter.hpp:922
T contains(const uint64_t *hashes) const
Definition: counting_bloom_filter.hpp:358
T contains_insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:405
T contains_insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:417
T contains_insert(const std::string &seq)
Definition: counting_bloom_filter.hpp:392
size_t get_bytes() const
Definition: counting_bloom_filter.hpp:587
T contains_insert_thresh(const char *seq, size_t seq_len, T threshold)
Definition: counting_bloom_filter.hpp:937
T insert_thresh_contains(const uint64_t *hashes, const T threshold)
Definition: counting_bloom_filter.hpp:507
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:580
T insert_contains(const std::string &seq)
Definition: counting_bloom_filter.hpp:439
const std::string & get_hash_fn() const
Definition: counting_bloom_filter.hpp:613
void insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:323
void insert(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:876
double get_occupancy(T threshold=1) const
Definition: counting_bloom_filter.hpp:594
T contains_insert(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:898
unsigned get_hash_num() const
Definition: counting_bloom_filter.hpp:599
uint64_t get_pop_cnt(T threshold=1) const
Definition: counting_bloom_filter.hpp:589
T insert_contains(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:453
double get_fpr(T threshold=1) const
Definition: counting_bloom_filter.hpp:606
unsigned get_k() const
Definition: counting_bloom_filter.hpp:611
void insert(const std::string &seq)
Definition: counting_bloom_filter.hpp:308
uint64_t contains(const std::string &seq) const
Definition: counting_bloom_filter.hpp:345
void insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:316
T contains_insert_thresh(const std::string &seq, const T threshold)
Definition: counting_bloom_filter.hpp:550
static bool is_bloom_file(const std::string &path)
Definition: counting_bloom_filter.hpp:636
KmerCountingBloomFilter()
Definition: counting_bloom_filter.hpp:271
T contains(const std::vector< uint64_t > &hashes) const
Definition: counting_bloom_filter.hpp:370
Definition: nthash.hpp:54
Definition: bloom_filter.hpp:16
void check_error(bool condition, const std::string &msg)
void check_warning(bool condition, const std::string &msg)