8#ifndef BTLLIB_NTHASH_HPP
9#define BTLLIB_NTHASH_HPP
11#include "btllib/nthash_lowlevel.hpp"
12#include "btllib/status.hpp"
/**
 * Constant C string naming the hash function; its value is "ntHash_v2".
 * NOTE(review): presumably used to label artifacts built with this hash so
 * that incompatible versions can be told apart -- confirm against its users.
 */
22static const char*
const NTHASH_FN_NAME =
"ntHash_v2";
/** Integer type of the hash count; the `hash_num` data members in this header are declared with it. */
26using NTHASH_HASH_NUM_TYPE = uint8_t;
/** Largest value NTHASH_HASH_NUM_TYPE can represent, stored as an int. */
27static const int NTHASH_HASH_NUM_MAX =
28 std::numeric_limits<NTHASH_HASH_NUM_TYPE>::max();
/** Integer type of k; the `k` data members in this header are declared with it. */
30using NTHASH_K_TYPE = uint16_t;
/** Largest value NTHASH_K_TYPE can represent, stored as an int. */
31static const int NTHASH_K_MAX = std::numeric_limits<NTHASH_K_TYPE>::max();
/**
 * Declaration only; the definition is not part of this view.
 *
 * @param seed_strings Seed patterns given as strings.
 * @return A vector of SpacedSeed.
 *
 * NOTE(review): presumably converts each textual spaced-seed pattern into its
 * parsed SpacedSeed form -- confirm against the definition.
 */
36std::vector<SpacedSeed>
37parse_seeds(
const std::vector<std::string>& seed_strings);
40parse_seeds(
const std::vector<std::string>& seed_strings,
41 std::vector<SpacedSeedBlocks>& blocks,
42 std::vector<SpacedSeedMonomers>& monomers);
45parsed_seeds_to_blocks(
const std::vector<SpacedSeed>& seeds,
47 std::vector<SpacedSeedBlocks>& blocks,
48 std::vector<SpacedSeedMonomers>& monomers);
51check_seeds(
const std::vector<std::string>& seeds,
unsigned k);
/**
 * Constructor overload that takes the sequence as a std::string.
 *
 * @param seq Input sequence.
 * @param hash_num NOTE(review): presumably the number of hash values produced
 * per k-mer -- confirm. It is stored in a NTHASH_HASH_NUM_TYPE member.
 * @param k NOTE(review): presumably the k-mer length -- confirm. It is stored
 * in a NTHASH_K_TYPE member.
 * @param pos Defaults to 0. NOTE(review): presumably the position in seq at
 * which hashing starts -- confirm against the definition.
 */
78 NtHash(
const std::string& seq,
unsigned hash_num,
unsigned k,
size_t pos = 0);
/**
 * Declaration only; the definition is not part of this view.
 *
 * @param positions List of unsigned positions.
 * @param new_bases List of bases, one unsigned char each.
 *
 * NOTE(review): presumably computes hashes for the current k-mer as if the
 * bases at `positions` were replaced by the matching entries of `new_bases`
 * -- confirm against the definition, including whether the two vectors must
 * have equal length.
 */
135 void sub(
const std::vector<unsigned>& positions,
136 const std::vector<unsigned char>& new_bases);
/**
 * Returns a read-only pointer to the uint64_t array held by `hashes_array`.
 * The pointer comes from unique_ptr::get(), so ownership stays with this
 * object and the caller must not free it.
 */
138 const uint64_t* hashes()
const {
return hashes_array.get(); }
/**
 * Returns true when `forward_hash` is less than or equal to `reverse_hash`.
 * NOTE(review): presumably signals that the forward strand supplies the
 * canonical hash -- confirm.
 */
145 bool forward()
const {
return forward_hash <= reverse_hash; }
/** Returns the stored `hash_num` member, converted to unsigned. */
146 unsigned get_hash_num()
const {
return hash_num; }
/** Returns the stored `k` member, converted to unsigned. */
147 unsigned get_k()
const {
return k; }
/** Returns the current value of the `forward_hash` member by value. */
149 uint64_t get_forward_hash()
const {
return forward_hash; }
/** Returns the current value of the `reverse_hash` member by value. */
150 uint64_t get_reverse_hash()
const {
return reverse_hash; }
152 void change_seq(
const std::string& new_seq,
size_t new_pos = 0)
154 seq = new_seq.data();
155 seq_len = new_seq.length();
163 friend class SeedNtHash;
/** Hash count; const, so it cannot change after construction. Returned by get_hash_num(). */
170 const NTHASH_HASH_NUM_TYPE hash_num;
/** k; const, so it cannot change after construction. Returned by get_k(). */
171 const NTHASH_K_TYPE k;
/** Owning pointer to a uint64_t array; exposed read-only through hashes(). */
175 std::unique_ptr<uint64_t[]> hashes_array;
/** Starts at 0; compared against reverse_hash by forward(). */
176 uint64_t forward_hash = 0;
/** Starts at 0; compared against forward_hash by forward(). */
177 uint64_t reverse_hash = 0;
/**
 * Declaration only; the definition is not part of this view.
 *
 * @param positions List of unsigned positions.
 * @param new_bases List of bases, one unsigned char each.
 *
 * NOTE(review): presumably computes hashes for the current k-mer as if the
 * bases at `positions` were replaced by the matching entries of `new_bases`
 * -- confirm against the definition, including whether the two vectors must
 * have equal length.
 */
250 void sub(
const std::vector<unsigned>& positions,
251 const std::vector<unsigned char>& new_bases);
/**
 * Returns a read-only pointer to the uint64_t array held by `hashes_array`.
 * The pointer comes from unique_ptr::get(), so ownership stays with this
 * object and the caller must not free it.
 */
253 const uint64_t* hashes()
const {
return hashes_array.get(); }
/**
 * Returns true when `forward_hash` is less than or equal to `reverse_hash`.
 * NOTE(review): presumably signals that the forward strand supplies the
 * canonical hash -- confirm.
 */
260 bool forward()
const {
return forward_hash <= reverse_hash; }
/** Returns the stored `hash_num` member, converted to unsigned. */
261 unsigned get_hash_num()
const {
return hash_num; }
/** Returns the stored `k` member, converted to unsigned. */
262 unsigned get_k()
const {
return k; }
/** Returns the current value of the `forward_hash` member by value. */
264 uint64_t get_forward_hash()
const {
return forward_hash; }
/** Returns the current value of the `reverse_hash` member by value. */
265 uint64_t get_reverse_hash()
const {
return reverse_hash; }
267 void change_seq(
const std::string& new_seq,
size_t new_pos = 0)
269 seq_len = new_seq.length();
270 std::memcpy(seq.get(), new_seq.data(), seq_len);
/**
 * Owned character buffer. change_seq() copies the new sequence's bytes into
 * it with std::memcpy, and the roll code indexes it modulo seq_len.
 * NOTE(review): the visible change_seq() lines show no reallocation -- confirm
 * the buffer is always large enough for the incoming sequence.
 */
281 std::unique_ptr<char[]> seq;
/** Hash count; const, so it cannot change after construction. Returned by get_hash_num(). */
283 const NTHASH_HASH_NUM_TYPE hash_num;
/** k; const, so it cannot change after construction. Returned by get_k(). */
284 const NTHASH_K_TYPE k;
/** Owning pointer to a uint64_t array; exposed read-only through hashes(). */
288 std::unique_ptr<uint64_t[]> hashes_array;
/** Starts at 0; compared against reverse_hash by forward(). */
289 uint64_t forward_hash = 0;
/** Starts at 0; compared against forward_hash by forward(). */
290 uint64_t reverse_hash = 0;
299 const std::vector<SpacedSeed>& seeds,
300 unsigned hash_num_per_seed,
304 const std::vector<SpacedSeed>& seeds,
305 unsigned hash_num_per_seed,
310 const std::vector<std::string>& seeds,
311 unsigned hash_num_per_seed,
315 const std::vector<std::string>& seeds,
316 unsigned hash_num_per_seed,
/**
 * Forwards to `nthash.hashes()` and returns its read-only pointer unchanged;
 * the array is owned by the wrapped `nthash` member, not by the caller.
 */
367 const uint64_t* hashes()
const {
return nthash.hashes(); }
369 void change_seq(
const std::string& seq,
size_t pos = 0)
371 nthash.change_seq(seq, pos);
/** Forwards to `nthash.get_pos()` and returns its result unchanged. */
374 size_t get_pos()
const {
return nthash.
get_pos(); }
/**
 * Forwards to `nthash.forward()` and returns its result unchanged.
 * NOTE(review): the wrapped forward() compares the wrapped object's scalar
 * forward_hash/reverse_hash, while this class keeps its own forward_hash and
 * reverse_hash arrays -- confirm the wrapped scalars are actually updated
 * for spaced seeds, otherwise this result may not be meaningful.
 */
375 bool forward()
const {
return nthash.forward(); }
/**
 * Forwards to `nthash.get_hash_num()`. This is the wrapped object's hash
 * count, which is a separate value from `hash_num_per_seed`.
 */
376 unsigned get_hash_num()
const {
return nthash.get_hash_num(); }
/** Returns the stored `hash_num_per_seed` member. */
377 unsigned get_hash_num_per_seed()
const {
return hash_num_per_seed; }
/** Forwards to `nthash.get_k()` and returns its result unchanged. */
378 unsigned get_k()
const {
return nthash.get_k(); }
/**
 * Returns the raw pointer held by the `forward_hash` array member; ownership
 * stays with this object.
 * Note that the returned pointer is non-const although the method is const,
 * so callers are able to modify the array contents through it.
 */
380 uint64_t* get_forward_hash()
const {
return forward_hash.get(); }
/**
 * Returns the raw pointer held by the `reverse_hash` array member; ownership
 * stays with this object.
 * Note that the returned pointer is non-const although the method is const,
 * so callers are able to modify the array contents through it.
 */
381 uint64_t* get_reverse_hash()
const {
return reverse_hash.get(); }
/** Const, so it cannot change after construction. Returned by get_hash_num_per_seed(). */
387 const unsigned hash_num_per_seed;
/**
 * Per-seed block and monomer descriptions; blocks.size() is used as the
 * element count for the temporary arrays in the peek code.
 * NOTE(review): presumably filled from the seeds by parse_seeds() or
 * parsed_seeds_to_blocks() -- confirm against the constructors.
 */
389 std::vector<SpacedSeedBlocks> blocks;
390 std::vector<SpacedSeedMonomers> monomers;
/** Owned uint64_t arrays handed to ntmsm64()/ntmsm64l() by the init and roll code. */
392 std::unique_ptr<uint64_t[]> fh_no_monomers;
393 std::unique_ptr<uint64_t[]> rh_no_monomers;
/** Owned uint64_t arrays exposed through get_forward_hash() and get_reverse_hash(). */
394 std::unique_ptr<uint64_t[]> forward_hash;
395 std::unique_ptr<uint64_t[]> reverse_hash;
399#define BTLLIB_NTHASH_INIT(CLASS, NTHASH_CALL, MEMBER_PREFIX) \
400 inline bool CLASS::init() \
402 if (MEMBER_PREFIX k > MEMBER_PREFIX seq_len) { \
403 MEMBER_PREFIX pos = std::numeric_limits<std::size_t>::max(); \
408 (MEMBER_PREFIX pos < MEMBER_PREFIX seq_len - MEMBER_PREFIX k + 1) && \
410 MEMBER_PREFIX pos += posN + 1; \
412 if (MEMBER_PREFIX pos > MEMBER_PREFIX seq_len - MEMBER_PREFIX k) { \
413 MEMBER_PREFIX pos = std::numeric_limits<std::size_t>::max(); \
416 MEMBER_PREFIX initialized = true; \
421#define BTLLIB_NTHASH_ROLL(CLASS, FN_DECL, NTHASH_CALL, MEMBER_PREFIX) \
422 inline bool CLASS::FN_DECL \
424 if (!MEMBER_PREFIX initialized) { \
427 if (MEMBER_PREFIX pos >= MEMBER_PREFIX seq_len - MEMBER_PREFIX k) { \
430 if (SEED_TAB[(unsigned char)(MEMBER_PREFIX seq[MEMBER_PREFIX pos + \
431 MEMBER_PREFIX k])] == \
433 MEMBER_PREFIX pos += MEMBER_PREFIX k; \
437 ++ MEMBER_PREFIX pos; \
442#define BTLLIB_NTHASH_ROLL_BACK(CLASS, FN_DECL, NTHASH_CALL, MEMBER_PREFIX) \
443 inline bool CLASS::FN_DECL \
445 if (!MEMBER_PREFIX initialized) { \
448 if (MEMBER_PREFIX pos <= 0) { \
451 if (SEED_TAB[(unsigned char)(MEMBER_PREFIX seq[MEMBER_PREFIX pos - 1])] == \
453 MEMBER_PREFIX pos -= MEMBER_PREFIX k; \
457 -- MEMBER_PREFIX pos; \
462#define BTLLIB_NTHASH_PEEK(CLASS, FN_DECL, NTHASH_CALL, MEMBER_PREFIX) \
463 inline bool CLASS::FN_DECL \
465 if (!MEMBER_PREFIX initialized) { \
472BTLLIB_NTHASH_INIT(NtHash,
479 hashes_array.get()), )
480BTLLIB_NTHASH_ROLL(NtHash,
490BTLLIB_NTHASH_ROLL_BACK(NtHash,
492 ntmc64l(seq[pos + k - 1],
505 uint64_t forward_hash_tmp = forward_hash;
506 uint64_t reverse_hash_tmp = reverse_hash;
520 uint64_t forward_hash_tmp = forward_hash;
521 uint64_t reverse_hash_tmp = reverse_hash;
535 uint64_t forward_hash_tmp = forward_hash;
536 uint64_t reverse_hash_tmp = reverse_hash;
537 ntmc64l(seq[pos + k - 1],
547 peek_back(
char char_in),
550 uint64_t forward_hash_tmp = forward_hash;
551 uint64_t reverse_hash_tmp = reverse_hash;
552 ntmc64l(seq[pos + k - 1],
561BTLLIB_NTHASH_INIT(BlindNtHash,
562 ntmc64(seq.get() + pos,
568 hashes_array.get()), )
573 ntmc64(seq[pos % seq_len],
580 seq[pos % seq_len] = char_in;
582BTLLIB_NTHASH_ROLL_BACK(
584 roll_back(
char char_in),
586 ntmc64l(seq[(pos + k - 1) % seq_len],
593 seq[(pos + k - 1) % seq_len] = char_in;
600 uint64_t forward_hash_tmp = forward_hash;
601 uint64_t reverse_hash_tmp = reverse_hash;
602 ntmc64(seq[pos % seq_len],
612 peek_back(
char char_in),
614 uint64_t forward_hash_tmp = forward_hash;
615 uint64_t reverse_hash_tmp = reverse_hash;
616 ntmc64l(seq[(pos + k - 1) % seq_len],
625BTLLIB_NTHASH_INIT(SeedNtHash,
626 ntmsm64(nthash.seq + nthash.pos,
632 fh_no_monomers.get(),
633 rh_no_monomers.get(),
637 nthash.hashes_array.get()),
639BTLLIB_NTHASH_ROLL(SeedNtHash,
641 ntmsm64(nthash.seq + nthash.pos,
647 fh_no_monomers.get(),
648 rh_no_monomers.get(),
651 nthash.hashes_array.get());
653BTLLIB_NTHASH_ROLL_BACK(SeedNtHash,
655 ntmsm64l(nthash.seq + nthash.pos - 1,
661 fh_no_monomers.get(),
662 rh_no_monomers.get(),
665 nthash.hashes_array.get());
672 std::unique_ptr<uint64_t[]> fh_no_monomers_tmp(
new uint64_t[blocks.size()]);
673 std::unique_ptr<uint64_t[]> rh_no_monomers_tmp(
new uint64_t[blocks.size()]);
674 std::unique_ptr<uint64_t[]> forward_hash_tmp(
new uint64_t[blocks.size()]);
675 std::unique_ptr<uint64_t[]> reverse_hash_tmp(
new uint64_t[blocks.size()]);
676 std::memcpy(fh_no_monomers_tmp.get(),
678 blocks.size() *
sizeof(uint64_t));
679 std::memcpy(rh_no_monomers_tmp.get(),
681 blocks.size() *
sizeof(uint64_t));
682 std::memcpy(forward_hash_tmp.get(),
684 blocks.size() *
sizeof(uint64_t));
685 std::memcpy(reverse_hash_tmp.get(),
687 blocks.size() *
sizeof(uint64_t));
688 ntmsm64(nthash.seq + nthash.pos,
694 fh_no_monomers_tmp.get(),
695 rh_no_monomers_tmp.get(),
696 forward_hash_tmp.get(),
697 reverse_hash_tmp.get(),
698 nthash.hashes_array.get());
706 std::unique_ptr<uint64_t[]> fh_no_monomers_tmp(
new uint64_t[blocks.size()]);
707 std::unique_ptr<uint64_t[]> rh_no_monomers_tmp(
new uint64_t[blocks.size()]);
708 std::unique_ptr<uint64_t[]> forward_hash_tmp(
new uint64_t[blocks.size()]);
709 std::unique_ptr<uint64_t[]> reverse_hash_tmp(
new uint64_t[blocks.size()]);
710 std::memcpy(fh_no_monomers_tmp.get(),
712 blocks.size() *
sizeof(uint64_t));
713 std::memcpy(rh_no_monomers_tmp.get(),
715 blocks.size() *
sizeof(uint64_t));
716 std::memcpy(forward_hash_tmp.get(),
718 blocks.size() *
sizeof(uint64_t));
719 std::memcpy(reverse_hash_tmp.get(),
721 blocks.size() *
sizeof(uint64_t));
722 ntmsm64(nthash.seq + nthash.pos,
729 fh_no_monomers_tmp.get(),
730 rh_no_monomers_tmp.get(),
731 forward_hash_tmp.get(),
732 reverse_hash_tmp.get(),
733 nthash.hashes_array.get());
741 std::unique_ptr<uint64_t[]> fh_no_monomers_tmp(
new uint64_t[blocks.size()]);
742 std::unique_ptr<uint64_t[]> rh_no_monomers_tmp(
new uint64_t[blocks.size()]);
743 std::unique_ptr<uint64_t[]> forward_hash_tmp(
new uint64_t[blocks.size()]);
744 std::unique_ptr<uint64_t[]> reverse_hash_tmp(
new uint64_t[blocks.size()]);
745 std::memcpy(fh_no_monomers_tmp.get(),
747 blocks.size() *
sizeof(uint64_t));
748 std::memcpy(rh_no_monomers_tmp.get(),
750 blocks.size() *
sizeof(uint64_t));
751 std::memcpy(forward_hash_tmp.get(),
753 blocks.size() *
sizeof(uint64_t));
754 std::memcpy(reverse_hash_tmp.get(),
756 blocks.size() *
sizeof(uint64_t));
757 ntmsm64l(nthash.seq + nthash.pos - 1,
763 fh_no_monomers_tmp.get(),
764 rh_no_monomers_tmp.get(),
765 forward_hash_tmp.get(),
766 reverse_hash_tmp.get(),
767 nthash.hashes_array.get());
772 peek_back(
char char_in),
775 std::unique_ptr<uint64_t[]> fh_no_monomers_tmp(
new uint64_t[blocks.size()]);
776 std::unique_ptr<uint64_t[]> rh_no_monomers_tmp(
new uint64_t[blocks.size()]);
777 std::unique_ptr<uint64_t[]> forward_hash_tmp(
new uint64_t[blocks.size()]);
778 std::unique_ptr<uint64_t[]> reverse_hash_tmp(
new uint64_t[blocks.size()]);
779 std::memcpy(fh_no_monomers_tmp.get(),
781 blocks.size() *
sizeof(uint64_t));
782 std::memcpy(rh_no_monomers_tmp.get(),
784 blocks.size() *
sizeof(uint64_t));
785 std::memcpy(forward_hash_tmp.get(),
787 blocks.size() *
sizeof(uint64_t));
788 std::memcpy(reverse_hash_tmp.get(),
790 blocks.size() *
sizeof(uint64_t));
791 ntmsm64l(nthash.seq + nthash.pos - 1,
798 fh_no_monomers_tmp.get(),
799 rh_no_monomers_tmp.get(),
800 forward_hash_tmp.get(),
801 reverse_hash_tmp.get(),
802 nthash.hashes_array.get());
// The BTLLIB_NTHASH_* helper macros are only used to generate the member
// definitions earlier in this header; undefine them here so they do not leak
// into files that include it.
806#undef BTLLIB_NTHASH_INIT
807#undef BTLLIB_NTHASH_ROLL
808#undef BTLLIB_NTHASH_ROLL_BACK
809#undef BTLLIB_NTHASH_PEEK
Definition: nthash.hpp:187
bool roll_back(char char_in)
BlindNtHash(const char *seq, size_t seq_len, unsigned hash_num, unsigned k, size_t pos=0)
size_t get_pos() const
Definition: nthash.hpp:259
BlindNtHash(const std::string &seq, unsigned hash_num, unsigned k, size_t pos=0)
bool peek_back(char char_in)
Definition: nthash.hpp:54
NtHash(const std::string &seq, unsigned hash_num, unsigned k, size_t pos=0)
NtHash(const char *seq, size_t seq_len, unsigned hash_num, unsigned k, size_t pos=0)
bool peek_back(char char_in)
size_t get_pos() const
Definition: nthash.hpp:144
Definition: nthash.hpp:294
bool peek_back(char char_in)
Definition: bloom_filter.hpp:16