btllib
nthash.hpp
/*
 * nthash.hpp
 * Author: Hamid Mohamadi
 * Genome Sciences Centre,
 * British Columbia Cancer Agency
 */
7
8#ifndef BTLLIB_NTHASH_HPP
9#define BTLLIB_NTHASH_HPP
10
#include "btllib/nthash_lowlevel.hpp"
#include "btllib/status.hpp"

#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <vector>
19
20namespace btllib {
21
// Identifier string for the hash function implemented by this header.
static const char* const NTHASH_FN_NAME = "ntHash_v2";

// This lets us minimize NtHash object size. Good for performance if it's copied
// in, e.g., DBG traversal
using NTHASH_HASH_NUM_TYPE = uint8_t;
// Largest number of hashes per k-mer representable in NTHASH_HASH_NUM_TYPE.
static const int NTHASH_HASH_NUM_MAX =
  std::numeric_limits<NTHASH_HASH_NUM_TYPE>::max();

// Storage type for the k-mer size, kept narrow for the same reason as above.
using NTHASH_K_TYPE = uint16_t;
// Largest k-mer size representable in NTHASH_K_TYPE.
static const int NTHASH_K_MAX = std::numeric_limits<NTHASH_K_TYPE>::max();
32
33class NtHash;
34class SeedNtHash;
35
36std::vector<SpacedSeed>
37parse_seeds(const std::vector<std::string>& seed_strings);
38
39void
40parse_seeds(const std::vector<std::string>& seed_strings,
41 std::vector<SpacedSeedBlocks>& blocks,
42 std::vector<SpacedSeedMonomers>& monomers);
43
44void
45parsed_seeds_to_blocks(const std::vector<SpacedSeed>& seeds,
46 unsigned k,
47 std::vector<SpacedSeedBlocks>& blocks,
48 std::vector<SpacedSeedMonomers>& monomers);
49
50void
51check_seeds(const std::vector<std::string>& seeds, unsigned k);
52
53class NtHash
54{
55
56public:
65 NtHash(const char* seq,
66 size_t seq_len,
67 unsigned hash_num,
68 unsigned k,
69 size_t pos = 0);
70
78 NtHash(const std::string& seq, unsigned hash_num, unsigned k, size_t pos = 0);
79
80 NtHash(const NtHash& nthash);
81 NtHash(NtHash&&) = default;
82
96 bool roll();
97
103 bool roll_back();
104
112 bool peek();
113
119 bool peek_back();
120
126 bool peek(char char_in);
127
133 bool peek_back(char char_in);
134
135 void sub(const std::vector<unsigned>& positions,
136 const std::vector<unsigned char>& new_bases);
137
138 const uint64_t* hashes() const { return hashes_array.get(); }
139
144 size_t get_pos() const { return pos; }
145 bool forward() const { return forward_hash <= reverse_hash; }
146 unsigned get_hash_num() const { return hash_num; }
147 unsigned get_k() const { return k; }
148
149 uint64_t get_forward_hash() const { return forward_hash; }
150 uint64_t get_reverse_hash() const { return reverse_hash; }
151
152 void change_seq(const std::string& new_seq, size_t new_pos = 0)
153 {
154 seq = new_seq.data();
155 seq_len = new_seq.length();
156 pos = new_pos;
157 initialized = false;
158 forward_hash = 0;
159 reverse_hash = 0;
160 }
161
162private:
163 friend class SeedNtHash;
164
166 bool init();
167
168 const char* seq;
169 size_t seq_len;
170 const NTHASH_HASH_NUM_TYPE hash_num;
171 const NTHASH_K_TYPE k;
172
173 size_t pos;
174 bool initialized;
175 std::unique_ptr<uint64_t[]> hashes_array;
176 uint64_t forward_hash = 0;
177 uint64_t reverse_hash = 0;
178};
179
187{
188
189public:
198 BlindNtHash(const char* seq,
199 size_t seq_len,
200 unsigned hash_num,
201 unsigned k,
202 size_t pos = 0);
203
211 BlindNtHash(const std::string& seq,
212 unsigned hash_num,
213 unsigned k,
214 size_t pos = 0);
215
216 BlindNtHash(const BlindNtHash& nthash);
217 BlindNtHash(BlindNtHash&&) = default;
218
227 bool roll(char char_in);
228
234 bool roll_back(char char_in);
235
241 bool peek(char char_in);
242
248 bool peek_back(char char_in);
249
250 void sub(const std::vector<unsigned>& positions,
251 const std::vector<unsigned char>& new_bases);
252
253 const uint64_t* hashes() const { return hashes_array.get(); }
254
259 size_t get_pos() const { return pos; }
260 bool forward() const { return forward_hash <= reverse_hash; }
261 unsigned get_hash_num() const { return hash_num; }
262 unsigned get_k() const { return k; }
263
264 uint64_t get_forward_hash() const { return forward_hash; }
265 uint64_t get_reverse_hash() const { return reverse_hash; }
266
267 void change_seq(const std::string& new_seq, size_t new_pos = 0)
268 {
269 seq_len = new_seq.length();
270 std::memcpy(seq.get(), new_seq.data(), seq_len);
271 pos = new_pos;
272 initialized = false;
273 forward_hash = 0;
274 reverse_hash = 0;
275 }
276
277private:
279 bool init();
280
281 std::unique_ptr<char[]> seq;
282 size_t seq_len;
283 const NTHASH_HASH_NUM_TYPE hash_num;
284 const NTHASH_K_TYPE k;
285
286 size_t pos;
287 bool initialized;
288 std::unique_ptr<uint64_t[]> hashes_array;
289 uint64_t forward_hash = 0;
290 uint64_t reverse_hash = 0;
291};
292
294{
295
296public:
297 SeedNtHash(const char* seq,
298 size_t seq_len,
299 const std::vector<SpacedSeed>& seeds,
300 unsigned hash_num_per_seed,
301 unsigned k,
302 size_t pos = 0);
303 SeedNtHash(const std::string& seq,
304 const std::vector<SpacedSeed>& seeds,
305 unsigned hash_num_per_seed,
306 unsigned k,
307 size_t pos = 0);
308 SeedNtHash(const char* seq,
309 size_t seq_len,
310 const std::vector<std::string>& seeds,
311 unsigned hash_num_per_seed,
312 unsigned k,
313 size_t pos = 0);
314 SeedNtHash(const std::string& seq,
315 const std::vector<std::string>& seeds,
316 unsigned hash_num_per_seed,
317 unsigned k,
318 size_t pos = 0);
319
320 SeedNtHash(const SeedNtHash& seed_nthash);
321 SeedNtHash(SeedNtHash&&) = default;
322
329 bool roll();
330
336 bool roll_back();
337
344 bool peek();
345
351 bool peek_back();
352
358 bool peek(char char_in);
359
365 bool peek_back(char char_in);
366
367 const uint64_t* hashes() const { return nthash.hashes(); }
368
369 void change_seq(const std::string& seq, size_t pos = 0)
370 {
371 nthash.change_seq(seq, pos);
372 }
373
374 size_t get_pos() const { return nthash.get_pos(); }
375 bool forward() const { return nthash.forward(); }
376 unsigned get_hash_num() const { return nthash.get_hash_num(); }
377 unsigned get_hash_num_per_seed() const { return hash_num_per_seed; }
378 unsigned get_k() const { return nthash.get_k(); }
379
380 uint64_t* get_forward_hash() const { return forward_hash.get(); }
381 uint64_t* get_reverse_hash() const { return reverse_hash.get(); }
382
383private:
384 bool init();
385
386 NtHash nthash;
387 const unsigned hash_num_per_seed;
388
389 std::vector<SpacedSeedBlocks> blocks;
390 std::vector<SpacedSeedMonomers> monomers;
391
392 std::unique_ptr<uint64_t[]> fh_no_monomers;
393 std::unique_ptr<uint64_t[]> rh_no_monomers;
394 std::unique_ptr<uint64_t[]> forward_hash;
395 std::unique_ptr<uint64_t[]> reverse_hash;
396};
397
// Generates `CLASS::init()`, which positions the object on the first k-mer at
// or after the current position for which NTHASH_CALL succeeds.
//
//   CLASS          Class whose init() is being defined.
//   NTHASH_CALL    Expression, convertible to bool, that hashes the k-mer at
//                  the current position. It may refer to the local `posN`;
//                  after a failed call the position advances by posN + 1
//                  (presumably posN is set by the call to the offset of the
//                  offending base -- confirm in nthash_lowlevel.hpp).
//   MEMBER_PREFIX  Tokens placed before every state member; empty when the
//                  members belong to CLASS itself, `nthash.` for SeedNtHash.
//
// On failure (k longer than the sequence, or no hashable k-mer left) the
// position is set to the maximum size_t value and init() returns false.
// Only /* */ comments may be used inside the macro body: a // comment would
// swallow the continuation lines that follow it.
// NOLINTNEXTLINE
#define BTLLIB_NTHASH_INIT(CLASS, NTHASH_CALL, MEMBER_PREFIX) \
  inline bool CLASS::init() \
  { \
    if (MEMBER_PREFIX k > MEMBER_PREFIX seq_len) { \
      MEMBER_PREFIX pos = std::numeric_limits<std::size_t>::max(); \
      return false; \
    } \
    unsigned posN = 0; \
    while ( \
      (MEMBER_PREFIX pos < MEMBER_PREFIX seq_len - MEMBER_PREFIX k + 1) && \
      !(NTHASH_CALL)) { \
      MEMBER_PREFIX pos += posN + 1; \
    } \
    if (MEMBER_PREFIX pos > MEMBER_PREFIX seq_len - MEMBER_PREFIX k) { \
      MEMBER_PREFIX pos = std::numeric_limits<std::size_t>::max(); \
      return false; \
    } \
    MEMBER_PREFIX initialized = true; \
    return true; \
  }
419
// Generates the forward-rolling member function `CLASS::FN_DECL`.
//
//   CLASS          Class the function belongs to.
//   FN_DECL        Function name with parameter list, e.g. `roll()`.
//   NTHASH_CALL    Statement(s) that update the hashes for the move from the
//                  k-mer at pos to the one at pos + 1. Pasted verbatim, so it
//                  must carry its own terminating `;` or braces.
//   MEMBER_PREFIX  Tokens placed before every state member (see
//                  BTLLIB_NTHASH_INIT).
//
// Generated behaviour, in order:
//   1. Not yet initialized: return init().
//   2. Already on the last k-mer (pos >= seq_len - k): return false.
//   3. The base entering the window, seq[pos + k], maps to SEED_N: skip k
//      positions ahead and return init() from there.
//   4. Otherwise run NTHASH_CALL, increment pos and return true.
// NOLINTNEXTLINE
#define BTLLIB_NTHASH_ROLL(CLASS, FN_DECL, NTHASH_CALL, MEMBER_PREFIX) \
  inline bool CLASS::FN_DECL \
  { \
    if (!MEMBER_PREFIX initialized) { \
      return init(); \
    } \
    if (MEMBER_PREFIX pos >= MEMBER_PREFIX seq_len - MEMBER_PREFIX k) { \
      return false; \
    } \
    if (SEED_TAB[(unsigned char)(MEMBER_PREFIX seq[MEMBER_PREFIX pos + \
                                                   MEMBER_PREFIX k])] == \
        SEED_N) { \
      MEMBER_PREFIX pos += MEMBER_PREFIX k; \
      return init(); \
    } \
    NTHASH_CALL /* NOLINT(bugprone-macro-parentheses) */ \
      ++ MEMBER_PREFIX pos; \
    return true; \
  }
440
// Generates the backward-rolling member function `CLASS::FN_DECL`.
//
//   CLASS          Class the function belongs to.
//   FN_DECL        Function name with parameter list, e.g. `roll_back()`.
//   NTHASH_CALL    Statement(s) that update the hashes for the move from the
//                  k-mer at pos to the one at pos - 1. Pasted verbatim, so it
//                  must carry its own terminating `;` or braces.
//   MEMBER_PREFIX  Tokens placed before every state member (see
//                  BTLLIB_NTHASH_INIT).
//
// Generated behaviour, in order:
//   1. Not yet initialized: return init().
//   2. Position is 0: return false.
//   3. The base entering the window, seq[pos - 1], maps to SEED_N: move pos
//      back by k and return init() from there. pos is unsigned, so for
//      pos < k the subtraction wraps around; init() then sees an out-of-range
//      position and returns false.
//   4. Otherwise run NTHASH_CALL, decrement pos and return true.
// NOLINTNEXTLINE
#define BTLLIB_NTHASH_ROLL_BACK(CLASS, FN_DECL, NTHASH_CALL, MEMBER_PREFIX) \
  inline bool CLASS::FN_DECL \
  { \
    if (!MEMBER_PREFIX initialized) { \
      return init(); \
    } \
    if (MEMBER_PREFIX pos <= 0) { \
      return false; \
    } \
    if (SEED_TAB[(unsigned char)(MEMBER_PREFIX seq[MEMBER_PREFIX pos - 1])] == \
        SEED_N) { \
      MEMBER_PREFIX pos -= MEMBER_PREFIX k; \
      return init(); \
    } \
    NTHASH_CALL /* NOLINT(bugprone-macro-parentheses) */ \
      -- MEMBER_PREFIX pos; \
    return true; \
  }
460
// Generates a peek member function `CLASS::FN_DECL`.
//
//   CLASS          Class the function belongs to.
//   FN_DECL        Function name with parameter list, e.g. `peek(char char_in)`.
//   NTHASH_CALL    Statement(s) that compute the neighbouring k-mer's hashes.
//                  Pasted verbatim, so it must carry its own braces or `;`.
//   MEMBER_PREFIX  Tokens placed before every state member (see
//                  BTLLIB_NTHASH_INIT).
//
// If the object is not initialized yet, the generated function returns
// init() (which does modify the object). Otherwise it runs NTHASH_CALL and
// returns true. No range check on the position is generated here.
// NOLINTNEXTLINE
#define BTLLIB_NTHASH_PEEK(CLASS, FN_DECL, NTHASH_CALL, MEMBER_PREFIX) \
  inline bool CLASS::FN_DECL \
  { \
    if (!MEMBER_PREFIX initialized) { \
      return init(); \
    } \
    NTHASH_CALL /* NOLINT(bugprone-macro-parentheses) */ \
    return true; \
  }
471
472BTLLIB_NTHASH_INIT(NtHash,
473 ntmc64(seq + pos,
474 k,
475 hash_num,
476 forward_hash,
477 reverse_hash,
478 posN,
479 hashes_array.get()), )
480BTLLIB_NTHASH_ROLL(NtHash,
481 roll(),
482 ntmc64(seq[pos],
483 seq[pos + k],
484 k,
485 hash_num,
486 forward_hash,
487 reverse_hash,
488 hashes_array.get());
489 , )
490BTLLIB_NTHASH_ROLL_BACK(NtHash,
491 roll_back(),
492 ntmc64l(seq[pos + k - 1],
493 seq[pos - 1],
494 k,
495 hash_num,
496 forward_hash,
497 reverse_hash,
498 hashes_array.get());
499 , )
500BTLLIB_NTHASH_PEEK(
501 NtHash,
502 peek(),
503
504 {
505 uint64_t forward_hash_tmp = forward_hash;
506 uint64_t reverse_hash_tmp = reverse_hash;
507 ntmc64(seq[pos],
508 seq[pos + k],
509 k,
510 hash_num,
511 forward_hash_tmp,
512 reverse_hash_tmp,
513 hashes_array.get());
514 }, )
515BTLLIB_NTHASH_PEEK(
516 NtHash,
517 peek(char char_in),
518
519 {
520 uint64_t forward_hash_tmp = forward_hash;
521 uint64_t reverse_hash_tmp = reverse_hash;
522 ntmc64(seq[pos],
523 char_in,
524 k,
525 hash_num,
526 forward_hash_tmp,
527 reverse_hash_tmp,
528 hashes_array.get());
529 }, )
530BTLLIB_NTHASH_PEEK(
531 NtHash,
532 peek_back(),
533
534 {
535 uint64_t forward_hash_tmp = forward_hash;
536 uint64_t reverse_hash_tmp = reverse_hash;
537 ntmc64l(seq[pos + k - 1],
538 seq[pos - 1],
539 k,
540 hash_num,
541 forward_hash_tmp,
542 reverse_hash_tmp,
543 hashes_array.get());
544 }, )
545BTLLIB_NTHASH_PEEK(
546 NtHash,
547 peek_back(char char_in),
548
549 {
550 uint64_t forward_hash_tmp = forward_hash;
551 uint64_t reverse_hash_tmp = reverse_hash;
552 ntmc64l(seq[pos + k - 1],
553 char_in,
554 k,
555 hash_num,
556 forward_hash_tmp,
557 reverse_hash_tmp,
558 hashes_array.get());
559 }, )
560
561BTLLIB_NTHASH_INIT(BlindNtHash,
562 ntmc64(seq.get() + pos,
563 k,
564 hash_num,
565 forward_hash,
566 reverse_hash,
567 posN,
568 hashes_array.get()), )
569BTLLIB_NTHASH_ROLL(
570 BlindNtHash,
571 roll(char char_in),
572 {
573 ntmc64(seq[pos % seq_len],
574 char_in,
575 k,
576 hash_num,
577 forward_hash,
578 reverse_hash,
579 hashes_array.get());
580 seq[pos % seq_len] = char_in;
581 }, )
582BTLLIB_NTHASH_ROLL_BACK(
583 BlindNtHash,
584 roll_back(char char_in),
585 {
586 ntmc64l(seq[(pos + k - 1) % seq_len],
587 char_in,
588 k,
589 hash_num,
590 forward_hash,
591 reverse_hash,
592 hashes_array.get());
593 seq[(pos + k - 1) % seq_len] = char_in;
594 }, )
595BTLLIB_NTHASH_PEEK(
596 BlindNtHash,
597 peek(char char_in),
598
599 {
600 uint64_t forward_hash_tmp = forward_hash;
601 uint64_t reverse_hash_tmp = reverse_hash;
602 ntmc64(seq[pos % seq_len],
603 char_in,
604 k,
605 hash_num,
606 forward_hash_tmp,
607 reverse_hash_tmp,
608 hashes_array.get());
609 }, )
610BTLLIB_NTHASH_PEEK(
611 BlindNtHash,
612 peek_back(char char_in),
613 {
614 uint64_t forward_hash_tmp = forward_hash;
615 uint64_t reverse_hash_tmp = reverse_hash;
616 ntmc64l(seq[(pos + k - 1) % seq_len],
617 char_in,
618 k,
619 hash_num,
620 forward_hash_tmp,
621 reverse_hash_tmp,
622 hashes_array.get());
623 }, )
624
625BTLLIB_NTHASH_INIT(SeedNtHash,
626 ntmsm64(nthash.seq + nthash.pos,
627 blocks,
628 monomers,
629 nthash.k,
630 blocks.size(),
631 hash_num_per_seed,
632 fh_no_monomers.get(),
633 rh_no_monomers.get(),
634 forward_hash.get(),
635 reverse_hash.get(),
636 posN,
637 nthash.hashes_array.get()),
638 nthash.)
639BTLLIB_NTHASH_ROLL(SeedNtHash,
640 roll(),
641 ntmsm64(nthash.seq + nthash.pos,
642 blocks,
643 monomers,
644 nthash.k,
645 blocks.size(),
646 hash_num_per_seed,
647 fh_no_monomers.get(),
648 rh_no_monomers.get(),
649 forward_hash.get(),
650 reverse_hash.get(),
651 nthash.hashes_array.get());
652 , nthash.)
653BTLLIB_NTHASH_ROLL_BACK(SeedNtHash,
654 roll_back(),
655 ntmsm64l(nthash.seq + nthash.pos - 1,
656 blocks,
657 monomers,
658 nthash.k,
659 blocks.size(),
660 hash_num_per_seed,
661 fh_no_monomers.get(),
662 rh_no_monomers.get(),
663 forward_hash.get(),
664 reverse_hash.get(),
665 nthash.hashes_array.get());
666 , nthash.)
667BTLLIB_NTHASH_PEEK(
668 SeedNtHash,
669 peek(),
670
671 {
672 std::unique_ptr<uint64_t[]> fh_no_monomers_tmp(new uint64_t[blocks.size()]);
673 std::unique_ptr<uint64_t[]> rh_no_monomers_tmp(new uint64_t[blocks.size()]);
674 std::unique_ptr<uint64_t[]> forward_hash_tmp(new uint64_t[blocks.size()]);
675 std::unique_ptr<uint64_t[]> reverse_hash_tmp(new uint64_t[blocks.size()]);
676 std::memcpy(fh_no_monomers_tmp.get(),
677 forward_hash.get(),
678 blocks.size() * sizeof(uint64_t));
679 std::memcpy(rh_no_monomers_tmp.get(),
680 reverse_hash.get(),
681 blocks.size() * sizeof(uint64_t));
682 std::memcpy(forward_hash_tmp.get(),
683 forward_hash.get(),
684 blocks.size() * sizeof(uint64_t));
685 std::memcpy(reverse_hash_tmp.get(),
686 reverse_hash.get(),
687 blocks.size() * sizeof(uint64_t));
688 ntmsm64(nthash.seq + nthash.pos,
689 blocks,
690 monomers,
691 nthash.k,
692 blocks.size(),
693 hash_num_per_seed,
694 fh_no_monomers_tmp.get(),
695 rh_no_monomers_tmp.get(),
696 forward_hash_tmp.get(),
697 reverse_hash_tmp.get(),
698 nthash.hashes_array.get());
699 },
700 nthash.)
701BTLLIB_NTHASH_PEEK(
702 SeedNtHash,
703 peek(char char_in),
704
705 {
706 std::unique_ptr<uint64_t[]> fh_no_monomers_tmp(new uint64_t[blocks.size()]);
707 std::unique_ptr<uint64_t[]> rh_no_monomers_tmp(new uint64_t[blocks.size()]);
708 std::unique_ptr<uint64_t[]> forward_hash_tmp(new uint64_t[blocks.size()]);
709 std::unique_ptr<uint64_t[]> reverse_hash_tmp(new uint64_t[blocks.size()]);
710 std::memcpy(fh_no_monomers_tmp.get(),
711 forward_hash.get(),
712 blocks.size() * sizeof(uint64_t));
713 std::memcpy(rh_no_monomers_tmp.get(),
714 reverse_hash.get(),
715 blocks.size() * sizeof(uint64_t));
716 std::memcpy(forward_hash_tmp.get(),
717 forward_hash.get(),
718 blocks.size() * sizeof(uint64_t));
719 std::memcpy(reverse_hash_tmp.get(),
720 reverse_hash.get(),
721 blocks.size() * sizeof(uint64_t));
722 ntmsm64(nthash.seq + nthash.pos,
723 char_in,
724 blocks,
725 monomers,
726 nthash.k,
727 blocks.size(),
728 hash_num_per_seed,
729 fh_no_monomers_tmp.get(),
730 rh_no_monomers_tmp.get(),
731 forward_hash_tmp.get(),
732 reverse_hash_tmp.get(),
733 nthash.hashes_array.get());
734 },
735 nthash.)
736BTLLIB_NTHASH_PEEK(
737 SeedNtHash,
738 peek_back(),
739
740 {
741 std::unique_ptr<uint64_t[]> fh_no_monomers_tmp(new uint64_t[blocks.size()]);
742 std::unique_ptr<uint64_t[]> rh_no_monomers_tmp(new uint64_t[blocks.size()]);
743 std::unique_ptr<uint64_t[]> forward_hash_tmp(new uint64_t[blocks.size()]);
744 std::unique_ptr<uint64_t[]> reverse_hash_tmp(new uint64_t[blocks.size()]);
745 std::memcpy(fh_no_monomers_tmp.get(),
746 forward_hash.get(),
747 blocks.size() * sizeof(uint64_t));
748 std::memcpy(rh_no_monomers_tmp.get(),
749 reverse_hash.get(),
750 blocks.size() * sizeof(uint64_t));
751 std::memcpy(forward_hash_tmp.get(),
752 forward_hash.get(),
753 blocks.size() * sizeof(uint64_t));
754 std::memcpy(reverse_hash_tmp.get(),
755 reverse_hash.get(),
756 blocks.size() * sizeof(uint64_t));
757 ntmsm64l(nthash.seq + nthash.pos - 1,
758 blocks,
759 monomers,
760 nthash.k,
761 blocks.size(),
762 hash_num_per_seed,
763 fh_no_monomers_tmp.get(),
764 rh_no_monomers_tmp.get(),
765 forward_hash_tmp.get(),
766 reverse_hash_tmp.get(),
767 nthash.hashes_array.get());
768 },
769 nthash.)
770BTLLIB_NTHASH_PEEK(
771 SeedNtHash,
772 peek_back(char char_in),
773
774 {
775 std::unique_ptr<uint64_t[]> fh_no_monomers_tmp(new uint64_t[blocks.size()]);
776 std::unique_ptr<uint64_t[]> rh_no_monomers_tmp(new uint64_t[blocks.size()]);
777 std::unique_ptr<uint64_t[]> forward_hash_tmp(new uint64_t[blocks.size()]);
778 std::unique_ptr<uint64_t[]> reverse_hash_tmp(new uint64_t[blocks.size()]);
779 std::memcpy(fh_no_monomers_tmp.get(),
780 forward_hash.get(),
781 blocks.size() * sizeof(uint64_t));
782 std::memcpy(rh_no_monomers_tmp.get(),
783 reverse_hash.get(),
784 blocks.size() * sizeof(uint64_t));
785 std::memcpy(forward_hash_tmp.get(),
786 forward_hash.get(),
787 blocks.size() * sizeof(uint64_t));
788 std::memcpy(reverse_hash_tmp.get(),
789 reverse_hash.get(),
790 blocks.size() * sizeof(uint64_t));
791 ntmsm64l(nthash.seq + nthash.pos - 1,
792 char_in,
793 blocks,
794 monomers,
795 nthash.k,
796 blocks.size(),
797 hash_num_per_seed,
798 fh_no_monomers_tmp.get(),
799 rh_no_monomers_tmp.get(),
800 forward_hash_tmp.get(),
801 reverse_hash_tmp.get(),
802 nthash.hashes_array.get());
803 },
804 nthash.)
805
// The generator macros are implementation details of this header; remove them
// so they do not leak into files that include it.
#undef BTLLIB_NTHASH_INIT
#undef BTLLIB_NTHASH_ROLL
#undef BTLLIB_NTHASH_ROLL_BACK
#undef BTLLIB_NTHASH_PEEK
810
811} // namespace btllib
812
813#endif
Definition: nthash.hpp:187
bool roll_back(char char_in)
BlindNtHash(const char *seq, size_t seq_len, unsigned hash_num, unsigned k, size_t pos=0)
bool peek(char char_in)
bool roll(char char_in)
size_t get_pos() const
Definition: nthash.hpp:259
BlindNtHash(const std::string &seq, unsigned hash_num, unsigned k, size_t pos=0)
bool peek_back(char char_in)
Definition: nthash.hpp:54
NtHash(const std::string &seq, unsigned hash_num, unsigned k, size_t pos=0)
bool peek(char char_in)
NtHash(const char *seq, size_t seq_len, unsigned hash_num, unsigned k, size_t pos=0)
bool peek_back(char char_in)
size_t get_pos() const
Definition: nthash.hpp:144
Definition: nthash.hpp:294
bool peek_back(char char_in)
bool peek(char char_in)
Definition: bloom_filter.hpp:16