btllib
nthash_lowlevel.hpp
/*
 * nthash_lowlevel.hpp
 * Author: Hamid Mohamadi
 * Genome Sciences Centre,
 * British Columbia Cancer Agency
 */
7
8#ifndef BTLLIB_NTHASH_LOWLEVEL_HPP
9#define BTLLIB_NTHASH_LOWLEVEL_HPP
10
11#include "btllib/nthash_consts.hpp"
12#include <array>
13#include <cstddef>
14#include <cstdint>
15#include <limits>
16#include <vector>
17
18namespace btllib {
19
/**
 * Combine a forward and a reverse hash value into one canonical value.
 *
 * The two values are added, so swapping the arguments yields the same
 * result. For unsigned types the sum wraps around on overflow.
 *
 * @param fwd Forward hash value.
 * @param rev Reverse hash value.
 * @return fwd + rev.
 */
template<typename T>
inline T
canonical(const T fwd, const T rev)
{
  return fwd + rev;
}
26
// canonical() adds two hash values and relies on unsigned addition wrapping
// around to zero on overflow; reject the build if that does not hold.
static_assert(std::numeric_limits<unsigned>::max() + 1 == 0,
              "Integers don't overflow on this platform which is necessary for "
              "ntHash canonical hash computation.");
30
// Data structures for spaced seeds

// List of don't care positions.
using SpacedSeed = std::vector<unsigned>;

// Bx2 array representing block start and end positions: one
// {start, end} pair per block.
using SpacedSeedBlocks = std::vector<std::array<unsigned, 2>>;

// List of blocks with sizes of one.
using SpacedSeedMonomers = std::vector<unsigned>;
41
/**
 * Split a 64-bit word into a lower 33-bit subword (bits 0..32) and an upper
 * 31-bit subword (bits 33..63) and rotate each subword left by one position.
 *
 * @param x Word to rotate.
 * @return x with both subwords rotated left by one bit.
 */
inline uint64_t
srol(const uint64_t x)
{
  // Bits leaving the top of each subword re-enter at its bottom:
  // bit 63 -> bit 33 (upper subword) and bit 32 -> bit 0 (lower subword).
  const uint64_t wrapped = ((x & 0x8000000000000000ULL) >> 30) | // NOLINT
                           ((x & 0x100000000ULL) >> 32);         // NOLINT
  // Shift the whole word and clear bit 33, which would otherwise receive
  // bit 32 of the lower subword.
  return ((x << 1) & 0xFFFFFFFDFFFFFFFFULL) | wrapped; // NOLINT
}
57
/**
 * Split a 64-bit word into a lower 33-bit subword (bits 0..32) and an upper
 * 31-bit subword (bits 33..63) and rotate each subword left by d positions.
 *
 * The word is first rotated as a whole; the d bits that crossed the subword
 * boundary the wrong way are then swapped between bit ranges [0, d) and
 * [33, 33 + d). That swap only yields a split rotation for d <= 31.
 *
 * @param x Word to rotate.
 * @param d Number of positions to rotate by. Must be less than 64; d == 0
 * returns x unchanged.
 * @return x with both subwords rotated left by d bits (for d <= 31).
 */
inline uint64_t
srol(const uint64_t x, const unsigned d)
{
  // Plain 64-bit rotation. The right-shift count is reduced modulo 64 so that
  // d == 0 does not shift by the full word width, which is undefined.
  const uint64_t v = (x << d) | (x >> ((64 - d) & 63)); // NOLINT
  // Mask of the lowest d bits; empty for d == 0, so nothing is swapped.
  const uint64_t low_d = (static_cast<uint64_t>(1) << d) - 1; // NOLINT
  // XOR-swap the low d bits with the d bits starting at bit 33.
  const uint64_t y = (v ^ (v >> 33)) & low_d; // NOLINT
  return v ^ (y | (y << 33));                 // NOLINT
}
75
/**
 * Split a 64-bit word into a lower 33-bit subword (bits 0..32) and an upper
 * 31-bit subword (bits 33..63) and rotate each subword right by one position.
 *
 * @param x Word to rotate.
 * @return x with both subwords rotated right by one bit.
 */
inline uint64_t
sror(const uint64_t x)
{
  // Bits leaving the bottom of each subword re-enter at its top:
  // bit 33 -> bit 63 (upper subword) and bit 0 -> bit 32 (lower subword).
  const uint64_t wrapped = ((x & 0x200000000ULL) << 30) | // NOLINT
                           ((x & 1ULL) << 32);            // NOLINT
  // Shift the whole word and clear bit 32, which would otherwise receive
  // bit 33 of the upper subword.
  return ((x >> 1) & 0xFFFFFFFEFFFFFFFFULL) | wrapped; // NOLINT
}
90
// NOTE(review): only declarations are visible in this header; the
// descriptions below are inferred from the names and must be confirmed
// against the definitions.

/**
 * Presumably computes the forward-strand hash of the k-mer at kmer_seq.
 *
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @return 64-bit hash value.
 */
uint64_t
ntf64(const char* kmer_seq, unsigned k);

/**
 * Presumably computes the reverse-strand hash of the k-mer at kmer_seq.
 *
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @return 64-bit hash value.
 */
uint64_t
ntr64(const char* kmer_seq, unsigned k);

/**
 * Presumably rolls a forward hash by one position, removing char_out and
 * adding char_in.
 *
 * @param fh_val Forward hash value to update.
 * @param k k-mer size.
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @return Updated 64-bit hash value.
 */
uint64_t
ntf64(uint64_t fh_val,
      unsigned k,
      unsigned char char_out,
      unsigned char char_in);

/**
 * Presumably rolls a reverse hash by one position, removing char_out and
 * adding char_in.
 *
 * @param rh_val Reverse hash value to update.
 * @param k k-mer size.
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @return Updated 64-bit hash value.
 */
uint64_t
ntr64(uint64_t rh_val,
      unsigned k,
      unsigned char char_out,
      unsigned char char_in);
148
// NOTE(review): only declarations are visible in this header; the
// descriptions below are inferred from the names and must be confirmed
// against the definitions.

/**
 * Presumably computes the canonical hash of the k-mer at kmer_seq.
 *
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @return 64-bit hash value.
 */
uint64_t
ntc64(const char* kmer_seq, unsigned k);

/**
 * Presumably computes the canonical hash of the k-mer at kmer_seq and also
 * reports the per-strand hashes.
 *
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @param fh_val Passed by non-const reference; presumably receives the
 * forward hash.
 * @param rh_val Passed by non-const reference; presumably receives the
 * reverse hash.
 * @return 64-bit hash value.
 */
uint64_t
ntc64(const char* kmer_seq, unsigned k, uint64_t& fh_val, uint64_t& rh_val);

/**
 * Presumably rolls the canonical hash by one position, removing char_out and
 * adding char_in.
 *
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @param k k-mer size.
 * @param fh_val Passed by non-const reference; presumably the forward hash,
 * updated in place.
 * @param rh_val Passed by non-const reference; presumably the reverse hash,
 * updated in place.
 * @return 64-bit hash value.
 */
uint64_t
ntc64(unsigned char char_out,
      unsigned char char_in,
      unsigned k,
      uint64_t& fh_val,
      uint64_t& rh_val);
191
// NOTE(review): only declarations are visible in this header; the
// descriptions below are inferred from the names and must be confirmed
// against the definitions.

/**
 * Presumably the backward ("l") counterpart of the rolling ntf64(): moves a
 * hash one position in the opposite direction.
 *
 * NOTE(review): the first parameter is named rh_val although the function
 * name suggests the forward strand - confirm which value is expected.
 *
 * @param rh_val Hash value to update.
 * @param k k-mer size.
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @return Updated 64-bit hash value.
 */
uint64_t
ntf64l(uint64_t rh_val,
       unsigned k,
       unsigned char char_out,
       unsigned char char_in);

/**
 * Presumably the backward ("l") counterpart of the rolling ntr64().
 *
 * NOTE(review): the first parameter is named fh_val although the function
 * name suggests the reverse strand - confirm which value is expected.
 *
 * @param fh_val Hash value to update.
 * @param k k-mer size.
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @return Updated 64-bit hash value.
 */
uint64_t
ntr64l(uint64_t fh_val,
       unsigned k,
       unsigned char char_out,
       unsigned char char_in);

/**
 * Presumably the backward ("l") counterpart of the rolling ntc64().
 *
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @param k k-mer size.
 * @param fh_val Passed by non-const reference; presumably the forward hash,
 * updated in place.
 * @param rh_val Passed by non-const reference; presumably the reverse hash,
 * updated in place.
 * @return 64-bit hash value.
 */
uint64_t
ntc64l(unsigned char char_out,
       unsigned char char_in,
       unsigned k,
       uint64_t& fh_val,
       uint64_t& rh_val);
242
/**
 * Presumably derives several hash values from one base hash value.
 *
 * NOTE(review): the definition is not visible in this header; confirm the
 * meaning of each parameter against the implementation.
 *
 * @param bh_val Base hash value.
 * @param k k-mer size.
 * @param h Presumably the number of hash values to produce.
 * @param h_val Output array; presumably must hold at least h elements.
 */
void
nte64(uint64_t bh_val, unsigned k, unsigned h, uint64_t* h_val);
253
// NOTE(review): only declarations are visible in this header; the
// descriptions below are inferred from the names and must be confirmed
// against the definitions.

/**
 * Presumably computes multiple canonical hash values for the k-mer at
 * kmer_seq.
 *
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param h_val Output array; presumably must hold at least m elements.
 */
void
ntmc64(const char* kmer_seq, unsigned k, unsigned m, uint64_t* h_val);

/**
 * Same as the overload above, additionally exposing the per-strand hashes.
 *
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param fh_val Passed by non-const reference; presumably receives the
 * forward hash.
 * @param rh_val Passed by non-const reference; presumably receives the
 * reverse hash.
 * @param h_val Output array; presumably must hold at least m elements.
 */
void
ntmc64(const char* kmer_seq,
       unsigned k,
       unsigned m,
       uint64_t& fh_val,
       uint64_t& rh_val,
       uint64_t* h_val);

/**
 * Presumably rolls the multi-hash by one position, removing char_out and
 * adding char_in.
 *
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param fh_val Passed by non-const reference; presumably the forward hash,
 * updated in place.
 * @param rh_val Passed by non-const reference; presumably the reverse hash,
 * updated in place.
 * @param h_val Output array; presumably must hold at least m elements.
 */
void
ntmc64(unsigned char char_out,
       unsigned char char_in,
       unsigned k,
       unsigned m,
       uint64_t& fh_val,
       uint64_t& rh_val,
       uint64_t* h_val);

/**
 * Presumably the backward ("l") counterpart of the rolling ntmc64().
 *
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param fh_val Passed by non-const reference; presumably the forward hash,
 * updated in place.
 * @param rh_val Passed by non-const reference; presumably the reverse hash,
 * updated in place.
 * @param h_val Output array; presumably must hold at least m elements.
 */
void
ntmc64l(unsigned char char_out,
        unsigned char char_in,
        unsigned k,
        unsigned m,
        uint64_t& fh_val,
        uint64_t& rh_val,
        uint64_t* h_val);
323
// NOTE(review): only declarations are visible in this header. The overloads
// below return bool and take a loc_n out-parameter; presumably they validate
// the k-mer characters, report success through the return value and the
// position of an offending character through loc_n. Confirm against the
// definitions.

/**
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @param h_val Passed by non-const reference; presumably receives the
 * canonical hash.
 * @param loc_n Passed by non-const reference; presumably receives the
 * location of an invalid character.
 * @return Presumably true when the k-mer could be hashed.
 */
bool
ntc64(const char* kmer_seq, unsigned k, uint64_t& h_val, unsigned& loc_n);

/**
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param loc_n Passed by non-const reference; presumably receives the
 * location of an invalid character.
 * @param h_val Output array; presumably must hold at least m elements.
 * @return Presumably true when the k-mer could be hashed.
 */
bool
ntmc64(const char* kmer_seq,
       unsigned k,
       unsigned m,
       unsigned& loc_n,
       uint64_t* h_val);

/**
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @param fh_val Passed by non-const reference; presumably receives the
 * forward hash.
 * @param rh_val Passed by non-const reference; presumably receives the
 * reverse hash.
 * @param h_val Passed by non-const reference; presumably receives the
 * canonical hash.
 * @param loc_n Passed by non-const reference; presumably receives the
 * location of an invalid character.
 * @return Presumably true when the k-mer could be hashed.
 */
bool
ntc64(const char* kmer_seq,
      unsigned k,
      uint64_t& fh_val,
      uint64_t& rh_val,
      uint64_t& h_val,
      unsigned& loc_n);

/**
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param fh_val Passed by non-const reference; presumably receives the
 * forward hash.
 * @param rh_val Passed by non-const reference; presumably receives the
 * reverse hash.
 * @param loc_n Passed by non-const reference; presumably receives the
 * location of an invalid character.
 * @param h_val Output array; presumably must hold at least m elements.
 * @return Presumably true when the k-mer could be hashed.
 */
bool
ntmc64(const char* kmer_seq,
       unsigned k,
       unsigned m,
       uint64_t& fh_val,
       uint64_t& rh_val,
       unsigned& loc_n,
       uint64_t* h_val);

/**
 * Same parameters as the overload above plus h_stn.
 *
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param fh_val Passed by non-const reference; presumably receives the
 * forward hash.
 * @param rh_val Passed by non-const reference; presumably receives the
 * reverse hash.
 * @param loc_n Passed by non-const reference; presumably receives the
 * location of an invalid character.
 * @param h_val Output array; presumably must hold at least m elements.
 * @param h_stn Passed by non-const reference; presumably receives a strand
 * indicator - TODO confirm its polarity.
 * @return Presumably true when the k-mer could be hashed.
 */
bool
ntmc64(const char* kmer_seq,
       unsigned k,
       unsigned m,
       uint64_t& fh_val,
       uint64_t& rh_val,
       unsigned& loc_n,
       uint64_t* h_val,
       bool& h_stn);
431
/**
 * Presumably rolls the multi-hash by one position, removing char_out and
 * adding char_in, and additionally reports a strand indicator.
 *
 * NOTE(review): the definition is not visible in this header; confirm the
 * description against the implementation.
 *
 * @param char_out Character leaving the k-mer.
 * @param char_in Character entering the k-mer.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param fh_val Passed by non-const reference; presumably the forward hash,
 * updated in place.
 * @param rh_val Passed by non-const reference; presumably the reverse hash,
 * updated in place.
 * @param h_val Output array; presumably must hold at least m elements.
 * @param h_stn Passed by non-const reference; presumably receives a strand
 * indicator - TODO confirm its polarity.
 */
void
ntmc64(unsigned char char_out,
       unsigned char char_in,
       unsigned k,
       unsigned m,
       uint64_t& fh_val,
       uint64_t& rh_val,
       uint64_t* h_val,
       bool& h_stn);
454
/**
 * Presumably computes the hash of a k-mer under a spaced-seed mask, starting
 * from the hashes of the unmasked k-mer.
 *
 * NOTE(review): the definition is not visible in this header; confirm the
 * description and the format of seed_seq against the implementation.
 *
 * @param fk_val Passed by non-const reference; presumably the forward k-mer
 * hash, possibly modified.
 * @param rk_val Passed by non-const reference; presumably the reverse k-mer
 * hash, possibly modified.
 * @param seed_seq Pointer to the seed description.
 * @param kmer_seq Pointer to the k-mer characters.
 * @param k k-mer size.
 * @return 64-bit hash value.
 */
uint64_t
mask_hash(uint64_t& fk_val,
          uint64_t& rk_val,
          const char* seed_seq,
          const char* kmer_seq,
          unsigned k);
474
/**
 * Presumably computes the hash values of a k-mer after substituting the bases
 * at the given positions, starting from the hashes of the original k-mer.
 *
 * NOTE(review): the definition is not visible in this header; confirm the
 * description against the implementation.
 *
 * @param fh_val Forward hash of the original k-mer.
 * @param rh_val Reverse hash of the original k-mer.
 * @param kmer_seq Pointer to the k-mer characters.
 * @param positions Positions to substitute; presumably indices into kmer_seq.
 * @param new_bases Replacement bases; presumably parallel to positions.
 * @param k k-mer size.
 * @param m Presumably the number of hash values to produce.
 * @param h_val Output array; presumably must hold at least m elements.
 */
void
sub_hash(uint64_t fh_val,
         uint64_t rh_val,
         const char* kmer_seq,
         const std::vector<unsigned>& positions,
         const std::vector<unsigned char>& new_bases,
         unsigned k,
         unsigned m,
         uint64_t* h_val);
498
523bool
524ntmsm64(const char* kmer_seq,
525 const std::vector<SpacedSeedBlocks>& seeds_blocks,
526 const std::vector<SpacedSeedMonomers>& seeds_monomers,
527 unsigned k,
528 unsigned m,
529 unsigned m2,
530 uint64_t* fh_nomonos,
531 uint64_t* rh_nomonos,
532 uint64_t* fh_val,
533 uint64_t* rh_val,
534 unsigned& loc_n,
535 uint64_t* h_val);
536
/**
 * Shared loop body for the spaced-seed multi-hash routines.
 *
 * Expands in a scope that already provides kmer_seq, seeds_blocks,
 * seeds_monomers, k, m, m2, fh_nomonos, rh_nomonos, fh_val, rh_val and h_val.
 * For each of the m seeds it:
 *  1. runs ROL_HANDLING, then for every block of the seed runs IN_HANDLING
 *     and OUT_HANDLING and XORs the MS_TAB entries for char_out and char_in
 *     into the forward (fh_seed) and reverse (rh_seed) hashes;
 *  2. runs ROR_HANDLING and stores the hashes in fh_nomonos / rh_nomonos;
 *  3. XORs in the entries for the seed's monomer positions and stores the
 *     hashes in fh_val / rh_val;
 *  4. writes canonical(fh_seed, rh_seed) to h_val[i_seed * m2] and derives
 *     the remaining m2 - 1 values from it with a multiply and xor-shift.
 *
 * The four arguments are statement fragments. The macro only declares
 * fh_seed, rh_seed, char_out, char_in, i_out and i_in, so the fragments are
 * expected to assign them before they are read.
 * NOTE(review): confirm against the call sites, which are not in this header.
 */
#define NTMSM64(ROL_HANDLING, IN_HANDLING, OUT_HANDLING, ROR_HANDLING)         \
  unsigned char char_out, char_in;                                             \
  uint64_t fh_seed, rh_seed;                                                   \
  unsigned i_out, i_in, i_base;                                                \
  for (unsigned i_seed = 0; i_seed < m; i_seed++) {                            \
    ROL_HANDLING /* NOLINT(bugprone-macro-parentheses) */                      \
    for (const auto& block : seeds_blocks[i_seed])                             \
    {                                                                          \
      IN_HANDLING                                                              \
      OUT_HANDLING                                                             \
      fh_seed ^= MS_TAB(char_out, k - i_out);                                  \
      fh_seed ^= MS_TAB(char_in, k - i_in);                                    \
      rh_seed ^= MS_TAB(char_out & CP_OFF, i_out);                             \
      rh_seed ^= MS_TAB(char_in & CP_OFF, i_in);                               \
    }                                                                          \
    ROR_HANDLING /* NOLINT(bugprone-macro-parentheses) */                      \
    fh_nomonos[i_seed] = fh_seed;                                              \
    rh_nomonos[i_seed] = rh_seed;                                              \
    for (const auto& pos : seeds_monomers[i_seed]) {                           \
      fh_seed ^= MS_TAB((unsigned char)kmer_seq[pos + 1], k - 1 - pos);        \
      rh_seed ^= MS_TAB((unsigned char)kmer_seq[pos + 1] & CP_OFF, pos);       \
    }                                                                          \
    fh_val[i_seed] = fh_seed;                                                  \
    rh_val[i_seed] = rh_seed;                                                  \
    i_base = i_seed * m2;                                                      \
    h_val[i_base] = canonical(fh_seed, rh_seed);                               \
    for (unsigned i_hash = 1; i_hash < m2; i_hash++) {                         \
      h_val[i_base + i_hash] = h_val[i_base] * (i_hash ^ k * MULTISEED);       \
      h_val[i_base + i_hash] ^= h_val[i_base + i_hash] >> MULTISHIFT;          \
    }                                                                          \
  }
568
590void
591ntmsm64(const char* kmer_seq,
592 const std::vector<SpacedSeedBlocks>& seeds_blocks,
593 const std::vector<SpacedSeedMonomers>& seeds_monomers,
594 unsigned k,
595 unsigned m,
596 unsigned m2,
597 uint64_t* fh_nomonos,
598 uint64_t* rh_nomonos,
599 uint64_t* fh_val,
600 uint64_t* rh_val,
601 uint64_t* h_val);
602
624void
625ntmsm64l(const char* kmer_seq,
626 const std::vector<SpacedSeedBlocks>& seeds_blocks,
627 const std::vector<SpacedSeedMonomers>& seeds_monomers,
628 unsigned k,
629 unsigned m,
630 unsigned m2,
631 uint64_t* fh_nomonos,
632 uint64_t* rh_nomonos,
633 uint64_t* fh_val,
634 uint64_t* rh_val,
635 uint64_t* h_val);
636
658void
659ntmsm64(const char* kmer_seq,
660 char in,
661 const std::vector<SpacedSeedBlocks>& seeds_blocks,
662 const std::vector<SpacedSeedMonomers>& seeds_monomers,
663 unsigned k,
664 unsigned m,
665 unsigned m2,
666 uint64_t* fh_nomonos,
667 uint64_t* rh_nomonos,
668 uint64_t* fh_val,
669 uint64_t* rh_val,
670 uint64_t* h_val);
671
693void
694ntmsm64l(const char* kmer_seq,
695 char in,
696 const std::vector<SpacedSeedBlocks>& seeds_blocks,
697 const std::vector<SpacedSeedMonomers>& seeds_monomers,
698 unsigned k,
699 unsigned m,
700 unsigned m2,
701 uint64_t* fh_nomonos,
702 uint64_t* rh_nomonos,
703 uint64_t* fh_val,
704 uint64_t* rh_val,
705 uint64_t* h_val);
706
707} // namespace btllib
708
709#endif
Definition: bloom_filter.hpp:16
uint64_t ntf64l(uint64_t rh_val, unsigned k, unsigned char char_out, unsigned char char_in)
void sub_hash(uint64_t fh_val, uint64_t rh_val, const char *kmer_seq, const std::vector< unsigned > &positions, const std::vector< unsigned char > &new_bases, unsigned k, unsigned m, uint64_t *h_val)
uint64_t ntr64l(uint64_t fh_val, unsigned k, unsigned char char_out, unsigned char char_in)
uint64_t srol(const uint64_t x)
Definition: nthash_lowlevel.hpp:51
uint64_t mask_hash(uint64_t &fk_val, uint64_t &rk_val, const char *seed_seq, const char *kmer_seq, unsigned k)
uint64_t ntc64l(unsigned char char_out, unsigned char char_in, unsigned k, uint64_t &fh_val, uint64_t &rh_val)
uint64_t sror(const uint64_t x)
Definition: nthash_lowlevel.hpp:85
uint64_t ntc64(const char *kmer_seq, unsigned k)
uint64_t ntr64(const char *kmer_seq, unsigned k)
void nte64(uint64_t bh_val, unsigned k, unsigned h, uint64_t *h_val)
uint64_t ntf64(const char *kmer_seq, unsigned k)