btllib
rolling_hash.hpp
1 #ifndef BTLLIB_ROLLING_HASH_HPP
2 #define BTLLIB_ROLLING_HASH_HPP
3 
4 #include "nthash.hpp"
5 
6 #include <limits>
7 #include <string>
8 #include <vector>
9 
10 namespace btllib {
11 
// Forward declarations of the two hashers defined later in this header.
12 class RollingHash;
13 class SeedRollingHash;
// A spaced seed is stored as a list of character indices; parse_seeds() fills
// it with the index of every seed-string character that is not '1'.
14 using SpacedSeed = std::vector<unsigned>;
// Declared ahead of its definition because the SeedRollingHash constructors
// that accept seed strings call it before the definition appears.
15 static std::vector<SpacedSeed>
16 parse_seeds(const std::vector<std::string>& seed_strings);
17 
/**
 * Computes hash values for successive windows of length k over a character
 * sequence, advancing one window per call to roll().
 *
 * The object does not own the sequence: it stores the raw pointer it was
 * given (or the c_str() of the std::string it was given), so the caller must
 * keep that storage alive and unchanged for the lifetime of this object.
 */
class RollingHash
{

public:
  /**
   * @param seq      Pointer to the first character of the sequence (not copied).
   * @param seq_len  Number of characters available at seq.
   * @param k        Window length.
   * @param hash_num Number of hash values kept per window; hashes() exposes
   *                 exactly this many elements.
   */
  RollingHash(const char* seq, size_t seq_len, unsigned k, unsigned hash_num);

  /**
   * Same as the pointer overload, using seq.c_str() and seq.size().
   * The string is not copied and must outlive this object.
   */
  RollingHash(const std::string& seq, unsigned k, unsigned hash_num);

  /**
   * Advance to the next hashable window.
   *
   * @return true if hashes() now holds values for a new window, false once
   *         no further window is available.
   */
  bool roll();

  /** Pointer to the hash_num values computed by the last successful roll(). */
  const uint64_t* hashes() const;

  /**
   * Internal cursor. It is 0 before the first roll(); after a successful
   * roll() it is one past the start index of the window just hashed; once the
   * sequence is exhausted it is std::numeric_limits<size_t>::max().
   */
  size_t get_pos() const { return pos; }
  unsigned get_k() const { return k; }
  unsigned get_hash_num() const { return hash_num; }

protected:
  /**
   * Hash the first usable window at or after pos from scratch.
   * Returns false (and sets pos to the exhausted marker) if none exists.
   */
  bool init();

  const char* seq;        // borrowed, never freed here
  const size_t seq_len;
  const unsigned k;
  const unsigned hash_num;
  size_t pos = 0;         // 0 doubles as the "not yet initialised" state
  std::vector<uint64_t> hashes_vector; // sized to hash_num by the constructor
  uint64_t forward_hash = 0; // running state handed to the nthash calls
  uint64_t reverse_hash = 0; // running state handed to the nthash calls
};
68 
70 {
71 
72 public:
73  SeedRollingHash(const char* seq,
74  size_t seq_len,
75  unsigned k,
76  const std::vector<SpacedSeed>& seeds,
77  unsigned hash_num_per_seed);
78  SeedRollingHash(const std::string& seq,
79  unsigned k,
80  const std::vector<SpacedSeed>& seeds,
81  unsigned hash_num_per_seed);
82  SeedRollingHash(const char* seq,
83  size_t seq_len,
84  unsigned k,
85  const std::vector<std::string>& seeds,
86  unsigned hash_num_per_seed);
87  SeedRollingHash(const std::string& seq,
88  unsigned k,
89  const std::vector<std::string>& seeds,
90  unsigned hash_num_per_seed);
91 
92  unsigned get_hash_num_per_seed() const { return hash_num_per_seed; }
93 
94  bool roll();
95 
96 private:
97  bool init();
98 
99  const unsigned hash_num_per_seed;
100  std::vector<SpacedSeed> seeds;
101 };
102 
103 inline RollingHash::RollingHash(const char* seq,
104  size_t seq_len,
105  unsigned k,
106  unsigned hash_num)
107  : seq(seq)
108  , seq_len(seq_len)
109  , k(k)
110  , hash_num(hash_num)
111 {
112  hashes_vector.resize(hash_num);
113 }
114 
115 inline RollingHash::RollingHash(const std::string& seq,
116  unsigned k,
117  unsigned hash_num)
118  : RollingHash(seq.c_str(), seq.size(), k, hash_num)
119 {}
120 
121 inline SeedRollingHash::SeedRollingHash(const char* seq,
122  size_t seq_len,
123  unsigned k,
124  const std::vector<SpacedSeed>& seeds,
125  unsigned hash_num_per_seed)
126  : RollingHash(seq, seq_len, k, seeds.size() * hash_num_per_seed)
127  , hash_num_per_seed(hash_num_per_seed)
128  , seeds(seeds)
129 {}
130 
131 inline SeedRollingHash::SeedRollingHash(const std::string& seq,
132  unsigned k,
133  const std::vector<SpacedSeed>& seeds,
134  unsigned hash_num_per_seed)
135  : RollingHash(seq, k, seeds.size() * hash_num_per_seed)
136  , hash_num_per_seed(hash_num_per_seed)
137  , seeds(seeds)
138 {}
139 
140 inline SeedRollingHash::SeedRollingHash(const char* seq,
141  size_t seq_len,
142  unsigned k,
143  const std::vector<std::string>& seeds,
144  unsigned hash_num_per_seed)
145  : RollingHash(seq, seq_len, k, seeds.size() * hash_num_per_seed)
146  , hash_num_per_seed(hash_num_per_seed)
147  , seeds(parse_seeds(seeds))
148 {}
149 
150 inline SeedRollingHash::SeedRollingHash(const std::string& seq,
151  unsigned k,
152  const std::vector<std::string>& seeds,
153  unsigned hash_num_per_seed)
154  : RollingHash(seq, k, seeds.size() * hash_num_per_seed)
155  , hash_num_per_seed(hash_num_per_seed)
156  , seeds(parse_seeds(seeds))
157 {}
158 
159 static std::vector<SpacedSeed>
160 parse_seeds(const std::vector<std::string>& seed_strings)
161 {
162  std::vector<SpacedSeed> seed_set;
163  for (const auto& seed_string : seed_strings) {
164  SpacedSeed seed;
165  size_t pos = 0;
166  for (const auto& c : seed_string) {
167  if (c != '1') {
168  seed.push_back(pos);
169  }
170  ++pos;
171  }
172  seed_set.push_back(seed);
173  }
174  return seed_set;
175 }
176 
/*
 * Generates CLASS::init(): hash, from scratch, the first usable window that
 * starts at or after the current pos.
 *
 * NTHASH_CALL is evaluated as a boolean and may write to the local `posN`.
 * While it evaluates to false the start position is advanced by posN + 1 and
 * the call is retried. On success pos is left one past the start of the
 * hashed window and true is returned. If no window fits, pos is set to
 * std::numeric_limits<std::size_t>::max() and false is returned.
 */
// NOLINTNEXTLINE
#define ROLLING_HASH_INIT(CLASS, NTHASH_CALL)                                  \
  inline bool CLASS::init()                                                    \
  {                                                                            \
    /* Checked first so that seq_len - k below cannot wrap around. */          \
    if (k > seq_len) {                                                         \
      pos = std::numeric_limits<std::size_t>::max();                           \
      return false;                                                            \
    }                                                                          \
    unsigned posN = 0;                                                         \
    for (;;) {                                                                 \
      if (pos >= seq_len - k + 1) {                                            \
        break; /* no room left for a full window */                            \
      }                                                                        \
      if (NTHASH_CALL) {                                                       \
        break; /* window hashed successfully */                                \
      }                                                                        \
      pos += posN + 1;                                                         \
    }                                                                          \
    const bool found = pos <= seq_len - k;                                     \
    pos = found ? pos + 1 : std::numeric_limits<std::size_t>::max();           \
    return found;                                                              \
  }
196 
/*
 * Generates CLASS::roll(): advance the hasher by one window.
 *
 * - On the first call (pos == 0) the work is delegated to init().
 * - When no further window fits, false is returned and nothing is read.
 * - If the character entering the window maps to seedN in seed_tab
 *   (presumably a character that cannot be hashed - confirm in nthash.hpp),
 *   the cursor jumps past it and hashing restarts through init().
 * - Otherwise NTHASH_CALL performs the incremental update, pos is advanced
 *   and true is returned.
 */
// NOLINTNEXTLINE
#define ROLLING_HASH_ROLL(CLASS, NTHASH_CALL)                                  \
  inline bool CLASS::roll()                                                    \
  {                                                                            \
    if (pos == 0) {                                                            \
      return init();                                                           \
    }                                                                          \
    /* k > seq_len must be tested on its own: seq_len - k is unsigned and   */ \
    /* wraps, so for k == seq_len + 1 the exhausted marker stored by init() */ \
    /* would not compare greater and the code below would read out of       */ \
    /* bounds and wrap pos back to 0.                                       */ \
    if (k > seq_len || pos > seq_len - k) {                                    \
      return false;                                                            \
    }                                                                          \
    if (seed_tab[static_cast<unsigned char>(seq[pos + k - 1])] == seedN) {     \
      pos += k; /* first index after the offending character */                \
      return init();                                                           \
    }                                                                          \
    (NTHASH_CALL);                                                             \
    ++pos;                                                                     \
    return true;                                                               \
  }
215 
216 ROLLING_HASH_INIT(RollingHash,
217  NTMC64(seq + pos,
218  k,
219  hash_num,
220  forward_hash,
221  reverse_hash,
222  posN,
223  hashes_vector.data()))
224 ROLLING_HASH_ROLL(RollingHash,
225  NTMC64(seq[pos - 1],
226  seq[pos - 1 + k],
227  k,
228  hash_num,
229  forward_hash,
230  reverse_hash,
231  hashes_vector.data()))
232 
233 ROLLING_HASH_INIT(SeedRollingHash,
234  NTMSM64(seq + pos,
235  seeds,
236  k,
237  seeds.size(),
238  hash_num_per_seed,
239  forward_hash,
240  reverse_hash,
241  posN,
242  hashes_vector.data()))
243 ROLLING_HASH_ROLL(SeedRollingHash,
244  NTMSM64(seq + pos,
245  seeds,
246  seq[pos - 1],
247  seq[pos - 1 + k],
248  k,
249  seeds.size(),
250  hash_num_per_seed,
251  forward_hash,
252  reverse_hash,
253  hashes_vector.data()))
254 
// The generator macros have served their purpose once init()/roll() have been
// emitted for both classes; undefine them so they do not leak to includers.
255 #undef ROLLING_HASH_INIT
256 #undef ROLLING_HASH_ROLL
257 
258 inline const uint64_t*
259 RollingHash::hashes() const
260 {
261  return hashes_vector.data();
262 }
263 
264 } // namespace btllib
265 
266 #endif
btllib::RollingHash::RollingHash
RollingHash(const char *seq, size_t seq_len, unsigned k, unsigned hash_num)
Definition: rolling_hash.hpp:103
btllib::SeedRollingHash
Definition: rolling_hash.hpp:69
btllib::RollingHash::init
bool init()
btllib::RollingHash
Definition: rolling_hash.hpp:26