btllib
indexlr.hpp
1#ifndef BTLLIB_INDEXLR_HPP
2#define BTLLIB_INDEXLR_HPP
3
4#include "btllib/bloom_filter.hpp"
5#include "btllib/nthash.hpp"
6#include "btllib/order_queue.hpp"
7#include "btllib/seq_reader.hpp"
8#include "btllib/status.hpp"
9#include "btllib/util.hpp"
10
11#include <algorithm>
12#include <atomic>
13#include <cstdlib>
14#include <cstring>
15#include <functional>
16#include <iostream>
17#include <limits>
18#include <memory>
19#include <string>
20#include <thread>
21#include <vector>
22
23namespace btllib {
24
26{
27
28public:
/* Flag is a struct of constants rather than an enum because:
 * 1) unscoped enums are not name qualified, so enumerators can collide
 * 2) scoped (class) enums can't be implicitly converted into integers,
 *    which the bitwise tests on `flags` rely on
 */
struct Flag
{
  // Each flag occupies its own bit so flags can be OR-ed together.
  static const unsigned NO_ID = 1U << 0U;      // output_id() is false when set
  static const unsigned BX = 1U << 1U;         // enables output_bx()
  static const unsigned SEQ = 1U << 2U;        // enables output_seq()
  static const unsigned FILTER_IN = 1U << 3U;  // enables filter_in()
  static const unsigned FILTER_OUT = 1U << 4U; // enables filter_out()
  static const unsigned SHORT_MODE = 1U << 5U; // enables short_mode()
  static const unsigned LONG_MODE = 1U << 6U;  // enables long_mode()
};
54
55 bool output_id() const { return bool(~flags & Flag::NO_ID); }
56 bool output_bx() const { return bool(flags & Flag::BX); }
57 bool output_seq() const { return bool(flags & Flag::SEQ); }
58 bool filter_in() const { return bool(flags & Flag::FILTER_IN); }
59 bool filter_out() const { return bool(flags & Flag::FILTER_OUT); }
60 bool short_mode() const { return bool(flags & Flag::SHORT_MODE); }
61 bool long_mode() const { return bool(flags & Flag::LONG_MODE); }
62
// A hashed k-mer: the two hash values produced for it, its position in the
// read, its strand flag and (optionally) its sequence.
struct Minimizer
{
  Minimizer() = default;

  Minimizer(uint64_t min_hash,
            uint64_t out_hash,
            size_t pos,
            bool forward,
            std::string seq)
    : min_hash{ min_hash }
    , out_hash{ out_hash }
    , pos{ pos }
    , forward{ forward }
    , seq{ std::move(seq) }
  {}

  // Hash used when comparing k-mers to pick the minimizer.
  uint64_t min_hash = 0;
  // Second hash value stored alongside min_hash.
  uint64_t out_hash = 0;
  // Position of the k-mer within the sequence.
  size_t pos = 0;
  // Strand flag as reported by the hasher.
  bool forward = false;
  // k-mer sequence; left empty unless sequence output is requested.
  std::string seq;
};

// Every hashed k-mer is a minimizer candidate, so both share one type.
using HashedKmer = Minimizer;
86
87 struct Record
88 {
89 Record() {}
90
91 Record(size_t num,
92 std::string id,
93 std::string barcode,
94 size_t readlen,
95 std::vector<Minimizer> minimizers)
96 : num(num)
97 , id(std::move(id))
98 , barcode(std::move(barcode))
99 , readlen(readlen)
100 , minimizers(std::move(minimizers))
101 {}
102
103 size_t num = 0;
104 std::string id;
105 std::string barcode;
106 size_t readlen = 0;
107 std::vector<Minimizer> minimizers;
108
109 operator bool() const
110 {
111 return !id.empty() || !barcode.empty() || !minimizers.empty();
112 }
113 };
114
115 Record read();
116
135 Indexlr(std::string seqfile,
136 size_t k,
137 size_t w,
138 unsigned flags = 0,
139 unsigned threads = 5,
140 bool verbose = false,
141 const btllib::BloomFilter& bf1 = Indexlr::dummy_bf(),
142 const btllib::BloomFilter& bf2 = Indexlr::dummy_bf());
143
144 ~Indexlr();
145
146 void close() noexcept;
147
148 static const size_t MAX_SIMULTANEOUS_INDEXLRS = 256;
149
152 class RecordIterator
153 {
154 public:
155 void operator++() { record = indexlr.read(); }
156 bool operator!=(const RecordIterator& i)
157 {
158 return bool(record) || bool(i.record);
159 }
160 Record operator*() { return std::move(record); }
161 // For wrappers
162 Record next()
163 {
164 auto val = operator*();
165 operator++();
166 return val;
167 }
168
169 private:
170 friend Indexlr;
171
172 RecordIterator(Indexlr& indexlr, bool end)
173 : indexlr(indexlr)
174 {
175 if (!end) {
176 operator++();
177 }
178 }
179
180 Indexlr& indexlr;
181 Record record;
182 };
184
185 RecordIterator begin() { return RecordIterator(*this, false); }
186 RecordIterator end() { return RecordIterator(*this, true); }
187
188private:
189 static std::string extract_barcode(const std::string& id,
190 const std::string& comment);
191 static void filter_hashed_kmer(Indexlr::HashedKmer& hk,
192 bool filter_in,
193 bool filter_out,
194 const BloomFilter& filter_in_bf,
195 const BloomFilter& filter_out_bf);
196 static void calc_minimizer(
197 const std::vector<Indexlr::HashedKmer>& hashed_kmers_buffer,
198 const Indexlr::Minimizer*& min_current,
199 size_t idx,
200 ssize_t& min_idx_left,
201 ssize_t& min_idx_right,
202 ssize_t& min_pos_prev,
203 size_t w,
204 std::vector<Indexlr::Minimizer>& minimizers);
205 std::vector<Minimizer> minimize(const std::string& seq) const;
206
207 const std::string seqfile;
208 const size_t k, w;
209 const unsigned flags;
210 const bool verbose;
211 const long id;
212 std::atomic<bool> closed{ false };
213
214 static const BloomFilter& dummy_bf()
215 {
216 static const BloomFilter var;
217 return var;
218 }
219
220 const std::reference_wrapper<const BloomFilter> filter_in_bf;
221 const std::reference_wrapper<const BloomFilter> filter_out_bf;
222 bool filter_in_enabled;
223 bool filter_out_enabled;
224
225 SeqReader reader;
226 OrderQueueMPSC<Record> output_queue;
227
228 using OutputQueueType = decltype(output_queue);
229 static std::unique_ptr<OutputQueueType::Block>* ready_blocks_array()
230 {
231 thread_local static std::unique_ptr<decltype(output_queue)::Block>
232 var[MAX_SIMULTANEOUS_INDEXLRS];
233 return var;
234 }
235
236 static long* ready_blocks_owners()
237 {
238 thread_local static long var[MAX_SIMULTANEOUS_INDEXLRS] = { 0 };
239 return var;
240 }
241
242 static size_t* ready_blocks_current()
243 {
244 thread_local static size_t var[MAX_SIMULTANEOUS_INDEXLRS] = { 0 };
245 return var;
246 }
247
// Process-wide counter from which each Indexlr draws its id (the
// constructor pre-increments it, so the first id handed out is 1).
static std::atomic<long>& last_id()
{
  static std::atomic<long> counter{ 0 };
  return counter;
}
253
254 class Worker
255 {
256 public:
257 void start() { t = std::thread(do_work, this); }
258 void join() { t.join(); }
259 void set_id(const int id) { this->id = id; }
260
261 Worker& operator=(const Worker& worker) = delete;
262 Worker& operator=(Worker&& worker) = delete;
263
264 Worker(Indexlr& indexlr)
265 : indexlr(indexlr)
266 {}
267 Worker(const Worker& worker)
268 : Worker(worker.indexlr)
269 {}
270 Worker(Worker&& worker) noexcept
271 : Worker(worker.indexlr)
272 {}
273
274 private:
275 void work();
276 static void do_work(Worker* worker) { worker->work(); }
277
278 int id = -1;
279 Indexlr& indexlr;
280 std::thread t;
281 };
282
283 std::vector<Worker> workers;
284 Barrier end_barrier;
285 std::mutex last_block_num_mutex;
286 uint64_t last_block_num = 0;
287 bool last_block_num_valid = false;
288};
289
290inline Indexlr::Indexlr(std::string seqfile,
291 const size_t k,
292 const size_t w,
293 const unsigned flags,
294 const unsigned threads,
295 const bool verbose,
296 const BloomFilter& bf1,
297 const BloomFilter& bf2)
298 : seqfile(std::move(seqfile))
299 , k(k)
300 , w(w)
301 , flags(flags)
302 , verbose(verbose)
303 , id(++last_id())
304 , filter_in_bf(filter_in() ? bf1 : Indexlr::dummy_bf())
305 , filter_out_bf(filter_out() ? filter_in() ? bf2 : bf1 : Indexlr::dummy_bf())
306 , filter_in_enabled(filter_in())
307 , filter_out_enabled(filter_out())
308 , reader(this->seqfile,
309 short_mode() ? SeqReader::Flag::SHORT_MODE
310 : SeqReader::Flag::LONG_MODE)
311 , output_queue(reader.get_buffer_size(), reader.get_block_size())
312 , workers(std::vector<Worker>(threads, Worker(*this)))
313 , end_barrier(threads)
314{
315 check_error(!short_mode() && !long_mode(),
316 "Indexlr: no mode selected, either short or long mode flag must "
317 "be provided.");
318 check_error(short_mode() && long_mode(),
319 "Indexlr: short and long mode are mutually exclusive.");
320 check_error(threads == 0,
321 "Indexlr: Number of processing threads cannot be 0.");
322 int id_counter = 0;
323 for (auto& worker : workers) {
324 worker.set_id(id_counter++);
325 worker.start();
326 }
327}
328
329inline Indexlr::~Indexlr()
330{
331 close();
332}
333
334inline void
335Indexlr::close() noexcept
336{
337 bool closed_expected = false;
338 if (closed.compare_exchange_strong(closed_expected, true)) {
339 try {
340 reader.close();
341 output_queue.close();
342 for (auto& worker : workers) {
343 worker.join();
344 }
345 } catch (const std::system_error& e) {
346 log_error("Indexlr thread join failure: " + std::string(e.what()));
347 std::exit(EXIT_FAILURE); // NOLINT(concurrency-mt-unsafe)
348 }
349 }
350}
351
// Minimerize a sequence: Find the minimizers of a vector of hash values
// representing a sequence.
/* Algorithm
v is a vector of non-negative integers
w is the window size
Invariants
    0 <  w <= v.size() - 1
    0 <= l <= r <= v.size() - 1
Initial conditions
    M    = NIL        Final set of minimizers, empty initially
    min  = -1         Minimum element
    i    = -1         Index of minimum element
    prev = -1         Index of previous minimum element
    l    = 0          Index of left end of window
    r    = l + w - 1  Index of right end of window
Computation
At each window, if the previous minimum is out of scope, find the new,
right-most, minimum or else, check with only the right-most element to determine
if that is the new minimum. A minimizer is added to the final vector only if
its index has changed.
for each window of v bounded by [l, r] {
    if (i < l)
        i = index of minimum element in [l, r], furthest from l.
    else if (v[r] <= v[i])
        i = r
    min = v[i]
    if (i != prev) {
        prev = i
        M <- M + min
    }
    l = l + 1         Move window's left bound by one element
    r = l + w - 1     Set window's right bound
}*/
378
379inline std::string
380Indexlr::extract_barcode(const std::string& id, const std::string& comment)
381{
382 const static std::string barcode_prefix = "BX:Z:";
383 if (startswith(comment, barcode_prefix)) {
384 const auto space_pos = comment.find(' ');
385 if (space_pos != std::string::npos) {
386 return comment.substr(barcode_prefix.size(),
387 space_pos - barcode_prefix.size());
388 }
389 return comment.substr(barcode_prefix.size());
390 }
391 const auto pound_pos = id.find('#');
392 if (pound_pos != std::string::npos) {
393 const auto slash_pos = id.find('/');
394 if (slash_pos > pound_pos) {
395 return id.substr(pound_pos + 1, slash_pos - (pound_pos + 1));
396 }
397 }
398 return "NA";
399}
400
401inline void
402Indexlr::filter_hashed_kmer(Indexlr::HashedKmer& hk,
403 bool filter_in,
404 bool filter_out,
405 const BloomFilter& filter_in_bf,
406 const BloomFilter& filter_out_bf)
407{
408 if (filter_in && filter_out) {
409 std::vector<uint64_t> tmp;
410 tmp = { hk.min_hash };
411 if (!filter_in_bf.contains(tmp) || filter_out_bf.contains(tmp)) {
412 hk.min_hash = std::numeric_limits<uint64_t>::max();
413 }
414 } else if (filter_in) {
415 if (!filter_in_bf.contains({ hk.min_hash })) {
416 hk.min_hash = std::numeric_limits<uint64_t>::max();
417 }
418 } else if (filter_out) {
419 if (filter_out_bf.contains({ hk.min_hash })) {
420 hk.min_hash = std::numeric_limits<uint64_t>::max();
421 }
422 }
423}
424
425inline void
426Indexlr::calc_minimizer(
427 const std::vector<Indexlr::HashedKmer>& hashed_kmers_buffer,
428 const Indexlr::Minimizer*& min_current,
429 const size_t idx,
430 ssize_t& min_idx_left,
431 ssize_t& min_idx_right,
432 ssize_t& min_pos_prev,
433 const size_t w,
434 std::vector<Indexlr::Minimizer>& minimizers)
435{
436 min_idx_left = ssize_t(idx + 1 - w);
437 min_idx_right = ssize_t(idx + 1);
438 const auto& min_left =
439 hashed_kmers_buffer[min_idx_left % hashed_kmers_buffer.size()];
440 const auto& min_right =
441 hashed_kmers_buffer[(min_idx_right - 1) % hashed_kmers_buffer.size()];
442
443 if (min_current == nullptr || min_current->pos < min_left.pos) {
444 min_current = &min_left;
445 // Use of operator '<=' returns the minimum that is furthest from left.
446 for (ssize_t i = min_idx_left; i < min_idx_right; i++) {
447 const auto& min_i = hashed_kmers_buffer[i % hashed_kmers_buffer.size()];
448 if (min_i.min_hash <= min_current->min_hash) {
449 min_current = &min_i;
450 }
451 }
452 } else if (min_right.min_hash <= min_current->min_hash) {
453 min_current = &min_right;
454 }
455 if (ssize_t(min_current->pos) > min_pos_prev &&
456 min_current->min_hash != std::numeric_limits<uint64_t>::max()) {
457 min_pos_prev = ssize_t(min_current->pos);
458 minimizers.push_back(*min_current);
459 }
460}
461
462inline std::vector<Indexlr::Minimizer>
463Indexlr::minimize(const std::string& seq) const
464{
465 if ((k > seq.size()) || (w > seq.size() - k + 1)) {
466 return {};
467 }
468 std::vector<Minimizer> minimizers;
469 minimizers.reserve(2 * (seq.size() - k + 1) / w);
470 std::vector<HashedKmer> hashed_kmers_buffer(w + 1);
471 ssize_t min_idx_left, min_idx_right, min_pos_prev = -1;
472 const Minimizer* min_current = nullptr;
473 size_t idx = 0;
474 for (NtHash nh(seq, 2, k); nh.roll(); ++idx) {
475 auto& hk = hashed_kmers_buffer[idx % hashed_kmers_buffer.size()];
476
477 hk = HashedKmer(nh.hashes()[0],
478 nh.hashes()[1],
479 nh.get_pos(),
480 nh.forward(),
481 output_seq() ? seq.substr(nh.get_pos(), k) : "");
482
483 filter_hashed_kmer(
484 hk, filter_in(), filter_out(), filter_in_bf.get(), filter_out_bf.get());
485
486 if (idx + 1 >= w) {
487 calc_minimizer(hashed_kmers_buffer,
488 min_current,
489 idx,
490 min_idx_left,
491 min_idx_right,
492 min_pos_prev,
493 w,
494 minimizers);
495 }
496 }
497 return minimizers;
498}
499
500inline Indexlr::Record
501Indexlr::read()
502{
503 if (ready_blocks_owners()[id % MAX_SIMULTANEOUS_INDEXLRS] != id) {
504 ready_blocks_array()[id % MAX_SIMULTANEOUS_INDEXLRS] =
505 std::unique_ptr<decltype(output_queue)::Block>(
506 new decltype(output_queue)::Block(reader.get_block_size()));
507 ready_blocks_owners()[id % MAX_SIMULTANEOUS_INDEXLRS] = id;
508 ready_blocks_current()[id % MAX_SIMULTANEOUS_INDEXLRS] = 0;
509 }
510 auto& block = *(ready_blocks_array()[id % MAX_SIMULTANEOUS_INDEXLRS]);
511 auto& current = ready_blocks_current()[id % MAX_SIMULTANEOUS_INDEXLRS];
512 if (current >= block.count) {
513 block.count = 0;
514 output_queue.read(block);
515 if (block.count == 0) {
516 output_queue.close();
517 block = decltype(output_queue)::Block(reader.get_block_size());
518 return Record();
519 }
520 current = 0;
521 }
522 return std::move(block.data[current++]);
523}
524
525inline void
526Indexlr::Worker::work()
527{
528 decltype(indexlr.output_queue)::Block output_block(
529 indexlr.reader.get_block_size());
530 uint64_t last_block_num = 0;
531 bool last_block_num_valid = false;
532 for (;;) {
533 auto input_block = indexlr.reader.read_block();
534 if (input_block.count == 0) {
535 break;
536 }
537
538 output_block.num = input_block.num;
539 for (size_t idx = 0; idx < input_block.count; idx++) {
540 Record record;
541 auto& reader_record = input_block.data[idx];
542 record.num = reader_record.num;
543 if (indexlr.output_id()) {
544 record.id = std::move(reader_record.id);
545 }
546 if (indexlr.output_bx()) {
547 record.barcode =
548 indexlr.extract_barcode(record.id, reader_record.comment);
549 }
550 record.readlen = reader_record.seq.size();
551
552 check_info(indexlr.verbose && indexlr.k > record.readlen,
553 "Indexlr: skipped seq " + std::to_string(record.num) +
554 " on line " +
555 std::to_string(record.num * (indexlr.reader.get_format() ==
556 SeqReader::Format::FASTA
557 ? 2
558 : 4) +
559 2) +
560 "; k (" + std::to_string(indexlr.k) + ") > seq length (" +
561 std::to_string(record.readlen) + ")");
562
563 check_info(indexlr.verbose && indexlr.w > record.readlen - indexlr.k + 1,
564 "Indexlr: skipped seq " + std::to_string(record.num) +
565 " on line " +
566 std::to_string(record.num * (indexlr.reader.get_format() ==
567 SeqReader::Format::FASTA
568 ? 2
569 : 4) +
570 2) +
571 "; w (" + std::to_string(indexlr.w) + ") > # of hashes (" +
572 std::to_string(record.readlen - indexlr.k + 1) + ")");
573
574 if (indexlr.k <= record.readlen &&
575 indexlr.w <= record.readlen - indexlr.k + 1) {
576 record.minimizers = indexlr.minimize(reader_record.seq);
577 } else {
578 record.minimizers = {};
579 }
580
581 output_block.data[output_block.count++] = std::move(record);
582 }
583 if (output_block.count > 0) {
584 last_block_num = output_block.num;
585 last_block_num_valid = true;
586 indexlr.output_queue.write(output_block);
587 output_block.count = 0;
588 }
589 }
590 if (last_block_num_valid) {
591 std::unique_lock<std::mutex> lock(indexlr.last_block_num_mutex);
592 indexlr.last_block_num = std::max(indexlr.last_block_num, last_block_num);
593 indexlr.last_block_num_valid = true;
594 lock.unlock();
595 }
596 indexlr.end_barrier.wait();
597 if (last_block_num_valid && indexlr.last_block_num_valid &&
598 last_block_num == indexlr.last_block_num) {
599 output_block.num = last_block_num + 1;
600 indexlr.output_queue.write(output_block);
601 } else if (!indexlr.last_block_num_valid && id == 0) {
602 output_block.num = 0;
603 indexlr.output_queue.write(output_block);
604 }
605}
606
607} // namespace btllib
608
609#endif
Definition: bloom_filter.hpp:67
Definition: indexlr.hpp:26
Indexlr(std::string seqfile, size_t k, size_t w, unsigned flags=0, unsigned threads=5, bool verbose=false, const btllib::BloomFilter &bf1=Indexlr::dummy_bf(), const btllib::BloomFilter &bf2=Indexlr::dummy_bf())
Definition: indexlr.hpp:290
RecordIterator begin()
Definition: indexlr.hpp:185
Definition: seq_reader.hpp:43
Definition: bloom_filter.hpp:16
std::string join(const std::vector< std::string > &s, const std::string &delim)
void check_error(bool condition, const std::string &msg)
void log_error(const std::string &msg)
bool startswith(std::string s, std::string prefix)
void check_info(bool condition, const std::string &msg)
Definition: indexlr.hpp:34
static const unsigned BX
Definition: indexlr.hpp:38
static const unsigned SEQ
Definition: indexlr.hpp:40
static const unsigned LONG_MODE
Definition: indexlr.hpp:52
static const unsigned FILTER_IN
Definition: indexlr.hpp:43
static const unsigned FILTER_OUT
Definition: indexlr.hpp:48
static const unsigned NO_ID
Definition: indexlr.hpp:36
static const unsigned SHORT_MODE
Definition: indexlr.hpp:50
Definition: indexlr.hpp:64
Definition: indexlr.hpp:88