1#ifndef BTLLIB_INDEXLR_HPP
2#define BTLLIB_INDEXLR_HPP
4#include "btllib/bloom_filter.hpp"
5#include "btllib/nthash.hpp"
6#include "btllib/order_queue.hpp"
7#include "btllib/seq_reader.hpp"
8#include "btllib/status.hpp"
9#include "btllib/util.hpp"
// Bit values combined into the `flags` member and referenced as Flag::<NAME>.
// NO_ID (bit 0): when set, output_id() returns false.
36 static const unsigned NO_ID = 1;
// BX (bit 1): when set, output_bx() returns true.
38 static const unsigned BX = 2;
// SEQ (bit 2): when set, output_seq() returns true.
40 static const unsigned SEQ = 4;
// Returns true when the Flag::NO_ID bit is NOT set in `flags`.
// The complement (~flags) inverts the test, so IDs are output unless
// NO_ID was explicitly requested.
55 bool output_id()
const {
return bool(~flags &
Flag::NO_ID); }
// Returns true when the Flag::BX bit is set in `flags`.
56 bool output_bx()
const {
return bool(flags &
Flag::BX); }
// Returns true when the Flag::SEQ bit is set in `flags`.
57 bool output_seq()
const {
return bool(flags &
Flag::SEQ); }
79 uint64_t min_hash = 0, out_hash = 0;
95 std::vector<Minimizer> minimizers)
98 , barcode(std::move(barcode))
100 , minimizers(std::move(minimizers))
107 std::vector<Minimizer> minimizers;
109 operator bool()
const
111 return !
id.empty() || !barcode.empty() || !minimizers.empty();
139 unsigned threads = 5,
140 bool verbose =
false,
146 void close() noexcept;
148 static const
size_t MAX_SIMULTANEOUS_INDEXLRS = 256;
// Advances the iterator by replacing the held `record` with the result of
// indexlr.read().
155 void operator++() { record = indexlr.read(); }
156 bool operator!=(
const RecordIterator& i)
158 return bool(record) || bool(i.record);
// Returns the held record by value, moving it out of the iterator.
// After this call `record` is in a moved-from state until the next
// operator++ assigns a fresh one.
160 Record operator*() {
return std::move(record); }
164 auto val = operator*();
172 RecordIterator(
Indexlr& indexlr,
bool end)
// Returns a RecordIterator bound to this Indexlr, constructed with the
// `end` argument set to false.
185 RecordIterator
begin() {
return RecordIterator(*
this,
false); }
// Returns a RecordIterator bound to this Indexlr, constructed with the
// `end` argument set to true.
186 RecordIterator end() {
return RecordIterator(*
this,
true); }
189 static std::string extract_barcode(
const std::string&
id,
190 const std::string& comment);
191 static void filter_hashed_kmer(Indexlr::HashedKmer& hk,
194 const BloomFilter& filter_in_bf,
195 const BloomFilter& filter_out_bf);
196 static void calc_minimizer(
197 const std::vector<Indexlr::HashedKmer>& hashed_kmers_buffer,
198 const Indexlr::Minimizer*& min_current,
200 ssize_t& min_idx_left,
201 ssize_t& min_idx_right,
202 ssize_t& min_pos_prev,
204 std::vector<Indexlr::Minimizer>& minimizers);
205 std::vector<Minimizer> minimize(
const std::string& seq)
const;
207 const std::string seqfile;
209 const unsigned flags;
212 std::atomic<bool> closed{
false };
214 static const BloomFilter& dummy_bf()
216 static const BloomFilter var;
220 const std::reference_wrapper<const BloomFilter> filter_in_bf;
221 const std::reference_wrapper<const BloomFilter> filter_out_bf;
222 bool filter_in_enabled;
223 bool filter_out_enabled;
226 OrderQueueMPSC<Record> output_queue;
228 using OutputQueueType =
decltype(output_queue);
229 static std::unique_ptr<OutputQueueType::Block>* ready_blocks_array()
231 thread_local static std::unique_ptr<
decltype(output_queue)::Block>
232 var[MAX_SIMULTANEOUS_INDEXLRS];
236 static long* ready_blocks_owners()
238 thread_local static long var[MAX_SIMULTANEOUS_INDEXLRS] = { 0 };
242 static size_t* ready_blocks_current()
244 thread_local static size_t var[MAX_SIMULTANEOUS_INDEXLRS] = { 0 };
248 static std::atomic<long>& last_id()
250 static std::atomic<long> var(0);
// Launches a std::thread that runs do_work(this) and stores it in `t`.
257 void start() { t = std::thread(do_work,
this); }
// Joins the thread held in `t` (blocks until it finishes).
258 void join() { t.join(); }
// Stores `id` in this worker's `id` member; `this->` disambiguates the
// member from the parameter of the same name.
259 void set_id(
const int id) { this->
id = id; }
// Copy assignment and move assignment are both deleted: a Worker cannot be
// assigned to after construction.
261 Worker& operator=(
const Worker& worker) =
delete;
262 Worker& operator=(Worker&& worker) =
delete;
267 Worker(
const Worker& worker)
268 : Worker(worker.indexlr)
270 Worker(Worker&& worker) noexcept
271 : Worker(worker.indexlr)
// Static entry point passed to std::thread in start(); forwards to the
// given worker's work() member function.
276 static void do_work(Worker* worker) { worker->work(); }
283 std::vector<Worker> workers;
285 std::mutex last_block_num_mutex;
286 uint64_t last_block_num = 0;
287 bool last_block_num_valid =
false;
293 const unsigned flags,
294 const unsigned threads,
298 : seqfile(std::move(seqfile))
304 , filter_in_bf(filter_in() ? bf1 :
Indexlr::dummy_bf())
305 , filter_out_bf(filter_out() ? filter_in() ? bf2 : bf1 :
Indexlr::dummy_bf())
306 , filter_in_enabled(filter_in())
307 , filter_out_enabled(filter_out())
308 , reader(this->seqfile,
311 , output_queue(reader.get_buffer_size(), reader.get_block_size())
312 , workers(std::vector<Worker>(threads, Worker(*this)))
313 , end_barrier(threads)
316 "Indexlr: no mode selected, either short or long mode flag must "
319 "Indexlr: short and long mode are mutually exclusive.");
321 "Indexlr: Number of processing threads cannot be 0.");
323 for (
auto& worker : workers) {
324 worker.set_id(id_counter++);
329inline Indexlr::~Indexlr()
335Indexlr::close() noexcept
337 bool closed_expected =
false;
338 if (closed.compare_exchange_strong(closed_expected,
true)) {
341 output_queue.close();
342 for (
auto& worker : workers) {
345 }
catch (
const std::system_error& e) {
346 log_error(
"Indexlr thread join failure: " + std::string(e.what()));
347 std::exit(EXIT_FAILURE);
380Indexlr::extract_barcode(
const std::string&
id,
const std::string& comment)
382 const static std::string barcode_prefix =
"BX:Z:";
384 const auto space_pos = comment.find(
' ');
385 if (space_pos != std::string::npos) {
386 return comment.substr(barcode_prefix.size(),
387 space_pos - barcode_prefix.size());
389 return comment.substr(barcode_prefix.size());
391 const auto pound_pos =
id.find(
'#');
392 if (pound_pos != std::string::npos) {
393 const auto slash_pos =
id.find(
'/');
394 if (slash_pos > pound_pos) {
395 return id.substr(pound_pos + 1, slash_pos - (pound_pos + 1));
402Indexlr::filter_hashed_kmer(Indexlr::HashedKmer& hk,
405 const BloomFilter& filter_in_bf,
406 const BloomFilter& filter_out_bf)
408 if (filter_in && filter_out) {
409 std::vector<uint64_t> tmp;
410 tmp = { hk.min_hash };
411 if (!filter_in_bf.contains(tmp) || filter_out_bf.contains(tmp)) {
412 hk.min_hash = std::numeric_limits<uint64_t>::max();
414 }
else if (filter_in) {
415 if (!filter_in_bf.contains({ hk.min_hash })) {
416 hk.min_hash = std::numeric_limits<uint64_t>::max();
418 }
else if (filter_out) {
419 if (filter_out_bf.contains({ hk.min_hash })) {
420 hk.min_hash = std::numeric_limits<uint64_t>::max();
426Indexlr::calc_minimizer(
427 const std::vector<Indexlr::HashedKmer>& hashed_kmers_buffer,
428 const Indexlr::Minimizer*& min_current,
430 ssize_t& min_idx_left,
431 ssize_t& min_idx_right,
432 ssize_t& min_pos_prev,
434 std::vector<Indexlr::Minimizer>& minimizers)
436 min_idx_left = ssize_t(idx + 1 - w);
437 min_idx_right = ssize_t(idx + 1);
438 const auto& min_left =
439 hashed_kmers_buffer[min_idx_left % hashed_kmers_buffer.size()];
440 const auto& min_right =
441 hashed_kmers_buffer[(min_idx_right - 1) % hashed_kmers_buffer.size()];
443 if (min_current ==
nullptr || min_current->pos < min_left.pos) {
444 min_current = &min_left;
446 for (ssize_t i = min_idx_left; i < min_idx_right; i++) {
447 const auto& min_i = hashed_kmers_buffer[i % hashed_kmers_buffer.size()];
448 if (min_i.min_hash <= min_current->min_hash) {
449 min_current = &min_i;
452 }
else if (min_right.min_hash <= min_current->min_hash) {
453 min_current = &min_right;
455 if (ssize_t(min_current->pos) > min_pos_prev &&
456 min_current->min_hash != std::numeric_limits<uint64_t>::max()) {
457 min_pos_prev = ssize_t(min_current->pos);
458 minimizers.push_back(*min_current);
462inline std::vector<Indexlr::Minimizer>
463Indexlr::minimize(
const std::string& seq)
const
465 if ((k > seq.size()) || (w > seq.size() - k + 1)) {
468 std::vector<Minimizer> minimizers;
469 minimizers.reserve(2 * (seq.size() - k + 1) / w);
470 std::vector<HashedKmer> hashed_kmers_buffer(w + 1);
471 ssize_t min_idx_left, min_idx_right, min_pos_prev = -1;
472 const Minimizer* min_current =
nullptr;
474 for (NtHash nh(seq, 2, k); nh.roll(); ++idx) {
475 auto& hk = hashed_kmers_buffer[idx % hashed_kmers_buffer.size()];
477 hk = HashedKmer(nh.hashes()[0],
481 output_seq() ? seq.substr(nh.get_pos(), k) :
"");
484 hk, filter_in(), filter_out(), filter_in_bf.get(), filter_out_bf.get());
487 calc_minimizer(hashed_kmers_buffer,
500inline Indexlr::Record
503 if (ready_blocks_owners()[
id % MAX_SIMULTANEOUS_INDEXLRS] !=
id) {
504 ready_blocks_array()[
id % MAX_SIMULTANEOUS_INDEXLRS] =
505 std::unique_ptr<decltype(output_queue)::Block>(
506 new decltype(output_queue)::Block(reader.get_block_size()));
507 ready_blocks_owners()[
id % MAX_SIMULTANEOUS_INDEXLRS] = id;
508 ready_blocks_current()[
id % MAX_SIMULTANEOUS_INDEXLRS] = 0;
510 auto& block = *(ready_blocks_array()[
id % MAX_SIMULTANEOUS_INDEXLRS]);
511 auto& current = ready_blocks_current()[
id % MAX_SIMULTANEOUS_INDEXLRS];
512 if (current >= block.count) {
514 output_queue.read(block);
515 if (block.count == 0) {
516 output_queue.close();
517 block =
decltype(output_queue)::Block(reader.get_block_size());
522 return std::move(block.data[current++]);
526Indexlr::Worker::work()
528 decltype(indexlr.output_queue)::Block output_block(
529 indexlr.reader.get_block_size());
530 uint64_t last_block_num = 0;
531 bool last_block_num_valid =
false;
533 auto input_block = indexlr.reader.read_block();
534 if (input_block.count == 0) {
538 output_block.num = input_block.num;
539 for (
size_t idx = 0; idx < input_block.count; idx++) {
541 auto& reader_record = input_block.data[idx];
542 record.num = reader_record.num;
543 if (indexlr.output_id()) {
544 record.id = std::move(reader_record.id);
546 if (indexlr.output_bx()) {
548 indexlr.extract_barcode(record.id, reader_record.comment);
550 record.readlen = reader_record.seq.size();
552 check_info(indexlr.verbose && indexlr.k > record.readlen,
553 "Indexlr: skipped seq " + std::to_string(record.num) +
555 std::to_string(record.num * (indexlr.reader.get_format() ==
556 SeqReader::Format::FASTA
560 "; k (" + std::to_string(indexlr.k) +
") > seq length (" +
561 std::to_string(record.readlen) +
")");
563 check_info(indexlr.verbose && indexlr.w > record.readlen - indexlr.k + 1,
564 "Indexlr: skipped seq " + std::to_string(record.num) +
566 std::to_string(record.num * (indexlr.reader.get_format() ==
567 SeqReader::Format::FASTA
571 "; w (" + std::to_string(indexlr.w) +
") > # of hashes (" +
572 std::to_string(record.readlen - indexlr.k + 1) +
")");
574 if (indexlr.k <= record.readlen &&
575 indexlr.w <= record.readlen - indexlr.k + 1) {
576 record.minimizers = indexlr.minimize(reader_record.seq);
578 record.minimizers = {};
581 output_block.data[output_block.count++] = std::move(record);
583 if (output_block.count > 0) {
584 last_block_num = output_block.num;
585 last_block_num_valid =
true;
586 indexlr.output_queue.write(output_block);
587 output_block.count = 0;
590 if (last_block_num_valid) {
591 std::unique_lock<std::mutex> lock(indexlr.last_block_num_mutex);
592 indexlr.last_block_num = std::max(indexlr.last_block_num, last_block_num);
593 indexlr.last_block_num_valid =
true;
596 indexlr.end_barrier.wait();
597 if (last_block_num_valid && indexlr.last_block_num_valid &&
598 last_block_num == indexlr.last_block_num) {
599 output_block.num = last_block_num + 1;
600 indexlr.output_queue.write(output_block);
601 }
else if (!indexlr.last_block_num_valid &&
id == 0) {
602 output_block.num = 0;
603 indexlr.output_queue.write(output_block);
Definition: bloom_filter.hpp:67
Definition: indexlr.hpp:26
Indexlr(std::string seqfile, size_t k, size_t w, unsigned flags=0, unsigned threads=5, bool verbose=false, const btllib::BloomFilter &bf1=Indexlr::dummy_bf(), const btllib::BloomFilter &bf2=Indexlr::dummy_bf())
Definition: indexlr.hpp:290
RecordIterator begin()
Definition: indexlr.hpp:185
Definition: seq_reader.hpp:43
Definition: bloom_filter.hpp:16
std::string join(const std::vector< std::string > &s, const std::string &delim)
void check_error(bool condition, const std::string &msg)
void log_error(const std::string &msg)
bool startswith(std::string s, std::string prefix)
void check_info(bool condition, const std::string &msg)
Definition: indexlr.hpp:34
static const unsigned BX
Definition: indexlr.hpp:38
static const unsigned SEQ
Definition: indexlr.hpp:40
static const unsigned LONG_MODE
Definition: indexlr.hpp:52
static const unsigned FILTER_IN
Definition: indexlr.hpp:43
static const unsigned FILTER_OUT
Definition: indexlr.hpp:48
static const unsigned NO_ID
Definition: indexlr.hpp:36
static const unsigned SHORT_MODE
Definition: indexlr.hpp:50
Definition: indexlr.hpp:64
Definition: indexlr.hpp:88