btllib
seq_reader.hpp
1#ifndef BTLLIB_SEQ_READER_HPP
2#define BTLLIB_SEQ_READER_HPP
3
4#include "btllib/cstring.hpp"
5#include "btllib/data_stream.hpp"
6#include "btllib/order_queue.hpp"
7#include "btllib/seq.hpp"
8#include "btllib/seq_reader_fasta_module.hpp"
9#include "btllib/seq_reader_fastq_module.hpp"
10#include "btllib/seq_reader_gfa2_module.hpp"
11#include "btllib/seq_reader_multiline_fasta_module.hpp"
12#include "btllib/seq_reader_multiline_fastq_module.hpp"
13#include "btllib/seq_reader_sam_module.hpp"
14#include "btllib/status.hpp"
15
16#include <atomic>
17#include <cctype>
18#include <condition_variable>
19#include <cstdio>
20#include <cstdlib>
21#include <cstring>
22#include <limits>
23#include <memory>
24#include <mutex>
25#include <stack>
26#include <string>
27#include <thread>
28#include <vector>
29
30namespace btllib {
31
43{
44public:
45 /* Has to be a struct and not an enum because:
46 * 1) Non-class enums are not name qualified and can collide
47 * 2) class enums can't be implicitly converted into integers
48 */
// Bit flags accepted by the SeqReader constructor's `flags` argument. Each
// constant occupies its own bit, so several can be OR-ed together and later
// tested individually with a bitwise AND.
struct Flag
{
  // Tested by fold_case(). Presumably requests case folding of sequence
  // characters -- confirm against the implementation.
  static const unsigned FOLD_CASE = 1U << 0U;
  // Tested by trim_masked(). Presumably requests trimming of masked
  // bases -- confirm against the implementation.
  static const unsigned TRIM_MASKED = 1U << 1U;
  // Tested by short_mode().
  static const unsigned SHORT_MODE = 1U << 2U;
  // Tested by long_mode().
  static const unsigned LONG_MODE = 1U << 3U;
};
61
70 SeqReader(const std::string& source_path,
71 unsigned flags,
72 unsigned threads = 3);
73
74 SeqReader(const SeqReader&) = delete;
75 SeqReader(SeqReader&&) = delete;
76
77 SeqReader& operator=(const SeqReader&) = delete;
78 SeqReader& operator=(SeqReader&&) = delete;
79
80 ~SeqReader();
81
82 void close() noexcept;
83
84 bool fold_case() const { return bool(flags & Flag::FOLD_CASE); }
85 bool trim_masked() const { return bool(flags & Flag::TRIM_MASKED); }
86 bool short_mode() const { return bool(flags & Flag::SHORT_MODE); }
87 bool long_mode() const { return bool(flags & Flag::LONG_MODE); }
88
// Input formats tracked by the reader. The numeric values are spelled out
// because the stream insertion operator for Format prints the number, not
// the enumerator name.
enum class Format
{
  UNDETERMINED = 0, // initial value of the reader's `format` member
  FASTA = 1,
  FASTQ = 2,
  SAM = 3,
  GFA2 = 4,
  INVALID = 5
};
98
99 friend std::ostream& operator<<(std::ostream& os, const Format f)
100 {
101 return os << static_cast<int32_t>(f);
102 }
103
104 Format get_format() const { return format; }
105
// One sequence record as handed to callers.
struct Record
{
  // Defaults to the largest size_t value; presumably the record's ordinal in
  // the input once filled in -- confirm against the implementation.
  size_t num{ std::numeric_limits<size_t>::max() };
  std::string id;
  std::string comment;
  std::string seq;
  std::string qual;

  // A record converts to true exactly when it carries a non-empty sequence;
  // the other fields do not take part in the test.
  operator bool() const { return seq.length() != 0; }
};
116
119
121 OrderQueueMPMC<Record>::Block read_block();
122
123 static const size_t MAX_SIMULTANEOUS_SEQREADERS = 256;
124
127 class RecordIterator
128 {
129 public:
130 void operator++() { record = reader.read(); }
131 bool operator!=(const RecordIterator& i)
132 {
133 return bool(record) || bool(i.record);
134 }
135 Record operator*() { return std::move(record); }
136 // For wrappers
137 Record next()
138 {
139 auto val = operator*();
140 operator++();
141 return val;
142 }
143
144 private:
145 friend SeqReader;
146
147 RecordIterator(SeqReader& reader, bool end)
148 : reader(reader)
149 {
150 if (!end) {
151 operator++();
152 }
153 }
154
155 SeqReader& reader;
156 Record record;
157 };
159
160 RecordIterator begin() { return RecordIterator(*this, false); }
161 RecordIterator end() { return RecordIterator(*this, true); }
162
163 size_t get_buffer_size() const { return buffer_size; }
164 size_t get_block_size() const { return block_size; }
165
166 static const size_t SHORT_MODE_BUFFER_SIZE = 32;
167 static const size_t SHORT_MODE_BLOCK_SIZE = 32;
168
169 static const size_t LONG_MODE_BUFFER_SIZE = 4;
170 static const size_t LONG_MODE_BLOCK_SIZE = 1;
171
172 static const size_t FORMAT_BUFFER_SIZE = 16384;
173
174private:
175 struct Buffer
176 {
177
178 Buffer()
179 : data(FORMAT_BUFFER_SIZE)
180 {}
181
182 std::vector<char> data;
183 size_t start = 0;
184 size_t end = 0;
185 bool eof_newline_inserted = false;
186 };
187
// Record held as three CString fields. This is the element type of
// `cstring_queue` and the slot type that the read_from_buffer /
// read_transition / read_from_file templates fill in via `reader_record`.
188 struct RecordCString
189 {
190 CString header;
191 CString seq;
192 CString qual;
193 };
194
// Stored by value on purpose. The constructor takes `const std::string&`,
// which also binds to temporaries (e.g. a string literal argument); a
// reference member would then dangle as soon as the constructor returns.
const std::string source_path;
196 DataSource source;
197 const unsigned flags;
198 const unsigned threads;
199 Format format = Format::UNDETERMINED; // Format of the source file
200 std::atomic<bool> closed{ false };
201 Buffer buffer;
202 std::unique_ptr<std::thread> reader_thread;
203 std::vector<std::unique_ptr<std::thread>> processor_threads;
204 std::mutex format_mutex;
205 std::condition_variable format_cv;
206 std::atomic<bool> reader_end{ false };
207 RecordCString* reader_record = nullptr;
208 const std::atomic<size_t> buffer_size;
209 const std::atomic<size_t> block_size;
210 OrderQueueSPMC<RecordCString> cstring_queue;
211 OrderQueueMPMC<Record> output_queue;
212 std::atomic<size_t> dummy_block_num{ 0 };
213 const long id;
214
215 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
216 thread_local static std::unique_ptr<decltype(output_queue)::Block>
217 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
218 ready_records_array[MAX_SIMULTANEOUS_SEQREADERS];
219
220 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
221 thread_local static long ready_records_owners[MAX_SIMULTANEOUS_SEQREADERS];
222
223 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
224 thread_local static size_t ready_records_current[MAX_SIMULTANEOUS_SEQREADERS];
225
226 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
227 static std::atomic<long> last_id;
228
229 bool load_buffer();
230 void determine_format();
231 void start_reader();
232 void start_processors();
233
234 CString tmp;
235 bool readline_buffer_append(CString& s);
236 static void readline_file(CString& s, FILE* f);
237 void readline_file_append(CString& s, FILE* f);
238 static bool file_at_end(FILE* f);
239 int getc_buffer();
240 int ungetc_buffer(int c);
241
242 void update_cstring_records(OrderQueueSPMC<RecordCString>::Block& records,
243 size_t& counter);
244
246 template<typename Module>
247 void read_from_buffer(Module& module,
248 OrderQueueSPMC<RecordCString>::Block& records,
249 size_t& counter);
250
251 template<typename Module>
252 void read_transition(Module& module,
253 OrderQueueSPMC<RecordCString>::Block& records,
254 size_t& counter);
255
256 template<typename Module>
257 void read_from_file(Module& module,
258 OrderQueueSPMC<RecordCString>::Block& records,
259 size_t& counter);
261
262 friend class SeqReaderFastaModule;
263 SeqReaderFastaModule fasta_module;
264
265 friend class SeqReaderMultilineFastaModule;
266 SeqReaderMultilineFastaModule multiline_fasta_module;
267
268 friend class SeqReaderFastqModule;
269 SeqReaderFastqModule fastq_module;
270
271 friend class SeqReaderMultilineFastqModule;
272 SeqReaderMultilineFastqModule multiline_fastq_module;
273
274 friend class SeqReaderSamModule;
275 SeqReaderSamModule sam_module;
276
277 friend class SeqReaderGfa2Module;
278 SeqReaderGfa2Module gfa2_module;
279
280 int module_in_use = 0;
281
282 void postprocess();
283};
284
285template<typename Module>
286inline void
287SeqReader::read_from_buffer(Module& module,
288 OrderQueueSPMC<RecordCString>::Block& records,
289 size_t& counter)
290{
291 while (!reader_end) {
292 reader_record = &(records.data[records.count]);
293 if (!module.read_buffer(*this, *reader_record) ||
294 reader_record->seq.empty()) {
295 break;
296 }
297 update_cstring_records(records, counter);
298 }
299}
300
301template<typename Module>
302inline void
303SeqReader::read_transition(Module& module,
304 OrderQueueSPMC<RecordCString>::Block& records,
305 size_t& counter)
306{
307 if (!reader_end) {
308 reader_record = &(records.data[records.count]);
309 module.read_transition(*this, *reader_record);
310 if (!reader_record->seq.empty()) {
311 update_cstring_records(records, counter);
312 }
313 } else if (!reader_record->seq.empty()) {
314 update_cstring_records(records, counter);
315 }
316}
317
318template<typename Module>
319inline void
320SeqReader::read_from_file(Module& module,
321 OrderQueueSPMC<RecordCString>::Block& records,
322 size_t& counter)
323{
324 while (!reader_end) {
325 reader_record = &(records.data[records.count]);
326 if (!module.read_file(*this, *reader_record) ||
327 reader_record->seq.empty()) {
328 break;
329 }
330 update_cstring_records(records, counter);
331 }
332}
333
334} // namespace btllib
335
336#endif
Definition: seq_reader.hpp:43
SeqReader(const std::string &source_path, unsigned flags, unsigned threads=3)
OrderQueueMPMC< Record >::Block read_block()
RecordIterator begin()
Definition: seq_reader.hpp:160
Definition: bloom_filter.hpp:16
Definition: seq_reader.hpp:50
static const unsigned TRIM_MASKED
Definition: seq_reader.hpp:55
static const unsigned FOLD_CASE
Definition: seq_reader.hpp:52
static const unsigned SHORT_MODE
Definition: seq_reader.hpp:57
static const unsigned LONG_MODE
Definition: seq_reader.hpp:59
Definition: seq_reader.hpp:107