1 #ifndef BTLLIB_DATA_STREAM_HPP
2 #define BTLLIB_DATA_STREAM_HPP
23 #include <sys/types.h>
// Indices into the two-element descriptor arrays filled in by pipe():
// element 0 is the read end, element 1 is the write end.
29 static const int PIPE_READ_END = 0;
30 static const int PIPE_WRITE_END = 1;
// Upper bound, in bytes, for a path (terminating NUL included) exchanged over
// the spawner pipes; also the size of the buffers that receive those paths.
31 static const int COMM_BUFFER_SIZE = 1024;
// Mode argument handed to mkfifo() when a named pipe is created.
32 static const mode_t PIPE_PERMISSIONS = 0666;
// Integer id that get_pipepath() turns into part of a pipe file name.
34 using PipeId =
unsigned long;
// Accessor for the function-local static flag that process_spawner_init()
// tests and then sets to true, so its setup body runs only once.
38 process_spawner_initialized()
40 static bool _process_spawner_initialized;
41 return _process_spawner_initialized;
// Accessor for the descriptor pair of the request pipe: DataStream writes the
// operation and path to its write end, and process_spawner_init() reads them
// from its read end.
44 process_spawner_parent2child_fd()
46 static int _process_spawner_parent2child_fd[2];
47 return _process_spawner_parent2child_fd;
// Accessor for the descriptor pair of the reply pipe: process_spawner_init()
// writes replies to its write end, and DataStream reads them from its read end.
50 process_spawner_child2parent_fd()
52 static int _process_spawner_child2parent_fd[2];
53 return _process_spawner_child2parent_fd;
// Accessor for the mutex that DataStream locks before it exchanges a request
// and reply over the spawner pipes.
56 process_spawner_comm_mutex()
58 static std::mutex _process_spawner_comm_mutex;
59 return _process_spawner_comm_mutex;
61 inline std::vector<pid_t>&
64 static std::vector<pid_t> _may_fail;
70 static std::mutex _may_fail_mutex;
71 return _may_fail_mutex;
76 static PipeId _last_pipe_id = 0;
77 return _last_pipe_id++;
79 inline std::map<std::string, _Pipeline>&
82 static std::map<std::string, _Pipeline> _pipeline_map;
86 static inline std::string
87 get_pipepath(
const PipeId
id)
89 return "btllib-" + std::to_string(getpid()) +
"-" + std::to_string(
id);
103 DataStream(
const std::string& path, Operation op);
// Dereferencing the object yields the underlying FILE* handle.
107 FILE* operator*()
const {
return file; }
// The arrow operator also hands back the underlying FILE* handle.
108 FILE* operator->()
const {
return file; }
// Implicit conversion to FILE*, so the object can be passed directly to
// functions that take a FILE*.
109 operator FILE*()
const {
return file; }
// NOTE(review): no use of streampath is visible in this excerpt; presumably it
// keeps the path requested by the caller — confirm.
112 std::string streampath;
// Path of the named pipe that the constructor passes to fopen() and unlink().
114 std::string pipepath;
// Stream handle assigned from fopen() in the constructor; null before that.
115 FILE* file =
nullptr;
132 DataSink(
const std::string& path,
bool append =
false)
// Opens a stream for `path`: sends the operation code and the path to the
// process spawner over the parent-to-child pipe, reads the reply from the
// child-to-parent pipe, then fopen()s the named pipe stored in pipepath.
// NOTE(review): the lines that copy the reply from buf into pipepath are not
// visible in this excerpt — confirm.
// NOTE(review): none of the visible write()/read()/fopen() calls has its
// result checked.
137 inline DataStream::DataStream(
const std::string& path, Operation op)
// Serialize use of the spawner pipes through the shared comm mutex.
141 std::unique_lock<std::mutex> lock(process_spawner_comm_mutex());
// Request header: the raw bytes of the operation code.
143 write(process_spawner_parent2child_fd()[PIPE_WRITE_END], &op,
sizeof(op));
// The +1 accounts for the terminating NUL, so the receiver gets a C string.
145 size_t pathlen = path.size() + 1;
146 check_error(pathlen > COMM_BUFFER_SIZE,
147 "Stream path length too large for the buffer.");
148 write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
152 process_spawner_parent2child_fd()[PIPE_WRITE_END], path.c_str(), pathlen);
// Receive buffer for the reply coming back from the spawner.
154 char buf[COMM_BUFFER_SIZE];
155 read(process_spawner_child2parent_fd()[PIPE_READ_END],
158 read(process_spawner_child2parent_fd()[PIPE_READ_END], buf, pathlen);
// READ opens the pipe for reading; every other operation opens it for writing.
161 file = fopen(pipepath.c_str(), op == READ ?
"r" :
"w");
// Remove the pipe's directory entry right after opening it; the already open
// FILE* keeps working.
162 unlink(pipepath.c_str());
169 std::unique_lock<std::mutex> lock(process_spawner_comm_mutex());
175 process_spawner_parent2child_fd()[PIPE_WRITE_END], &op,
sizeof(op));
177 size_t pathlen = pipepath.size() + 1;
178 check_error(pathlen > COMM_BUFFER_SIZE,
179 "Stream path length too large for the buffer.");
180 write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
183 write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
187 read(process_spawner_child2parent_fd()[PIPE_READ_END], &op, 1);
191 }
else if (op == WRITE || op == APPEND) {
193 if (file != stdout) {
197 process_spawner_parent2child_fd()[PIPE_WRITE_END], &op,
sizeof(op));
199 size_t pathlen = pipepath.size() + 1;
200 check_error(pathlen > COMM_BUFFER_SIZE,
201 "Stream path length too large for the buffer.");
202 write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
205 write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
209 read(process_spawner_child2parent_fd()[PIPE_READ_END], &op, 1);
233 : pipepath(std::move(pipepath))
234 , direction(direction)
235 , pid_first(pid_first)
241 std::string pipepath;
242 Direction direction = SOURCE;
243 pid_t pid_first = -1;
252 if (direction == SOURCE) {
254 std::unique_lock<std::mutex> lock(may_fail_mutex());
255 may_fail().push_back(pid_first);
257 kill(pid_first, SIGTERM);
259 waitpid(pid_last, &status, 0);
260 }
else if (direction == SINK) {
262 waitpid(pid_last, &status, 0);
270 process_spawner_init();
// Initialized by calling process_spawner_init(), so that call happens when
// this static is initialized. Presumably the stored bool exists only to
// trigger the call — confirm.
272 static const bool process_spawner_initializer = process_spawner_init();
// SIGCHLD handler: reaps every child that has already terminated. PIDs found
// in may_fail() are removed from that list; on the visible lines the handler
// also prints how a child ended and calls std::exit(EXIT_FAILURE).
// NOTE(review): the branch structure between the may_fail() lookup and the
// reporting is not visible in this excerpt; presumably the report and exit
// apply only to PIDs that were not registered in may_fail() — confirm.
// NOTE(review): locking a mutex, writing to std::cerr and calling std::exit
// are not async-signal-safe; confirm this is acceptable inside a handler.
275 sigchld_handler(
const int sig)
277 assert(sig == SIGCHLD);
// WNOHANG makes waitpid return immediately; the loop continues while it
// returns a positive PID.
282 while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
// may_fail() is shared with the code that registers PIDs, hence the lock.
285 std::unique_lock<std::mutex> lock(may_fail_mutex());
286 auto it = std::find(may_fail().begin(), may_fail().end(), pid);
287 if (it != may_fail().end()) {
288 may_fail().erase(it);
// Report how the child ended: normal exit, signal, or raw status.
293 if (WIFEXITED(status)) {
294 std::cerr <<
"PID " << pid <<
" exited with status "
295 << WEXITSTATUS(status) << std::endl;
296 }
else if (WIFSIGNALED(status)) {
297 std::cerr <<
"PID " << pid <<
" killed by signal "
301 std::cerr <<
"PID " << pid <<
" exited with code " << status
304 std::exit(EXIT_FAILURE);
// ECHILD only means there is no child left to wait for; any other errno from
// waitpid is treated as fatal.
307 if (pid == -1 && errno != ECHILD) {
308 std::perror(
"waitpid");
309 std::exit(EXIT_FAILURE);
313 static inline std::string
314 get_pipeline_cmd(
const std::string& path, DataStream::Operation op);
316 static inline _Pipeline
317 run_pipeline_cmd(
const std::string& cmd, DataStream::Operation op);
// One-time setup of the spawner channel. On first call it marks the spawner
// as initialized, creates the request and reply pipes, installs the SIGCHLD
// handler and then serves READ/WRITE/APPEND/CLOSE requests read from the
// request pipe.
// NOTE(review): the fork() and the loop around the request handling are not
// visible in this excerpt; the close() calls below suggest the code from there
// on runs in the child process — confirm.
320 process_spawner_init()
322 if (!process_spawner_initialized()) {
323 process_spawner_initialized() =
true;
// Start from invalid descriptors, then create both pipes.
325 process_spawner_parent2child_fd()[PIPE_READ_END] = -1;
326 process_spawner_parent2child_fd()[PIPE_WRITE_END] = -1;
327 process_spawner_child2parent_fd()[PIPE_READ_END] = -1;
328 process_spawner_child2parent_fd()[PIPE_WRITE_END] = -1;
329 check_error(pipe(process_spawner_parent2child_fd()) == -1,
330 "Error opening a pipe.");
331 check_error(pipe(process_spawner_child2parent_fd()) == -1,
332 "Error opening a pipe.");
// Drop the pipe ends this side does not use: it only reads requests and only
// writes replies.
336 close(process_spawner_parent2child_fd()[PIPE_WRITE_END]);
337 close(process_spawner_child2parent_fd()[PIPE_READ_END]);
// Install sigchld_handler for SIGCHLD with an empty signal mask; SA_RESTART
// asks for interrupted system calls to be restarted.
339 struct sigaction action;
340 action.sa_handler = sigchld_handler;
341 sigemptyset(&action.sa_mask);
342 action.sa_flags = SA_RESTART;
343 sigaction(SIGCHLD, &action,
nullptr);
// Storage for one incoming request: the operation code and the path.
345 DataStream::Operation op;
346 char buf[COMM_BUFFER_SIZE];
// NOTE(review): the full condition is not visible; presumably a failed or
// end-of-file read of the request header ends this process — confirm.
350 if (read(process_spawner_parent2child_fd()[PIPE_READ_END],
354 std::exit(EXIT_SUCCESS);
357 read(process_spawner_parent2child_fd()[PIPE_READ_END],
360 read(process_spawner_parent2child_fd()[PIPE_READ_END], buf, pathlen);
// Open requests: build the command line for the path in buf and start it,
// reply with the path of the resulting named pipe, and remember the pipeline
// under that path.
363 case DataStream::Operation::READ:
364 case DataStream::Operation::WRITE:
365 case DataStream::Operation::APPEND:
366 pipeline = run_pipeline_cmd(get_pipeline_cmd(buf, op), op);
368 pathlen = pipeline.pipepath.size() + 1;
369 check_error(pathlen > COMM_BUFFER_SIZE,
370 "Stream path length too large for the buffer.");
371 write(process_spawner_child2parent_fd()[PIPE_WRITE_END],
374 write(process_spawner_child2parent_fd()[PIPE_WRITE_END],
375 pipeline.pipepath.c_str(),
378 pipeline_map()[pipeline.pipepath] = pipeline;
// Close request: buf holds the pipe path; fetch its pipeline, remove the map
// entry and acknowledge.
// NOTE(review): operator[] default-inserts an entry when the path is unknown;
// verify that an unknown path cannot reach this point.
380 case DataStream::Operation::CLOSE:
381 pipeline = pipeline_map()[std::string(buf)];
383 pipeline_map().erase(std::string(buf));
// The acknowledgement is a single byte (length argument 1) taken from op.
384 write(process_spawner_child2parent_fd()[PIPE_WRITE_END], &op, 1);
387 log_error(
"Invalid stream operation.");
388 std::exit(EXIT_FAILURE);
// Release the pipe ends that were used for serving requests.
392 close(process_spawner_parent2child_fd()[PIPE_READ_END]);
393 close(process_spawner_child2parent_fd()[PIPE_WRITE_END]);
398 static inline std::string
399 get_pipeline_cmd(
const std::string& path, DataStream::Operation op)
// One recognised kind of data location together with the external tools that
// can stream it.
//
// A path belongs to a Datatype when it starts with one of `prefixes` or ends
// with one of `suffixes`. `read_cmds`, `write_cmds` and `append_cmds` are all
// indexed with the same command index (presumably the position of the
// existence check that succeeded — confirm against the lookup code).
struct Datatype
{
  // Leading strings (e.g. URL schemes) that identify this type.
  std::vector<std::string> prefixes;
  // Trailing strings (file extensions) that identify this type.
  std::vector<std::string> suffixes;
  // Commands that probe whether the needed tool is installed.
  std::vector<std::string> cmds_check_existence;
  // Commands that decode the data.
  std::vector<std::string> read_cmds;
  // Commands that encode data; a trailing '>' marks redirection into the
  // target file. "" presumably means writing is unsupported — confirm.
  std::vector<std::string> write_cmds;
  // Same as write_cmds, but redirecting with '>>' to append.
  std::vector<std::string> append_cmds;
};

// Table of the supported data types, in the order in which they are tried.
//
// Fixes relative to the previous table:
//  - .zip: the probe now looks for `unzip`, the tool the read command
//    actually runs (it used to look for `zip`).
//  - .bam/.cram: the write/append commands now include the `view`
//    subcommand; `samtools -Sb -` on its own is not a valid invocation.
static const Datatype DATATYPES[]{
  { { "http://", "https://", "ftp://" },
    {},
    { "which wget" },
    { "wget -O-" },
    { "" },
    { "" } },
  { {}, { ".url" }, { "which wget" }, { "wget -O- -i" }, { "" }, { "" } },
  { {}, { ".ar" }, { "which ar" }, { "ar -p" }, { "" }, { "" } },
  { {}, { ".tar" }, { "which tar" }, { "tar -xOf" }, { "" }, { "" } },
  { {}, { ".tgz" }, { "which tar" }, { "tar -zxOf" }, { "" }, { "" } },
  { {},
    { ".gz", ".z" },
    { "which pigz", "which gzip" },
    { "pigz -dc", "gzip -dc" },
    { "pigz >", "gzip >" },
    { "pigz >>", "gzip >>" } },
  { {},
    { ".bz2" },
    { "which bzip2" },
    { "bunzip2 -dc" },
    { "bzip2 >" },
    { "bzip2 >>" } },
  { {},
    { ".xz" },
    { "which xz" },
    { "unxz -dc" },
    { "xz -T0 >" },
    { "xz -T0 >>" } },
  { {},
    { ".7z" },
    { "which 7z" },
    { "7z -so e" },
    { "7z -si a" },
    { "7z -si a" } },
  { {}, { ".zip" }, { "which unzip" }, { "unzip -p" }, { "" }, { "" } },
  { {},
    { ".bam", ".cram" },
    { "which samtools" },
    { "samtools view -h" },
    { "samtools view -Sb - >" },
    { "samtools view -Sb - >>" } },
};
426 std::string default_cmd =
"cat";
427 if (op == DataStream::Operation::WRITE) {
429 }
else if (op == DataStream::Operation::APPEND) {
430 default_cmd +=
" >>";
433 std::string path_trimmed = path;
434 std::vector<std::string> cmd_layers;
436 bool found_datatype =
false;
437 for (
const auto& datatype : DATATYPES) {
438 size_t trim_start = 0, trim_end = 0;
439 bool this_datatype =
false;
440 for (
const auto& prefix : datatype.prefixes) {
441 if (starts_with(path_trimmed, prefix)) {
442 this_datatype =
true;
443 trim_start += prefix.size();
447 for (
const auto& suffix : datatype.suffixes) {
448 if (ends_with(path_trimmed, suffix)) {
449 this_datatype =
true;
450 trim_end += suffix.size();
456 found_datatype =
true;
457 bool found_cmd =
false;
459 for (
const auto& existence_cmd : datatype.cmds_check_existence) {
461 auto sub_cmds = split(existence_cmd,
"&&");
462 std::for_each(sub_cmds.begin(), sub_cmds.end(), trim);
463 for (
const auto& sub_cmd : sub_cmds) {
464 auto args = split(sub_cmd,
" ");
465 std::for_each(args.begin(), args.end(), trim);
467 char*
const* argv =
new char*[args.size() + 2];
468 ((
char*&)(argv[0])) = (
char*)(args[0].c_str());
469 for (
size_t i = 0; i < args.size(); i++) {
470 ((
char*&)(argv[i + 1])) = (
char*)(args[i].c_str());
472 ((
char*&)(argv[args.size() + 1])) =
nullptr;
476 std::unique_lock<std::mutex> lock(may_fail_mutex());
478 may_fail().push_back(pid);
481 int null_fd = open(
"/dev/null", O_WRONLY, 0);
482 dup2(null_fd, STDOUT_FILENO);
483 dup2(null_fd, STDERR_FILENO);
486 execvp(argv[0], argv + 1);
487 log_error(
"exec failed.");
488 std::exit(EXIT_FAILURE);
491 check_error(pid == -1,
"Error on fork.");
493 waitpid(pid, &status, 0);
494 if (WIFSIGNALED(status) ||
495 (WIFEXITED(status) && WEXITSTATUS(status) != 0)) {
500 std::unique_lock<std::mutex> lock(may_fail_mutex());
501 auto it = std::find(may_fail().begin(), may_fail().end(), pid);
502 if (it != may_fail().end()) {
503 may_fail().erase(it);
518 case DataStream::Operation::READ:
519 cmd = datatype.read_cmds[cmd_idx];
521 case DataStream::Operation::WRITE:
522 cmd = datatype.write_cmds[cmd_idx];
524 case DataStream::Operation::APPEND:
525 cmd = datatype.append_cmds[cmd_idx];
528 log_error(
"Invalid operation");
529 std::exit(EXIT_FAILURE);
532 log_warning(
"Filetype recognized for '" + path +
533 "', but no tool available to work with it.");
535 cmd_layers.push_back(cmd);
538 log_warning(
"Filetype recognized for '" + path +
539 "', but no tool available to work with it.");
541 path_trimmed.erase(0, trim_start);
542 path_trimmed.erase(path_trimmed.size() - trim_end);
545 if (!found_datatype) {
549 if (cmd_layers.empty()) {
550 cmd_layers.push_back(default_cmd);
552 if (op == DataStream::Operation::WRITE ||
553 op == DataStream::Operation::APPEND) {
554 std::reverse(cmd_layers.begin(), cmd_layers.end());
557 std::string result_cmd;
558 for (
size_t i = 0; i < cmd_layers.size(); i++) {
559 auto& cmd = cmd_layers[i];
560 if (op == DataStream::Operation::WRITE ||
561 op == DataStream::Operation::APPEND) {
562 if (i == cmd_layers.size() - 1) {
563 if (cmd.back() ==
'>') {
570 if (cmd.back() ==
'>') {
571 while (cmd.back() ==
'>' || cmd.back() ==
' ') {
592 check_error(result_cmd.empty(),
593 (op == DataStream::Operation::READ ?
"Error loading from "
594 :
"Error saving to ") +
599 static inline _Pipeline
600 run_pipeline_cmd(
const std::string& cmd, DataStream::Operation op)
602 std::string pipepath = get_pipepath(new_pipe_id());
603 unlink(pipepath.c_str());
604 mkfifo(pipepath.c_str(), PIPE_PERMISSIONS);
606 auto individual_cmds = split(cmd,
" | ");
607 check_error(individual_cmds.empty(),
608 "Error processing data stream commands.");
609 std::reverse(individual_cmds.begin(), individual_cmds.end());
611 std::vector<pid_t> pids;
613 int input_fd[2], output_fd[2];
614 input_fd[PIPE_READ_END] = -1;
615 input_fd[PIPE_WRITE_END] = -1;
616 output_fd[PIPE_READ_END] = -1;
617 output_fd[PIPE_WRITE_END] = -1;
620 for (
const auto& individual_cmd : individual_cmds) {
621 auto args = split(individual_cmd,
" ");
622 std::for_each(args.begin(), args.end(), trim);
624 std::string stdout_to_file;
625 decltype(args)::iterator it;
626 for (it = args.begin(); it != args.end(); ++it) {
627 if (it->front() ==
'>') {
628 stdout_to_file = it->substr(1);
632 if (it != args.end()) {
636 char*
const* argv =
new char*[args.size() + 2];
637 ((
char*&)(argv[0])) = (
char*)(args[0].c_str());
638 for (
size_t i = 0; i < args.size(); i++) {
639 ((
char*&)(argv[i + 1])) = (
char*)(args[i].c_str());
641 ((
char*&)(argv[args.size() + 1])) =
nullptr;
643 if (i < individual_cmds.size() - 1) {
644 check_error(pipe(input_fd) == -1,
"Error opening a pipe.");
645 fcntl(input_fd[PIPE_READ_END], F_SETFD, FD_CLOEXEC);
646 fcntl(input_fd[PIPE_WRITE_END], F_SETFD, FD_CLOEXEC);
651 if (op == DataStream::Operation::READ) {
653 int fd = open(pipepath.c_str(), O_WRONLY);
654 dup2(fd, STDOUT_FILENO);
657 dup2(output_fd[PIPE_WRITE_END], STDOUT_FILENO);
658 close(output_fd[PIPE_READ_END]);
659 close(output_fd[PIPE_WRITE_END]);
662 if (i < individual_cmds.size() - 1) {
663 dup2(input_fd[PIPE_READ_END], STDIN_FILENO);
664 close(input_fd[PIPE_READ_END]);
665 close(input_fd[PIPE_WRITE_END]);
668 execvp(argv[0], argv + 1);
669 log_error(
"exec failed.");
670 std::exit(EXIT_FAILURE);
672 if (i == individual_cmds.size() - 1) {
673 int fd = open(pipepath.c_str(), O_RDONLY);
674 dup2(fd, STDIN_FILENO);
677 dup2(input_fd[PIPE_READ_END], STDIN_FILENO);
678 close(input_fd[PIPE_READ_END]);
679 close(input_fd[PIPE_WRITE_END]);
682 if (!stdout_to_file.empty()) {
684 open(stdout_to_file.c_str(),
686 (op == DataStream::Operation::APPEND ? O_APPEND : 0),
687 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
688 dup2(outfd, STDOUT_FILENO);
691 dup2(output_fd[PIPE_WRITE_END], STDOUT_FILENO);
692 close(output_fd[PIPE_READ_END]);
693 close(output_fd[PIPE_WRITE_END]);
696 execvp(argv[0], argv + 1);
697 log_error(
"exec failed.");
701 check_error(pid == -1,
"Error on fork.");
708 close(output_fd[PIPE_READ_END]);
709 close(output_fd[PIPE_WRITE_END]);
712 if (i < individual_cmds.size() - 1) {
713 output_fd[PIPE_READ_END] = input_fd[PIPE_READ_END];
714 output_fd[PIPE_WRITE_END] = input_fd[PIPE_WRITE_END];
720 return _Pipeline(pipepath,
721 op == DataStream::Operation::READ
722 ? _Pipeline::Direction::SOURCE
723 : _Pipeline::Direction::SINK,