• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #45

20 Sep 2024 06:49PM UTC coverage: 26.244% (-49.2%) from 75.443%
#45

push

travis-ci

web-flow
attempt to fix coveralls run

2458 of 9366 relevant lines covered (26.24%)

4362.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/userconfig.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "userconfig.hpp"
21
#include "alignment.hpp" // for alignment_info
22
#include "commontypes.hpp"
23
#include "debug.hpp"    // for AR_REQUIRE, AR_FAIL
24
#include "errors.hpp"   // for fastq_error
25
#include "fastq.hpp"    // for ACGT, ACGT::indices, ACGT::values
26
#include "licenses.hpp" // for LICENSES
27
#include "logging.hpp"  // for log_stream, error, set_level, set_colors, info
28
#include "main.hpp"     // for HELPTEXT, NAME, VERSION
29
#include "output.hpp"   // for DEV_NULL, output_files, output_file
30
#include "progress.hpp" // for progress_type, progress_type::simple, progr...
31
#include "simd.hpp"     // for size_t, name, supported, instruction_set
32
#include "strutils.hpp" // for string_vec, shell_escape, str_to_unsigned
33
#include <algorithm>    // for find, max, min
34
#include <cerrno>       // for errno
35
#include <cmath>        // for pow
36
#include <cstdlib>      // for getenv
37
#include <cstring>      // for size_t, strerror, strcmp
38
#include <limits>       // for numeric_limits
39
#include <stdexcept>    // for invalid_argument
40
#include <string>       // for string, basic_string, operator==, operator+
41
#include <tuple>        // for get, tuple
42
#include <unistd.h>     // for access, isatty, R_OK, STDERR_FILENO
43

44
namespace adapterremoval {
45

46
namespace {
47

48
/**
 * Preamble passed to the argument parser via `set_preamble`: a short
 * description of the program, where to report issues, how to cite the
 * software, and how the '-' pseudo-filename and repeated filenames are
 * interpreted.
 */
const char* HELPTEXT =
  "This program searches for and removes remnant adapter sequences, poly-X "
  "tails and low-quality bases from FASTQ reads. For a detailed explanation "
  "of the parameters, please refer to the man page. For comments, "
  "suggestions and feedback please use\n"
  "\n"
  "  https://github.com/MikkelSchubert/adapterremoval/issues/new\n"
  "\n"
  "If you use the program, please cite the paper\n"
  "\n"
  "  Schubert, Lindgreen, and Orlando (2016). AdapterRemoval v2: rapid\n"
  "  adapter trimming, identification, and read merging. BMC Research\n"
  "  Notes, 12;9(1):88. https://doi.org/10.1186/s13104-016-1900-2\n"
  "\n"
  "Use the filename '-' to read from STDIN or to write to STDOUT. If the same "
  "filenames are used for --in-file1 and --in-file2 then those files are read "
  "in interleaved mode. If the same filename is used for two or more of the "
  "--out options (excluding --out-json and --out-html), then output is "
  "written to that file in interleaved mode.\n";
67

68
////////////////////////////////////////////////////////////////////////////////
69
// Helper functions
70

71
std::pair<unsigned, unsigned>
72
parse_trim_argument(const string_vec& values)
×
73
{
74
  unsigned mate_1 = 0;
×
75
  unsigned mate_2 = 0;
×
76

77
  switch (values.size()) {
×
78
    case 1:
×
79
      mate_1 = str_to_unsigned(values.front());
×
80
      mate_2 = mate_1;
×
81
      break;
×
82

83
    case 2:
×
84
      mate_1 = str_to_unsigned(values.front());
×
85
      mate_2 = str_to_unsigned(values.back());
×
86
      break;
×
87

88
    default:
×
89
      throw std::invalid_argument("please specify exactly one or two values");
×
90
  }
91

92
  return { mate_1, mate_2 };
×
93
}
94

95
bool
96
parse_poly_x_option(const std::string& key,
×
97
                    const string_vec& values,
98
                    std::string& out)
99
{
100
  out.clear();
×
101
  if (values.empty()) {
×
102
    out = "ACGT";
×
103
    return true;
104
  }
105

106
  std::array<bool, ACGT::indices> enabled = {};
×
107
  for (const auto& value : values) {
×
108
    for (const auto nuc : to_upper(value)) {
×
109
      switch (nuc) {
×
110
        case 'A':
×
111
        case 'C':
×
112
        case 'G':
×
113
        case 'T':
×
114
          enabled.at(ACGT::to_index(nuc)) = true;
×
115
          break;
×
116

117
        default:
×
118
          log::error() << "Option " << key << " called with invalid value "
×
119
                       << shell_escape(value) << ". Only A, C, G, and T are "
×
120
                       << "permitted!";
×
121

122
          return false;
×
123
      }
124
    }
125
  }
126

127
  for (const auto nuc : ACGT::values) {
×
128
    if (enabled.at(ACGT::to_index(nuc))) {
×
129
      out.push_back(nuc);
×
130
    }
131
  }
132

133
  return true;
134
}
135

136
bool
137
parse_head(const std::string& sink, uint64_t& out)
×
138
{
139
  if (sink.empty()) {
×
140
    // Default to all reads
141
    out = std::numeric_limits<uint64_t>::max();
×
142
    return true;
×
143
  }
144

145
  uint64_t unit = 1;
×
146
  std::string sink_without_unit = sink;
×
147
  if (sink.back() < '0' || sink.back() > '9') {
×
148
    switch (sink.back()) {
×
149
      case 'k':
150
      case 'K':
151
        unit = 1000;
152
        break;
153

154
      case 'm':
×
155
      case 'M':
×
156
        unit = 1000'000;
×
157
        break;
×
158

159
      case 'g':
×
160
      case 'G':
×
161
        unit = 1000'000'000;
×
162
        break;
×
163

164
      default:
×
165
        log::error() << "Invalid unit in command-line option --sink "
×
166
                     << shell_escape(sink);
×
167
        return false;
×
168
    }
169

170
    sink_without_unit.pop_back();
×
171
  }
172

173
  try {
×
174
    // This should not be able to overflow as log2(2^32 * 1e9) ~= 62,
175
    // but will need to be changed if we want to allow large raw numbers
176
    out = static_cast<uint64_t>(str_to_unsigned(sink_without_unit)) * unit;
×
177
  } catch (const std::invalid_argument&) {
×
178
    log::error() << "Invalid value in command-line option --sink "
×
179
                 << shell_escape(sink);
×
180
    return false;
×
181
  }
×
182

183
  return true;
×
184
}
185

186
bool
187
check_no_clobber(const std::string& label,
×
188
                 const string_vec& in_files,
189
                 const output_file& out_file)
190
{
191
  for (const auto& in_file : in_files) {
×
192
    if (in_file == out_file.name && in_file != DEV_NULL) {
×
193
      log::error() << "Input file would be overwritten: " << label << " "
×
194
                   << in_file;
×
195
      return false;
×
196
    }
197
  }
198

199
  return true;
×
200
}
201

202
/** Rewrites the pseudo-filename "-" to "/dev/stdin"; other names are kept */
void
normalize_input_file(std::string& filename)
{
  if (filename != "-") {
    return;
  }

  filename.assign("/dev/stdin");
}
210

211
/** Rewrites the pseudo-filename "-" to "/dev/stdout"; other names are kept */
void
normalize_output_file(std::string& filename)
{
  if (filename != "-") {
    return;
  }

  filename.assign("/dev/stdout");
}
219

220
bool
221
check_input_and_output(const std::string& label,
×
222
                       const string_vec& filenames,
223
                       const output_files& output_files)
224
{
225
  for (const auto& filename : filenames) {
×
226
    if (access(filename.c_str(), R_OK)) {
×
227
      log::error() << "Cannot read file: " << label << " " << filename
×
228
                   << "': " << std::strerror(errno);
×
229

230
      return false;
×
231
    }
232
  }
233

234
  if (!check_no_clobber(label, filenames, output_files.unidentified_1)) {
×
235
    return false;
236
  }
237

238
  if (!check_no_clobber(label, filenames, output_files.unidentified_2)) {
×
239
    return false;
240
  }
241

242
  for (const auto& sample : output_files.samples()) {
×
243
    for (size_t i = 0; i < sample.size(); ++i) {
×
244
      if (!check_no_clobber(label, filenames, sample.file(i))) {
×
245
        return false;
×
246
      }
247
    }
248
  }
249

250
  return true;
×
251
}
252

253
/**
254
 * Tries to parse a simple command-line argument while ignoring the validity
255
 * of the overall command-line. This is only intended to make pre-configured
256
 * logging output consistent with post-configured output if possible.
257
 */
258
std::string
259
try_parse_argument(const string_vec& args,
×
260
                   const std::string& key,
261
                   const std::string& fallback)
262
{
263
  auto it = std::find(args.begin(), args.end(), key);
×
264
  if (it != args.end() && (it + 1) != args.end()) {
×
265
    return *(it + 1);
×
266
  }
267

268
  return fallback;
×
269
}
270

271
/** Returns vector of keys for output files that have been set by the user. */
272
string_vec
273
user_supplied_keys(const argparse::parser& argparser, const string_vec& keys)
×
274
{
275
  string_vec result;
×
276
  for (const auto& key : keys) {
×
277
    if (argparser.is_set(key)) {
×
278
      result.push_back(key);
×
279
    }
280
  }
281

282
  return result;
×
283
}
×
284

285
////////////////////////////////////////////////////////////////////////////////
286

287
/**
 * Determines if colors and other fancy terminal output may be used.
 *
 * This requires that STDERR is a terminal, that the NO_COLOR environment
 * variable is unset or empty (see https://no-color.org/), and that the TERM
 * environment variable is not set to "dumb".
 */
bool
fancy_output_allowed()
{
  if (!::isatty(STDERR_FILENO)) {
    return false;
  }

  // NO_COLOR is checked as suggested by https://no-color.org/
  const char* no_color = std::getenv("NO_COLOR");
  if (no_color && no_color[0] != '\0') {
    return false;
  }

  const char* term = std::getenv("TERM");
  if (term && std::strcmp(term, "dumb") == 0) {
    return false;
  }

  return true;
}
301

302
void
303
configure_log_levels(const std::string& value, bool fallible = false)
×
304
{
305
  const auto log_level = to_lower(value);
×
306

307
  if (log_level == "debug") {
×
308
    log::set_level(log::level::debug);
×
309
  } else if (log_level == "info") {
×
310
    log::set_level(log::level::info);
×
311
  } else if (log_level == "warning") {
×
312
    log::set_level(log::level::warning);
×
313
  } else if (log_level == "error") {
×
314
    log::set_level(log::level::error);
×
315
  } else {
316
    AR_REQUIRE(fallible, "unhandled log_level value");
×
317
  }
318
}
319

320
void
321
configure_log_colors(const std::string& colors, bool fallible = false)
×
322
{
323
  if (colors == "always") {
×
324
    log::set_colors(true);
×
325
  } else if (colors == "never") {
×
326
    log::set_colors(false);
×
327
  } else if (colors == "auto") {
×
328
    log::set_colors(fancy_output_allowed());
×
329
  } else {
330
    AR_REQUIRE(fallible, "unhandled log_colors value");
×
331
  }
332
}
333

334
/**
 * Maps the name of a progress reporting mode to a `progress_type`:
 *
 *   "never" -> none
 *   "spin"  -> spinner
 *   "log"   -> simple
 *   "auto"  -> spinner if `fancy_output_allowed()`, otherwise simple
 *
 * Any other value results in a call to AR_FAIL.
 */
progress_type
configure_log_progress(const std::string& progress)
{
  if (progress == "never") {
    return progress_type::none;
  }

  if (progress == "spin") {
    return progress_type::spinner;
  }

  if (progress == "log") {
    return progress_type::simple;
  }

  if (progress == "auto") {
    return fancy_output_allowed() ? progress_type::spinner
                                  : progress_type::simple;
  }

  AR_FAIL("unhandled log_progress value");
}
353

354
/**
 * Builds a `fastq_encoding` from the name of a quality score format.
 *
 * @param value One of "33", "64", "solexa", or "sam"; any other value results
 *              in a call to AR_FAIL.
 * @param degenerate Passed through to the `fastq_encoding` unchanged.
 * @param uracils Passed through to the `fastq_encoding` unchanged.
 */
fastq_encoding
configure_encoding(const std::string& value,
                   degenerate_encoding degenerate,
                   uracil_encoding uracils)
{
  // Only the quality encoding differs between the supported formats
  const auto with_quality = [&](quality_encoding quality) {
    return fastq_encoding{ quality, degenerate, uracils };
  };

  if (value == "33") {
    return with_quality(quality_encoding::phred_33);
  }

  if (value == "64") {
    return with_quality(quality_encoding::phred_64);
  }

  if (value == "solexa") {
    return with_quality(quality_encoding::solexa);
  }

  if (value == "sam") {
    return with_quality(quality_encoding::sam);
  }

  AR_FAIL("unhandled qualitybase value");
}
371

372
bool
373
parse_output_formats(const argparse::parser& argparser,
×
374
                     output_format& file_format,
375
                     output_format& stdout_format)
376
{
377
  if (argparser.is_set("--gzip")) {
×
378
    file_format = stdout_format = output_format::fastq_gzip;
×
379
    return true;
×
380
  }
381

382
  auto format_s = argparser.value("--out-format");
×
383
  if (!output_files::parse_format(format_s, file_format)) {
×
384
    log::error() << "Invalid output format " + log_escape(format_s);
×
385
    return false;
×
386
  }
387

388
  // Default to writing uncompressed output to STDOUT
389
  if (!argparser.is_set("--stdout-format")) {
×
390
    switch (file_format) {
×
391
      case output_format::fastq:
×
392
      case output_format::fastq_gzip:
×
393
        stdout_format = output_format::fastq;
×
394
        return true;
×
395
      case output_format::sam:
×
396
      case output_format::sam_gzip:
×
397
        stdout_format = output_format::sam;
×
398
        return true;
×
399
      case output_format::bam:
×
400
      case output_format::ubam:
×
401
        stdout_format = output_format::ubam;
×
402
        return true;
×
403
      default:
×
404
        AR_FAIL("invalid output format");
×
405
    }
406
  }
407

408
  format_s = argparser.value("--stdout-format");
×
409
  if (!output_files::parse_format(format_s, stdout_format)) {
×
410
    log::error() << "Invalid output format " + log_escape(format_s);
×
411
    return false;
×
412
  }
413

414
  return true;
415
}
416

417
} // namespace
418

419
////////////////////////////////////////////////////////////////////////////////
420
// Implementations for `userconfig`
421

422
// Definition of the static member; initialized once, during static
// initialization, with the result of `timestamp` for the format "%FT%T%z"
// (presumably strftime-style, i.e. ISO 8601 date, time, and UTC offset --
// TODO confirm against the declaration of `timestamp`)
std::string userconfig::start_time = timestamp("%FT%T%z");
423

424
userconfig::userconfig()
×
425
{
426
  argparser.set_name(NAME);
×
427
  argparser.set_version(VERSION);
×
428
  argparser.set_preamble(HELPTEXT);
×
429
  argparser.set_licenses(LICENSES);
×
430
  argparser.set_terminal_width(log::get_terminal_width());
×
431

432
  //////////////////////////////////////////////////////////////////////////////
433
  argparser.add("--threads", "N")
×
434
    .help("Maximum number of threads")
×
435
    .bind_uint(&max_threads)
×
436
    .with_default(2);
×
437

438
  {
×
439
    std::vector<std::string> choices;
×
440
    for (const auto is : simd::supported()) {
×
441
      choices.emplace_back(simd::name(is));
×
442
    }
443

444
    AR_REQUIRE(!choices.empty());
×
445
    argparser.add("--simd", "NAME")
×
446
      .help("SIMD instruction set to use; defaults to the most advanced "
×
447
            "instruction set supported by this computer")
448
      .bind_str(nullptr)
×
449
      .with_choices(choices)
×
450
      .with_default(choices.back());
×
451
  }
452

453
  argparser.add("--benchmark")
×
454
    .help("Carry out benchmarking of AdapterRemoval sub-systems")
×
455
    .conflicts_with("--demultiplex-only")
×
456
    .conflicts_with("--report-only")
×
457
    .conflicts_with("--interleaved")
×
458
    .conflicts_with("--interleaved-input")
×
459
#if !defined(DEBUG)
460
    .hidden()
×
461
#endif
462
    .bind_vec(&benchmarks)
×
463
    .with_min_values(0);
×
464

465
  //////////////////////////////////////////////////////////////////////////////
466
  argparser.add_header("INPUT FILES:");
×
467

468
  argparser.add("--in-file1", "FILE")
×
469
    .help("One or more input files containing mate 1 reads [REQUIRED]")
×
470
    .deprecated_alias("--file1")
×
471
    .bind_vec(&input_files_1)
×
472
    .with_preprocessor(normalize_input_file);
×
473
  argparser.add("--in-file2", "FILE")
×
474
    .help("Input files containing mate 2 reads; if used, then the same number "
×
475
          "of files as --in-file1 must be listed [OPTIONAL]")
476
    .deprecated_alias("--file2")
×
477
    .bind_vec(&input_files_2)
×
478
    .with_preprocessor(normalize_input_file);
×
479
  argparser.add("--head", "N")
×
480
    .help("Process only the first N reads in single-end mode or the first N "
×
481
          "read-pairs in paired-end mode. Accepts suffixes K (thousands), M "
482
          "(millions), and G (billions) [default: all reads]")
483
    .bind_str(nullptr);
×
484

485
  //////////////////////////////////////////////////////////////////////////////
486
  argparser.add_header("OUTPUT FILES:");
×
487

488
  argparser.add("--out-prefix", "PREFIX")
×
489
    .help("Prefix for output files for which the corresponding --out option "
×
490
          "was not set [default: not set]")
491
    .deprecated_alias("--basename")
×
492
    .bind_str(&out_prefix)
×
493
    .with_default("/dev/null");
×
494

495
  argparser.add_separator();
×
496
  argparser.add("--out-file1", "FILE")
×
497
    .help("Output file containing trimmed mate 1 reads. Setting this value in "
×
498
          "in demultiplexing mode overrides --out-prefix for this file")
499
    .deprecated_alias("--output1")
×
500
    .bind_str(nullptr)
×
501
    .with_default("{prefix}[.sample].r1.fastq")
×
502
    .with_preprocessor(normalize_output_file);
×
503
  argparser.add("--out-file2", "FILE")
×
504
    .help("Output file containing trimmed mate 2 reads. Setting this value in "
×
505
          "in demultiplexing mode overrides --out-prefix for this file")
506
    .deprecated_alias("--output2")
×
507
    .bind_str(nullptr)
×
508
    .with_default("{prefix}[.sample].r2.fastq")
×
509
    .with_preprocessor(normalize_output_file);
×
510
  argparser.add("--out-merged", "FILE")
×
511
    .help("Output file that, if --merge is set, contains overlapping "
×
512
          "read-pairs that have been merged into a single read (PE mode only). "
513
          "Setting this value in demultiplexing mode overrides --out-prefix "
514
          "for this file")
515
    .deprecated_alias("--outputcollapsed")
×
516
    .bind_str(nullptr)
×
517
    .with_default("{prefix}[.sample].merged.fastq")
×
518
    .with_preprocessor(normalize_output_file);
×
519
  argparser.add("--out-singleton", "FILE")
×
520
    .help("Output file containing paired reads for which the mate "
×
521
          "has been discarded. This file is only created if filtering is "
522
          "enabled. Setting this value in demultiplexing mode overrides "
523
          "--out-prefix for this file")
524
    .deprecated_alias("--singleton")
×
525
    .bind_str(nullptr)
×
526
    .with_default("{prefix}[.sample].singleton.fastq")
×
527
    .with_preprocessor(normalize_output_file);
×
528

529
  argparser.add_separator();
×
530
  argparser.add("--out-unidentified1", "FILE")
×
531
    .help("In demultiplexing mode, contains mate 1 reads that could not be "
×
532
          "assigned to a single sample")
533
    .bind_str(nullptr)
×
534
    .with_default("{prefix}.unidentified.r1.fastq")
×
535
    .with_preprocessor(normalize_output_file);
×
536
  argparser.add("--out-unidentified2", "FILE")
×
537
    .help("In demultiplexing mode, contains mate 2 reads that could not be "
×
538
          "assigned to a single sample")
539
    .bind_str(nullptr)
×
540
    .with_default("{prefix}.unidentified.r2.fastq")
×
541
    .with_preprocessor(normalize_output_file);
×
542
  argparser.add("--out-discarded", "FILE")
×
543
    .help("Output file containing filtered reads. Setting this value in "
×
544
          "demultiplexing mode overrides --out-prefix for this file [default: "
545
          "not saved]")
546
    .deprecated_alias("--discarded")
×
547
    .bind_str(nullptr)
×
548
    .with_preprocessor(normalize_output_file);
×
549

550
  argparser.add_separator();
×
551
  argparser.add("--out-json", "FILE")
×
552
    .help("Output file containing statistics about input files, trimming, "
×
553
          "merging, and more in JSON format")
554
    .bind_str(nullptr)
×
555
    .with_default("{prefix}.json")
×
556
    .with_preprocessor(normalize_output_file);
×
557
  argparser.add("--out-html", "FILE")
×
558
    .help("Output file containing statistics about input files, trimming, "
×
559
          "merging, and more in HTML format")
560
    .bind_str(nullptr)
×
561
    .with_default("{prefix}.html")
×
562
    .with_preprocessor(normalize_output_file);
×
563

564
  //////////////////////////////////////////////////////////////////////////////
565
  argparser.add_header("FASTQ OPTIONS:");
×
566

567
  argparser.add("--quality-format", "N")
×
568
    .help("Format used to encode Phred scores in input")
×
569
    .deprecated_alias("--qualitybase")
×
570
    .bind_str(&quality_input_base)
×
571
    .with_choices({ "33", "64", "solexa", "sam" })
×
572
    .with_default("33");
×
573
  argparser.add("--mate-separator", "CHAR")
×
574
    .help("Character separating the mate number (1 or 2) from the read name in "
×
575
          "FASTQ records. Will be determined automatically if not specified")
576
    .bind_str(&mate_separator_str);
×
577

578
  argparser.add("--interleaved-input")
×
579
    .help("The (single) input file provided contains both the mate 1 and mate "
×
580
          "2 reads, one pair after the other, with one mate 1 reads followed "
581
          "by one mate 2 read. This option is implied by the --interleaved "
582
          "option")
583
    .conflicts_with("--in-file2")
×
584
    .bind_bool(&interleaved_input);
×
585
  argparser.add("--interleaved-output")
×
586
    .help("If set, trimmed paired-end reads are written to a single file "
×
587
          "containing mate 1 and mate 2 reads, one pair after the other. This "
588
          "option is implied by the --interleaved option")
589
    .conflicts_with("--out-file2")
×
590
    .bind_bool(&interleaved_output);
×
591
  argparser.add("--interleaved")
×
592
    .help("This option enables both the --interleaved-input option and the "
×
593
          "--interleaved-output option")
594
    .conflicts_with("--in-file2")
×
595
    .conflicts_with("--out-file2")
×
596
    .bind_bool(&interleaved);
×
597

598
  argparser.add("--mask-degenerate-bases")
×
599
    .help("Mask degenerate/ambiguous bases (B/D/H/K/M/N/R/S/V/W/Y) in the "
×
600
          "input by replacing them with an 'N'; if this option is not used, "
601
          "AdapterRemoval will abort upon encountering degenerate bases.");
602
  argparser.add("--convert-uracils")
×
603
    .help("Convert uracils (U) to thymine (T) in input reads; if this option "
×
604
          "is not used, AdapterRemoval will abort upon encountering uracils.");
605

606
  //////////////////////////////////////////////////////////////////////////////
607
  argparser.add_header("OUTPUT FORMAT:");
×
608

609
  argparser.add("--gzip")
×
610
    .hidden()
×
611
    .deprecated()
×
612
    .conflicts_with("--out-format")
×
613
    .conflicts_with("--stdout-format");
×
614
  argparser.add("--out-format", "X")
×
615
    .help("Selects the default output format; either 'fastq' for uncompressed "
×
616
          "FASTQ reads, 'fastq.gz' for gzip compressed FASTQ reads, 'sam' for "
617
          "uncompressed SAM records, 'sam.gz' for gzip compressed SAM records, "
618
          "'bam' for BGZF compressed BAM records, and 'ubam' for uncompressed "
619
          "BAM records. Setting an `--out-*` option overrides this option "
620
          "based on the filename used (except .ubam)")
621
    .bind_str(nullptr)
×
622
    .with_choices({ "fastq", "fastq.gz", "sam", "sam.gz", "bam", "ubam" })
×
623
    .with_default("fastq.gz");
×
624
  argparser.add("--stdout-format", "X")
×
625
    .help("Selects the output format for data written to STDOUT; choices are "
×
626
          "the same as for --out-format [default: the same format as "
627
          "--out-format, but uncompressed]")
628
    .bind_str(nullptr)
×
629
    .with_choices({ "fastq", "fastq.gz", "sam", "sam.gz", "bam", "ubam" });
×
630
  argparser.add("--read-group", "RG")
×
631
    .help("Add read-group (RG) information to SAM/BAM output. The value is "
×
632
          "expected to be a valid set of read-group tags separated by tabs, "
633
          "for example \"ID:DS-1\\tSM:TK-421\\tPL:ILLUMINA\". If the ID tag is "
634
          "not provided, the default ID \"1\" will be used")
635
    .bind_str(nullptr);
×
636
  argparser.add("--compression-level", "N")
×
637
    .help(
×
638
      "Sets the compression level for compressed output. Valid values are 0 to "
639
      "13: Level 0 is uncompressed but includes gzip headers/checksums, level "
640
      "1 is streamed for SAM/FASTQ output (this may be required in rare cases "
641
      "for compatibility), and levels 2 to 13 are block compressed using the "
642
      "BGZF format")
643
    .deprecated_alias("--gzip-level")
×
644
    .bind_uint(&compression_level)
×
645
    .with_default(5);
×
646

647
  //////////////////////////////////////////////////////////////////////////////
648
  argparser.add_header("PROCESSING:");
×
649

650
  argparser.add("--adapter1", "SEQ")
×
651
    .help("Adapter sequence expected to be found in mate 1 reads. Any 'N' in "
×
652
          "this sequence is treated as a wildcard")
653
    .bind_str(&adapter_1)
×
654
    .with_default("AGATCGGAAGAGCACACGTCTGAACTCCAGTCA");
×
655
  argparser.add("--adapter2", "SEQ")
×
656
    .help("Adapter sequence expected to be found in mate 2 reads. Any 'N' in "
×
657
          "this sequence is treated as a wildcard")
658
    .bind_str(&adapter_2)
×
659
    .with_default("AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT");
×
660
  argparser.add("--adapter-list", "FILE")
×
661
    .help("Read adapter pairs from the first two columns of a white-space "
×
662
          "separated table. AdapterRemoval will then select the best matching "
663
          "adapter pair for each pair of input reads when trimming. Only the "
664
          "first column is required for single-end trimming")
665
    .conflicts_with("--adapter1")
×
666
    .conflicts_with("--adapter2")
×
667
    .bind_str(&adapter_list);
×
668

669
  argparser.add_separator();
×
670
  argparser.add("--min-adapter-overlap", "N")
×
671
    .help("In single-end mode, reads are only trimmed if the overlap between "
×
672
          "read and the adapter is at least X bases long, not counting "
673
          "ambiguous nucleotides (Ns)")
674
    .deprecated_alias("--minadapteroverlap")
×
675
    .bind_uint(&min_adapter_overlap)
×
676
    .with_default(0);
×
677
  argparser.add("--mismatch-rate", "X")
×
678
    .help("Max error-rate when aligning reads and/or adapters. If > 1, the max "
×
679
          "error-rate is set to 1 / X; if < 0, the defaults are used, "
680
          "otherwise the user-supplied value is used directly [default: 1/6 "
681
          "for trimming; 1/10 when identifying adapters]")
682
    .deprecated_alias("--mm")
×
683
    .bind_double(&mismatch_threshold)
×
684
    .with_default(-1.0);
×
685
  argparser.add("--shift", "N")
×
686
    .help("Consider alignments where up to N nucleotides are missing from the "
×
687
          "5' termini")
688
    .bind_uint(&shift)
×
689
    .with_default(2);
×
690

691
  argparser.add_separator();
×
692
  argparser.add("--merge")
×
693
    .help("When set, paired ended read alignments of --merge-threshold or "
×
694
          "more bases are merged into a single consensus sequence. Merged "
695
          "reads are written to prefix.merged by default. Has no effect "
696
          "in single-end mode")
697
    .deprecated_alias("--collapse");
×
698
  argparser.add("--merge-threshold", "N")
×
699
    .help("Paired reads must overlap at least this many bases to be considered "
×
700
          "overlapping for the purpose of read merging")
701
    .deprecated_alias("--minalignmentlength")
×
702
    .bind_uint(&merge_threshold)
×
703
    .with_default(11);
×
704
  argparser.add("--merge-strategy", "X")
×
705
    .help(
×
706
      "The 'maximum' strategy uses Q=max(Q1,Q2) for matches while the "
707
      "'additive' strategy uses Q=Q1+Q2. Both strategies use Q=abs(Q1-Q2) for "
708
      "mismatches and picks the highest quality base, unless the qualities are "
709
      "the same in which case 'N' is used. Setting this option implies --merge")
710
    .bind_str(nullptr)
×
711
    .with_choices({ "maximum", "additive" })
×
712
    .with_default("maximum");
×
713
  argparser.add("--merge-quality-max", "N")
×
714
    .help("Sets the maximum Phred score for re-calculated quality scores when "
×
715
          "read merging is enabled with the 'additive' merging strategy")
716
    .deprecated_alias("--qualitymax")
×
717
    .bind_uint(&merge_quality_max)
×
718
    .with_default(41);
×
719
  argparser.add("--collapse-deterministic")
×
720
    .conflicts_with("--collapse-conservatively")
×
721
    .conflicts_with("--merge-strategy")
×
722
    .deprecated();
×
723
  argparser.add("--collapse-conservatively")
×
724
    .conflicts_with("--collapse-deterministic")
×
725
    .conflicts_with("--merge-strategy")
×
726
    .deprecated();
×
727

728
  argparser.add_separator();
×
729
  argparser.add("--prefix-read1", "X")
×
730
    .help("Adds the specified prefix to read 1 names [default: no prefix]")
×
731
    .bind_str(&prefix_read_1);
×
732
  argparser.add("--prefix-read2", "X")
×
733
    .help("Adds the specified prefix to read 2 names [default: no prefix]")
×
734
    .bind_str(&prefix_read_2);
×
735
  argparser.add("--prefix-merged", "X")
×
736
    .help("Adds the specified prefix to merged read names [default: no prefix]")
×
737
    .bind_str(&prefix_merged);
×
738

739
  //////////////////////////////////////////////////////////////////////////////
740
  argparser.add_header("QUALITY TRIMMING:");
×
741

742
#ifdef PRE_TRIM_5P
743
  argparser.add("--pre-trim5p", "N")
744
    .help("Trim the 5' of reads by a fixed amount after demultiplexing (if "
745
          "enabled) but before trimming adapters and low quality bases. "
746
          "Specify one value to trim mate 1 and mate 2 reads the same amount, "
747
          "or two values separated by a space to trim each mate a different "
748
          "amount [default: no trimming]")
749
    .bind_vec(&pre_trim5p)
750
    .with_max_values(2);
751
#endif
752
  argparser.add("--pre-trim3p", "N")
×
753
    .help("Trim the 3' of reads by a fixed amount after demultiplexing (if "
×
754
          "enabled) but before trimming adapters and low quality bases. "
755
          "Specify one value to trim mate 1 and mate 2 reads the same amount, "
756
          "or two values separated by a space to trim each mate a different "
757
          "amount [default: no trimming]")
758
    .bind_vec(&pre_trim3p)
×
759
    .with_max_values(2);
×
760

761
  argparser.add("--post-trim5p", "N")
×
762
    .help("Trim the 5' by a fixed amount after removing adapters, but before "
×
763
          "carrying out quality based trimming [default: no trimming]")
764
    .deprecated_alias("--trim5p")
×
765
    .bind_vec(&post_trim5p)
×
766
    .with_max_values(2);
×
767
  argparser.add("--post-trim3p", "N")
×
768
    .deprecated_alias("--trim3p")
×
769
    .help("Trim the 3' by a fixed amount after removing adapters, but before "
×
770
          "carrying out quality based trimming [default: no trimming]")
771
    .bind_vec(&post_trim3p)
×
772
    .with_max_values(2);
×
773

774
  argparser.add_separator();
×
775
  argparser.add("--quality-trimming", "method")
×
776
    .help("Strategy for trimming low quality bases: 'mott' for the modified "
×
777
          "Mott's algorithm; 'window' for window based trimming; 'per-base' "
778
          "for a per-base trimming of low quality base; and 'none' for no "
779
          "trimming of low quality bases")
780
    .bind_str(nullptr)
×
781
    .with_choices({ "mott", "window", "per-base", "none" })
×
782
    .with_default("mott");
×
783

784
  argparser.add("--trim-mott-rate", "X")
×
785
    .help("The threshold value used when performing trimming quality based "
×
786
          "trimming using the modified Mott's algorithm. A value of zero or "
787
          "less disables trimming; a value greater than one is assumed to be "
788
          "a Phred encoded error rate (e.g. 13 ~= 0.05)")
789
    .conflicts_with("--trim-windows")
×
790
    .conflicts_with("--trim-ns")
×
791
    .conflicts_with("--trim-qualities")
×
792
    .conflicts_with("--trim-min-quality")
×
793
    .bind_double(&trim_mott_rate)
×
794
    .with_default(0.05);
×
795
  argparser.add("--trim-windows", "X")
×
796
    .help("Specifies the size of the window used for '--quality-trimming "
×
797
          "window': If >= 1, this value will be used as the window size; if "
798
          "the value is < 1, window size is the read length times this value. "
799
          "If the resulting window size is 0 or larger than the read length, "
800
          "the read length is used as the window size")
801
    .deprecated_alias("--trimwindows")
×
802
    .conflicts_with("--trim-mott-rate")
×
803
    .conflicts_with("--trim-qualities")
×
804
    .bind_double(&trim_window_length)
×
805
    .with_default(0.1);
×
806
  argparser.add("--trim-min-quality", "N")
×
807
    .help("Inclusive minimum quality used when trimming low-quality bases with "
×
808
          "--quality-trimming options 'window' and 'per-base'")
809
    .deprecated_alias("--minquality")
×
810
    .conflicts_with("--trim-mott-rate")
×
811
    .bind_uint(&trim_quality_score)
×
812
    .with_default(2);
×
813
  argparser.add("--trim-ns")
×
814
    .help("If set, trim ambiguous bases (N) at 5'/3' termini when using the "
×
815
          "'window' or the 'per-base' trimming strategy")
816
    .conflicts_with("--trim-mott-rate")
×
817
    .deprecated_alias("--trimns")
×
818
    .bind_bool(&trim_ambiguous_bases);
×
819
  argparser.add("--trim-qualities")
×
820
    .help("If set, trim low-quality bases (< --trim-min-quality) when using "
×
821
          "the 'per-base' trimming strategy")
822
    .deprecated_alias("--trimqualities")
×
823
    .conflicts_with("--trim-mott-rate")
×
824
    .conflicts_with("--trim-windows")
×
825
    .bind_bool(&trim_low_quality_bases);
×
826

827
  argparser.add_separator();
×
828
  argparser.add("--pre-trim-polyx", "X")
×
829
    .help("Enable trimming of poly-X tails prior to read alignment and adapter "
×
830
          "trimming. Zero or more nucleotides (A, C, G, T) may be specified. "
831
          "Zero or more nucleotides may be specified after the option "
832
          "separated by spaces, with zero nucleotides corresponding to all of "
833
          "A, C, G, and T")
834
    .bind_vec(&pre_trim_poly_x_sink)
×
835
    .with_min_values(0);
×
836
  argparser.add("--post-trim-polyx", "X")
×
837
    .help("Enable trimming of poly-X tails after read alignment and adapter "
×
838
          "trimming/merging, but before trimming of low-quality bases. Merged "
839
          "reads are not trimmed by this option (both ends are 5'). Zero or "
840
          "more nucleotides (A, C, G, T) may be specified. Zero or more "
841
          "nucleotides may be specified after the option separated by spaces, "
842
          "with zero nucleotides corresponding to all of A, C, G, and T")
843
    .bind_vec(&post_trim_poly_x_sink)
×
844
    .with_min_values(0);
×
845
  argparser.add("--trim-polyx-threshold", "N")
×
846
    .help("The minimum number of bases in a poly-X tail")
×
847
    .bind_uint(&trim_poly_x_threshold)
×
848
    .with_default(10);
×
849

850
  argparser.add_separator();
×
851
  argparser.add("--preserve5p")
×
852
    .help("If set, bases at the 5p will not be trimmed by when performing "
×
853
          "quality based trimming of reads. Merged reads will not be quality "
854
          "trimmed when this option is enabled [default: 5p bases are trimmed]")
855
    .bind_bool(&preserve5p);
×
856

857
  //////////////////////////////////////////////////////////////////////////////
858
  argparser.add_header("FILTERING:");
×
859

860
  argparser.add("--max-ns", "N")
×
861
    .help("Reads containing more ambiguous bases (N) than this number after "
×
862
          "trimming are discarded [default: no maximum]")
863
    .deprecated_alias("--maxns")
×
864
    .bind_uint(&max_ambiguous_bases)
×
865
    .with_default(std::numeric_limits<unsigned>::max());
×
866

867
  argparser.add("--min-length", "N")
×
868
    .help("Reads shorter than this length following trimming are discarded")
×
869
    .deprecated_alias("--minlength")
×
870
    .bind_uint(&min_genomic_length)
×
871
    .with_default(15);
×
872
  argparser.add("--max-length", "N")
×
873
    .help("Reads longer than this length following trimming are discarded "
×
874
          "[default: no maximum]")
875
    .deprecated_alias("--maxlength")
×
876
    .bind_uint(&max_genomic_length)
×
877
    .with_default(std::numeric_limits<unsigned>::max());
×
878

879
  argparser.add("--min-mean-quality", "N")
×
880
    .help("Reads with a mean Phred encoded quality score (typically 0 to 42) "
×
881
          "less than this value following trimming are discarded [default: no "
882
          "minimum]")
883
    .bind_double(&min_mean_quality)
×
884
    .with_default(0.0);
×
885

886
  argparser.add("--min-complexity", "X")
×
887
    .help(
×
888
      "Filter reads with a complexity score less than this value. Complexity "
889
      "is measured as the fraction of positions that differ from the previous "
890
      "position. A suggested value is 0.3 [default: no minimum]")
891
    .bind_double(&min_complexity)
×
892
    .with_default(0);
×
893

894
  //////////////////////////////////////////////////////////////////////////////
895
  argparser.add_header("DEMULTIPLEXING:");
×
896

897
  argparser.add("--barcode-list", "FILE")
×
898
    .help("List of barcodes or barcode pairs for single or double-indexed "
×
899
          "demultiplexing. Note that both indexes should be specified for "
900
          "both single-end and paired-end trimming, if double-indexed "
901
          "multiplexing was used, in order to ensure that the demultiplexed "
902
          "reads can be trimmed correctly")
903
    .bind_str(&barcode_list);
×
904
  argparser.add("--multiple-barcodes")
×
905
    .help("Allow for more than one barcode (pair) for each sample. If this "
×
906
          "option is not specified, AdapterRemoval will abort if "
907
          "barcodes/barcode pairs do not to unique samples");
908
  argparser.add("--reversible-barcodes")
×
909
    .help("If set, it is assumed that barcodes can be sequences in both the "
×
910
          "barcode1-insert-barcode2 orientation and barcode2'-insert-barcode1' "
911
          "orientation, where ' indicates reverse complementation. This option "
912
          "requires two barcodes per sample (double-indexing)");
913

914
  argparser.add_separator();
×
915
  argparser.add("--barcode-mm", "N")
×
916
    .help("Maximum number of mismatches allowed when counting mismatches in "
×
917
          "both the mate 1 and the mate 2 barcode for paired reads")
918
    .bind_uint(&barcode_mm)
×
919
    .with_default(0);
×
920
  argparser.add("--barcode-mm-r1", "N")
×
921
    .help("Maximum number of mismatches allowed for the mate 1 barcode. "
×
922
          "Cannot be higher than the --barcode-mm value [default: same value "
923
          "as --barcode-mm]")
924
    .bind_uint(&barcode_mm_r1)
×
925
    .with_default(0);
×
926
  argparser.add("--barcode-mm-r2", "N")
×
927
    .help("Maximum number of mismatches allowed for the mate 2 barcode. "
×
928
          "Cannot be higher than the --barcode-mm value [default: same value "
929
          "as --barcode-mm]")
930
    .bind_uint(&barcode_mm_r2)
×
931
    .with_default(0);
×
932
  argparser.add("--demultiplex-only")
×
933
    .help("Only carry out demultiplexing using the list of barcodes "
×
934
          "supplied with --barcode-list. No other processing is done")
935
    .depends_on("--barcode-list")
×
936
    .conflicts_with("--report-only");
×
937

938
  //////////////////////////////////////////////////////////////////////////////
939
  argparser.add_header("REPORTS:");
×
940

941
  argparser.add("--report-only")
×
942
    .help("Write a report of the input data without performing any processing "
×
943
          "of the FASTQ reads. Adapter sequence inference is performed for PE "
944
          "data based on overlapping mate reads. A report including read "
945
          "processing, but without output, can be generated by setting "
946
          "--output options to /dev/null")
947
    .deprecated_alias("--identify-adapters")
×
948
    .conflicts_with("--barcode-list")
×
949
    .conflicts_with("--benchmark")
×
950
    .conflicts_with("--demultiplex-only");
×
951

952
  argparser.add("--report-sample-rate", "X")
×
953
    .help("Fraction of reads to use when generating base quality/composition "
×
954
          "curves for trimming reports. Using all data (--report-sample-nth "
955
          "1.0) results in an 10-30% decrease in throughput")
956
    .bind_double(&report_sample_rate)
×
957
    .with_default(0.1);
×
958
  argparser.add("--report-duplication", "N")
×
959
    .help("FastQC based duplicate detection, based on the frequency of the "
×
960
          "first N unique sequences observed. A value of 100,000 corresponds "
961
          "to FastQC defaults; a value of 0 disables the analysis")
962
    .bind_uint(&report_duplication)
×
963
    .with_default(0);
×
964

965
  //////////////////////////////////////////////////////////////////////////////
966
  argparser.add_header("LOGGING:");
×
967

968
  argparser.add("--log-level", "X")
×
969
    .help("The minimum severity of messages to be written to STDERR")
×
970
    .bind_str(&log_level)
×
971
    .with_choices({ "debug", "info", "warning", "error" })
×
972
    .with_default("info");
×
973

974
  argparser.add("--log-colors", "X")
×
975
    .help("Enable/disable the use of colors when writing log messages. If set "
×
976
          "to auto, colors will only be enabled if STDERR is a terminal and "
977
          "the NO_COLORS is environmental variable is not set")
978
    .bind_str(&log_color)
×
979
    .with_choices({ "auto", "always", "never" })
×
980
    .with_default("auto");
×
981
  argparser.add("--log-progress", "X")
×
982
    .help("Specify the type of progress reports used. If set to auto, then a "
×
983
          "spinner will be used if STDERR is a terminal and the NO_COLORS "
984
          "environmental variable is not set, otherwise logging will be used")
985
    .bind_str(nullptr)
×
986
    .with_choices({ "auto", "log", "spin", "never" })
×
987
    .with_default("auto");
×
988
}
989

990
argparse::parse_result
991
userconfig::parse_args(const string_vec& argvec)
×
992
{
993
  args = argvec;
×
994
  if (args.size() <= 1) {
×
995
    argparser.print_help();
×
996
    return argparse::parse_result::error;
997
  }
998

999
  // ad-hoc arg parsing to make argparse output consistent with rest of run
1000
  configure_log_colors(try_parse_argument(args, "--log-color", "auto"), true);
×
1001
  configure_log_levels(try_parse_argument(args, "--log-level", "info"), true);
×
1002

1003
  const argparse::parse_result result = argparser.parse_args(args);
×
1004
  if (result != argparse::parse_result::ok) {
×
1005
    return result;
1006
  }
1007

1008
  configure_log_colors(log_color);
×
1009
  configure_log_levels(log_level);
×
1010
  log_progress = configure_log_progress(argparser.value("--log-progress"));
×
1011

1012
  {
×
1013
    const auto degenerate = argparser.is_set("--mask-degenerate-bases")
×
1014
                              ? degenerate_encoding::mask
×
1015
                              : degenerate_encoding::reject;
×
1016
    const auto uracils = argparser.is_set("--convert-uracils")
×
1017
                           ? uracil_encoding::convert
×
1018
                           : uracil_encoding::reject;
×
1019

1020
    io_encoding = configure_encoding(quality_input_base, degenerate, uracils);
×
1021
  }
1022

1023
  if (argparser.is_set("--mate-separator")) {
×
1024
    if (mate_separator_str.size() != 1) {
×
1025
      log::error() << "The argument for --mate-separator must be "
×
1026
                      "exactly one character long, not "
×
1027
                   << mate_separator_str.size() << " characters!";
×
1028
      return argparse::parse_result::error;
×
1029
    } else {
1030
      mate_separator = mate_separator_str.at(0);
×
1031
    }
1032
  }
1033

1034
  if (argparser.is_set("--demultiplex-only")) {
×
1035
    run_type = ar_command::demultiplex_only;
×
1036
  } else if (argparser.is_set("--report-only")) {
×
1037
    run_type = ar_command::report_only;
×
1038
  } else if (argparser.is_set("--benchmark")) {
×
1039
    run_type = ar_command::benchmark;
×
1040
  }
1041

1042
  if (trim_quality_score > static_cast<unsigned>(PHRED_SCORE_MAX)) {
×
1043
    log::error() << "--trim-min-quality must be in the range 0 to "
×
1044
                 << PHRED_SCORE_MAX << ", not " << trim_quality_score;
×
1045
    return argparse::parse_result::error;
×
1046
  } else if (trim_window_length < 0) {
×
1047
    log::error() << "--trim-windows must be greater than or equal to zero, not "
×
1048
                 << trim_window_length;
×
1049
    return argparse::parse_result::error;
×
1050
  } else if (trim_mott_rate < 0) {
×
1051
    log::error() << "--trim-mott-rate must be greater than or equal to zero, "
×
1052
                 << "not " << trim_window_length;
×
1053
    return argparse::parse_result::error;
×
1054
  } else {
1055
    const auto strategy = argparser.value("--quality-trimming");
×
1056
    if (strategy == "mott") {
×
1057
      trim = trimming_strategy::mott;
×
1058

1059
      if (trim_mott_rate > 1) {
×
1060
        trim_mott_rate = std::pow(10.0, trim_mott_rate / -10.0);
×
1061
      }
1062
    } else if (strategy == "window") {
×
1063
      trim = trimming_strategy::window;
×
1064
    } else if (strategy == "per-base") {
×
1065
      trim = trimming_strategy::per_base;
×
1066

1067
      if (!trim_low_quality_bases && !trim_ambiguous_bases) {
×
1068
        log::error() << "The per-base quality trimming strategy is enabled, "
×
1069
                     << "but neither trimming of low-quality bases (via "
×
1070
                     << "--trim-qualities) nor trimming of Ns (via --trim-ns) "
×
1071
                     << "is enabled.";
×
1072
        return argparse::parse_result::error;
×
1073
      }
1074
    } else if (strategy == "none") {
×
1075
      trim = trimming_strategy::none;
×
1076
    } else {
1077
      AR_FAIL(shell_escape(strategy));
×
1078
    }
1079
  }
1080

1081
  if (min_complexity < 0.0 || min_complexity > 1.0) {
×
1082
    log::error() << "--min-complexity must be a value in the range 0 to "
×
1083
                 << "1, not " << min_complexity;
×
1084
    return argparse::parse_result::error;
×
1085
  }
1086

1087
  // Check for invalid combinations of settings
1088
  if (input_files_1.empty() && input_files_2.empty()) {
×
1089
    log::error()
×
1090
      << "No input files (--in-file1 / --in-file2) specified.\n"
×
1091
      << "Please specify at least one input file using --in-file1 FILENAME.";
×
1092

1093
    return argparse::parse_result::error;
×
1094
  } else if (!input_files_2.empty() &&
×
1095
             (input_files_1.size() != input_files_2.size())) {
×
1096
    log::error()
×
1097
      << "Different number of files specified for --in-file1 and --in-file2.";
×
1098

1099
    return argparse::parse_result::error;
×
1100
  } else if (!input_files_2.empty()) {
×
1101
    paired_ended_mode = true;
×
1102
  }
1103

1104
  interleaved_input |= interleaved;
×
1105
  interleaved_output |= interleaved;
×
1106

1107
  if (interleaved_input) {
×
1108
    // Enable paired end mode .. other than the FASTQ reader, all other
1109
    // parts of the pipeline simply run in paired-end mode.
1110
    paired_ended_mode = true;
×
1111
  }
1112

1113
  if (paired_ended_mode) {
×
1114
    min_adapter_overlap = 0;
×
1115

1116
    // merge related options implies --merge
1117
    if (argparser.is_set("--collapse-deterministic")) {
×
1118
      merge = merge_strategy::additive;
×
1119
    } else if (argparser.is_set("--collapse-conservatively")) {
×
1120
      merge = merge_strategy::maximum;
×
1121
    } else if (argparser.is_set("--merge") ||
×
1122
               argparser.is_set("--merge-strategy")) {
×
1123
      const auto strategy = argparser.value("--merge-strategy");
×
1124
      if (strategy == "maximum") {
×
1125
        merge = merge_strategy::maximum;
×
1126
      } else if (strategy == "additive") {
×
1127
        merge = merge_strategy::additive;
×
1128
      } else {
1129
        AR_FAIL(strategy);
×
1130
      }
1131
    }
1132
  }
1133

1134
  // (Optionally) read adapters from file and validate
1135
  if (!setup_adapter_sequences()) {
×
1136
    return argparse::parse_result::error;
1137
  }
1138

1139
  // (Optionally) read barcodes from file and validate
1140
  if (!setup_demultiplexing()) {
×
1141
    return argparse::parse_result::error;
1142
  }
1143

1144
  try {
×
1145
    samples.set_read_group(argparser.value("--read-group"));
×
1146
  } catch (const std::invalid_argument& error) {
×
1147
    log::error() << "Invalid argument --read-group "
×
1148
                 << log_escape(argparser.value("--read-group")) << ": "
×
1149
                 << error.what();
×
1150

1151
    return argparse::parse_result::error;
×
1152
  }
×
1153

1154
  // Set mismatch threshold
1155
  if (mismatch_threshold > 1) {
×
1156
    mismatch_threshold = 1.0 / mismatch_threshold;
×
1157
  } else if (mismatch_threshold < 0) {
×
1158
    if (run_type == ar_command::report_only) {
×
1159
      mismatch_threshold = 1.0 / 10.0;
×
1160
    } else {
1161
      // Defaults for PE / SE trimming (changed in v3)
1162
      mismatch_threshold = 1.0 / 6.0;
×
1163
    }
1164
  }
1165

1166
  if (compression_level > 13) {
×
1167
    log::error() << "--compression-level must be in the range 0 to 13, not "
×
1168
                 << compression_level;
×
1169
    return argparse::parse_result::error;
×
1170
  }
1171

1172
  if (!max_threads) {
×
1173
    log::error() << "--threads must be at least 1!";
×
1174
    return argparse::parse_result::error;
×
1175
  }
1176

1177
  {
×
1178
    bool found = false;
×
1179
    const auto simd_choice = argparser.value("--simd");
×
1180
    for (const auto is : simd::supported()) {
×
1181
      if (simd_choice == simd::name(is)) {
×
1182
        simd = is;
×
1183
        found = true;
×
1184
        break;
×
1185
      }
1186
    }
1187

1188
    AR_REQUIRE(found);
×
1189
  }
1190

1191
  using fixed_trimming =
×
1192
    std::tuple<const char*, const string_vec&, std::pair<unsigned, unsigned>&>;
1193

1194
  const std::vector<fixed_trimming> fixed_trimming_options = {
×
1195
#ifdef PRE_TRIM_5P
1196
    { "--pre-trim5p", pre_trim5p, pre_trim_fixed_5p },
1197
#endif
1198
    { "--pre-trim3p", pre_trim3p, pre_trim_fixed_3p },
×
1199
    { "--post-trim5p", post_trim5p, post_trim_fixed_5p },
×
1200
    { "--post-trim3p", post_trim3p, post_trim_fixed_3p },
×
1201
  };
1202

1203
  for (const auto& it : fixed_trimming_options) {
×
1204
    try {
×
1205
      if (argparser.is_set(std::get<0>(it))) {
×
1206
        std::get<2>(it) = parse_trim_argument(std::get<1>(it));
×
1207
      }
1208
    } catch (const std::invalid_argument& error) {
×
1209
      log::error() << "Could not parse " << std::get<0>(it)
×
1210
                   << " argument(s): " << error.what();
×
1211

1212
      return argparse::parse_result::error;
×
1213
    }
×
1214
  }
1215

1216
  if (!parse_output_formats(argparser, out_file_format, out_stdout_format)) {
×
1217
    return argparse::parse_result::error;
1218
  }
1219

1220
  // An empty prefix or directory would results in the creation of dot-files
1221
  if (out_prefix.empty()) {
×
1222
    log::error() << "--out-prefix must be a non-empty value.";
×
1223

1224
    return argparse::parse_result::error;
×
1225
  } else if (out_prefix.back() == '/') {
×
1226
    log::error() << "--out-prefix must not be a directory: "
×
1227
                 << shell_escape(out_prefix);
×
1228

1229
    return argparse::parse_result::error;
×
1230
  } else if (out_prefix == DEV_NULL && run_type != ar_command::benchmark) {
×
1231
    // Relevant output options depend on input files and other settings
1232
    const std::vector<std::pair<std::string, bool>> output_keys = {
×
1233
      { "--out-file1",
1234
        is_adapter_trimming_enabled() || is_demultiplexing_enabled() },
×
1235
      { "--out-file2",
1236
        is_adapter_trimming_enabled() || is_demultiplexing_enabled() },
×
1237
      { "--out-singleton", is_any_filtering_enabled() },
×
1238
      { "--out-merged", is_read_merging_enabled() },
×
1239
      { "--out-discarded", is_any_filtering_enabled() },
×
1240
      { "--out-unidentified1", is_demultiplexing_enabled() },
×
1241
      { "--out-unidentified2", is_demultiplexing_enabled() },
×
1242
      { "--out-json", true },
1243
      { "--out-html", true },
1244
      { "--out-prefix", true },
1245
    };
1246

1247
    string_vec required_keys;
×
1248
    for (const auto& it : output_keys) {
×
1249
      if (it.second) {
×
1250
        required_keys.push_back(it.first);
×
1251
      }
1252
    }
1253

1254
    const auto user_keys = user_supplied_keys(argparser, required_keys);
×
1255
    if (user_keys.empty()) {
×
1256
      auto error = log::error();
×
1257
      error << "No output would be generated; at least one of the options "
×
1258
            << join_text(required_keys, ", ", ", or ")
×
1259
            << " must be used. The --out-prefix option automatically enables "
1260
               "all relevant --out options.";
×
1261

1262
      return argparse::parse_result::error;
×
1263
    }
1264
  }
1265

1266
  {
×
1267
    const std::string key = "--pre-trim-polyx";
×
1268
    if (argparser.is_set(key) &&
×
1269
        !parse_poly_x_option(key, pre_trim_poly_x_sink, pre_trim_poly_x)) {
×
1270
      return argparse::parse_result::error;
×
1271
    }
1272
  }
1273

1274
  {
×
1275
    const std::string key = "--post-trim-polyx";
×
1276
    if (argparser.is_set(key) &&
×
1277
        !parse_poly_x_option(key, post_trim_poly_x_sink, post_trim_poly_x)) {
×
1278
      return argparse::parse_result::error;
×
1279
    }
1280
  }
1281

1282
  if (!min_genomic_length) {
×
1283
    log::warn() << "--min-length is set to 0. This may produce FASTQ files "
×
1284
                   "that are incompatible with some tools!";
×
1285
  }
1286

1287
  if (!parse_head(argparser.value("--head"), head)) {
×
1288
    return argparse::parse_result::error;
×
1289
  }
1290

1291
  return argparse::parse_result::ok;
1292
}
1293

1294
bool
1295
userconfig::is_good_alignment(const alignment_info& alignment) const
×
1296
{
1297
  if (!alignment.length || alignment.score() <= 0) {
×
1298
    return false;
×
1299
  }
1300

1301
  // Only pairs of called bases are considered part of the alignment
1302
  const size_t n_aligned = alignment.length - alignment.n_ambiguous;
×
1303
  if (n_aligned < min_adapter_overlap && !paired_ended_mode) {
×
1304
    return false;
1305
  }
1306

1307
  auto mm_threshold = static_cast<size_t>(mismatch_threshold * n_aligned);
×
1308
  if (n_aligned < 6) {
×
1309
    mm_threshold = 0;
×
1310
  } else if (n_aligned < 10) {
×
1311
    // Allow at most 1 mismatch, possibly set to 0 by the user
1312
    mm_threshold = std::min<size_t>(1, mm_threshold);
×
1313
  }
1314

1315
  return alignment.n_mismatches <= mm_threshold;
×
1316
}
1317

1318
bool
1319
userconfig::can_merge_alignment(const alignment_info& alignment) const
×
1320
{
1321
  if (alignment.length < alignment.n_ambiguous) {
×
1322
    throw std::invalid_argument("#ambiguous bases > read length");
×
1323
  }
1324

1325
  return alignment.length - alignment.n_ambiguous >= merge_threshold;
×
1326
}
1327

1328
/**
 * Determines the output format to use for `filename`: the STDOUT format for
 * "/dev/stdout", otherwise the default file format, which is handed to
 * output_files::parse_extension together with the filename so that it may be
 * updated.
 */
output_format
userconfig::infer_output_format(const std::string& filename) const
{
  if (filename != "/dev/stdout") {
    output_format format = out_file_format;
    // Parse failures are ignored here; default to --out-format
    output_files::parse_extension(filename, format);

    return format;
  }

  return out_stdout_format;
}
1341

1342
/**
 * Builds the collection of output files for the current configuration: the
 * JSON/HTML report files, the files for unidentified reads (only when
 * demultiplexing is enabled), and one set of per-sample files for each sample.
 */
output_files
userconfig::get_output_filenames() const
{
  output_files result;

  result.settings_json = new_output_file("--out-json", { ".json" }).name;
  result.settings_html = new_output_file("--out-html", { ".html" }).name;

  // Interleaved output is written to a single file without .r1/.r2 infixes
  const std::string extension{ output_files::file_extension(out_file_format) };
  const std::string suffix_1 =
    interleaved_output ? extension : ".r1" + extension;
  const std::string suffix_2 =
    interleaved_output ? extension : ".r2" + extension;

  if (is_demultiplexing_enabled()) {
    result.unidentified_1 =
      new_output_file("--out-unidentified1", { ".unidentified", suffix_1 });

    if (paired_ended_mode && interleaved_output) {
      result.unidentified_2 = result.unidentified_1;
    } else if (paired_ended_mode) {
      result.unidentified_2 =
        new_output_file("--out-unidentified2", { ".unidentified", suffix_2 });
    }
  }

  for (const auto& sample : samples) {
    const auto& sample_name = sample.name();
    sample_output_files sample_files;

    const auto first_mate =
      new_output_file("--out-file1", { sample_name, suffix_1 });
    sample_files.set_file(read_type::mate_1, first_mate);

    if (paired_ended_mode && interleaved_output) {
      // Both mates share a single output file
      sample_files.set_file(read_type::mate_2, first_mate);
    } else if (paired_ended_mode) {
      sample_files.set_file(
        read_type::mate_2,
        new_output_file("--out-file2", { sample_name, suffix_2 }));
    }

    const bool trimming = (run_type == ar_command::trim_adapters);

    if (trimming && is_any_filtering_enabled()) {
      sample_files.set_file(
        read_type::discarded,
        new_output_file("--out-discarded",
                        { sample_name, ".discarded", extension }));
    }

    if (trimming && paired_ended_mode) {
      if (is_any_filtering_enabled()) {
        sample_files.set_file(
          read_type::singleton,
          new_output_file("--out-singleton",
                          { sample_name, ".singleton", extension }));
      }

      if (is_read_merging_enabled()) {
        sample_files.set_file(
          read_type::merged,
          new_output_file("--out-merged",
                          { sample_name, ".merged", extension }));
      }
    }

    result.add_sample(std::move(sample_files));
  }

  return result;
}
1411

1412
/**
 * Creates the output file description for the option `key`.
 *
 * If the option was set by the user and demultiplexing is disabled, the
 * user-supplied filename is used as is. If the option was not set and either
 * the output prefix is DEV_NULL or the key is "--out-discarded", the file is
 * DEV_NULL. Otherwise the filename is built from the user-supplied value (or
 * the output prefix) followed by each of `values`, with a '.' inserted before
 * any non-empty value that does not already start with one.
 */
output_file
userconfig::new_output_file(const std::string& key,
                            const string_vec& values) const
{
  const bool user_supplied = argparser.is_set(key);

  if (user_supplied && !is_demultiplexing_enabled()) {
    const auto path = argparser.value(key);

    return { path, infer_output_format(path) };
  }

  if (!user_supplied) {
    if (out_prefix == DEV_NULL || key == "--out-discarded") {
      return { DEV_NULL, output_format::fastq };
    }
  }

  std::string path;
  if (user_supplied) {
    path = argparser.value(key);
  } else {
    path = out_prefix;
  }

  for (const auto& part : values) {
    const bool needs_dot = !part.empty() && part.front() != '.';
    if (needs_dot) {
      path.push_back('.');
    }

    path.append(part);
  }

  return output_file{ path, infer_output_format(path) };
}
1441

1442
bool
1443
check_and_set_barcode_mm(const argparse::parser& argparser,
×
1444
                         const std::string& key,
1445
                         unsigned barcode_mm,
1446
                         unsigned& dst)
1447
{
1448
  if (!argparser.is_set(key)) {
×
1449
    dst = barcode_mm;
×
1450
  } else if (dst > barcode_mm) {
×
1451
    log::error()
×
1452
      << "The maximum number of errors for " << key
×
1453
      << " is set \n"
1454
         "to a higher value than the total number of mismatches allowed\n"
1455
         "for barcodes (--barcode-mm). Please correct these settings.";
×
1456
    return false;
×
1457
  }
1458

1459
  return true;
1460
}
1461

1462
bool
1463
userconfig::is_adapter_trimming_enabled() const
×
1464
{
1465
  return run_type == ar_command::trim_adapters;
×
1466
}
1467

1468
bool
1469
userconfig::is_demultiplexing_enabled() const
×
1470
{
1471
  return !barcode_list.empty();
×
1472
}
1473

1474
bool
1475
userconfig::is_read_merging_enabled() const
×
1476
{
1477
  return is_adapter_trimming_enabled() && merge != merge_strategy::none;
×
1478
}
1479

1480
bool
1481
userconfig::is_any_quality_trimming_enabled() const
×
1482
{
1483
  return is_adapter_trimming_enabled() &&
×
1484
         (is_low_quality_trimming_enabled() ||
×
1485
          is_terminal_base_pre_trimming_enabled() ||
×
1486
          is_terminal_base_post_trimming_enabled() ||
×
1487
          is_poly_x_tail_pre_trimming_enabled() ||
×
1488
          is_poly_x_tail_post_trimming_enabled());
×
1489
}
1490

1491
bool
1492
userconfig::is_low_quality_trimming_enabled() const
×
1493
{
1494
  return trim != trimming_strategy::none;
×
1495
}
1496

1497
bool
1498
userconfig::is_terminal_base_pre_trimming_enabled() const
×
1499
{
1500
  return
×
1501
#ifdef PRE_TRIM_5P
1502
    pre_trim_fixed_5p.first || pre_trim_fixed_5p.second ||
1503
#endif
1504
    pre_trim_fixed_3p.first || pre_trim_fixed_3p.second;
×
1505
}
1506

1507
bool
1508
userconfig::is_terminal_base_post_trimming_enabled() const
×
1509
{
1510
  return post_trim_fixed_5p.first || post_trim_fixed_5p.second ||
×
1511
         post_trim_fixed_3p.first || post_trim_fixed_3p.second;
×
1512
}
1513

1514
bool
1515
userconfig::is_poly_x_tail_pre_trimming_enabled() const
×
1516
{
1517
  return !pre_trim_poly_x.empty();
×
1518
}
1519

1520
bool
1521
userconfig::is_poly_x_tail_post_trimming_enabled() const
×
1522
{
1523
  return !post_trim_poly_x.empty();
×
1524
}
1525

1526
bool
1527
userconfig::is_any_filtering_enabled() const
×
1528
{
1529
  return is_adapter_trimming_enabled() &&
×
1530
         (is_short_read_filtering_enabled() ||
×
1531
          is_long_read_filtering_enabled() ||
×
1532
          is_ambiguous_base_filtering_enabled() ||
×
1533
          is_mean_quality_filtering_enabled() ||
×
1534
          is_low_complexity_filtering_enabled());
×
1535
}
1536

1537
bool
1538
userconfig::is_short_read_filtering_enabled() const
×
1539
{
1540
  return min_genomic_length > 0;
×
1541
}
1542

1543
bool
1544
userconfig::is_long_read_filtering_enabled() const
×
1545
{
1546
  return max_genomic_length != std::numeric_limits<unsigned>::max();
×
1547
}
1548

1549
bool
1550
userconfig::is_ambiguous_base_filtering_enabled() const
×
1551
{
1552
  return max_ambiguous_bases != std::numeric_limits<unsigned>::max();
×
1553
}
1554

1555
bool
1556
userconfig::is_mean_quality_filtering_enabled() const
×
1557
{
1558
  return min_mean_quality > 0;
×
1559
}
1560

1561
bool
1562
userconfig::is_low_complexity_filtering_enabled() const
×
1563
{
1564
  return min_complexity > 0;
×
1565
}
1566

1567
bool
1568
userconfig::setup_adapter_sequences()
×
1569
{
1570
  adapter_set adapters;
×
1571
  if (argparser.is_set("--adapter-list")) {
×
1572
    try {
×
1573
      adapters.load(adapter_list, paired_ended_mode);
×
1574
    } catch (const std::exception& error) {
×
1575
      log::error() << "Error reading adapters from " << log_escape(adapter_list)
×
1576
                   << ": " << error.what();
×
1577
      return false;
×
1578
    }
×
1579

1580
    if (adapters.size()) {
×
1581
      log::info() << "Read " << adapters.size()
×
1582
                  << " adapters / adapter pairs from '" << adapter_list << "'";
×
1583
    } else {
1584
      log::error() << "No adapter sequences found in table!";
×
1585
      return false;
×
1586
    }
1587
  } else {
1588
    try {
×
1589
      adapters.add(adapter_1, adapter_2);
×
1590
    } catch (const fastq_error& error) {
×
1591
      log::error() << "Error parsing adapter sequence(s):\n"
×
1592
                   << "   " << error.what();
×
1593

1594
      return false;
×
1595
    }
×
1596
  }
1597

1598
  samples.set_adapters(std::move(adapters));
×
1599

1600
  return true;
×
1601
}
1602

1603
bool
1604
userconfig::setup_demultiplexing()
×
1605
{
1606
  if (!argparser.is_set("--barcode-mm")) {
×
1607
    barcode_mm = barcode_mm_r1 + barcode_mm_r2;
×
1608
  }
1609

1610
  if (!check_and_set_barcode_mm(
×
1611
        argparser, "--barcode-mm-r1", barcode_mm, barcode_mm_r1)) {
×
1612
    return false;
1613
  }
1614

1615
  if (!check_and_set_barcode_mm(
×
1616
        argparser, "--barcode-mm-r2", barcode_mm, barcode_mm_r2)) {
×
1617
    return false;
1618
  }
1619

1620
  if (argparser.is_set("--barcode-list")) {
×
1621
    const auto config =
×
1622
      barcode_config()
×
1623
        .paired_end_mode(paired_ended_mode)
×
1624
        .allow_multiple_barcodes(argparser.is_set("--multiple-barcodes"))
×
1625
        .unidirectional_barcodes(!argparser.is_set("--reversible-barcodes"));
×
1626

1627
    try {
×
1628
      samples.load(barcode_list, config);
×
1629
    } catch (const std::exception& error) {
×
1630
      log::error() << "Error reading barcodes from " << log_escape(barcode_list)
×
1631
                   << ": " << error.what();
×
1632
      return false;
×
1633
    }
×
1634

1635
    if (samples.size()) {
×
1636
      log::info() << "Read " << samples.size() << " sets of barcodes from "
×
1637
                  << shell_escape(barcode_list);
×
1638
    } else {
1639
      log::error() << "No barcodes sequences found in table!";
×
1640
      return false;
×
1641
    }
1642
  }
1643

1644
  const auto& output_files = get_output_filenames();
×
1645

1646
  return check_input_and_output("--in-file1", input_files_1, output_files) &&
×
1647
         check_input_and_output("--in-file2", input_files_2, output_files);
×
1648
}
1649

1650
} // namespace adapterremoval
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc