• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #117

25 May 2025 03:01PM UTC coverage: 66.932% (-0.07%) from 67.006%
#117

push

travis-ci

web-flow
iwyu and reduce build-time inter-dependencies (#144)

26 of 145 new or added lines in 20 files covered. (17.93%)

89 existing lines in 5 files now uncovered.

9738 of 14549 relevant lines covered (66.93%)

3041.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/userconfig.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2011 Stinus Lindgreen <stinus@binf.ku.dk>
3
// SPDX-FileCopyrightText: 2014 Mikkel Schubert <mikkelsch@gmail.com>
4
#include "userconfig.hpp"    // declarations
5
#include "alignment.hpp"     // for alignment_info
6
#include "argparse.hpp"      // for parser, parse_result
7
#include "commontypes.hpp"   // for string_vec, DEV_STDOUT, DEV_STDERR, ...
8
#include "debug.hpp"         // for AR_REQUIRE, AR_FAIL
9
#include "errors.hpp"        // for fastq_error
10
#include "fastq_enc.hpp"     // for PHRED_SCORE_MAX
11
#include "licenses.hpp"      // for LICENSES
12
#include "logging.hpp"       // for log_stream, error, set_level, set_colors, ..
13
#include "main.hpp"          // for HELPTEXT, NAME, VERSION
14
#include "output.hpp"        // for DEV_NULL, output_files, output_file
15
#include "progress.hpp"      // for progress_type, progress_type::simple, ...
16
#include "sequence.hpp"      // for dna_sequence
17
#include "sequence_sets.hpp" // for sample_set
18
#include "simd.hpp"          // for size_t, name, supported, instruction_set
19
#include "strutils.hpp"      // for shell_escape, str_to_u32
20
#include <algorithm>         // for find, max, min
21
#include <array>             // for array
22
#include <cerrno>            // for errno
23
#include <cstdlib>           // for getenv
24
#include <cstring>           // for size_t, strerror, strcmp
25
#include <exception>         // for exception
26
#include <filesystem>        // for weakly_canonical
27
#include <limits>            // for numeric_limits
28
#include <memory>            // for unique_ptr, make_unique
29
#include <stdexcept>         // for invalid_argument
30
#include <string>            // for string, basic_string, operator==, operator+
31
#include <string_view>       // for string_view
32
#include <tuple>             // for get, tuple
33
#include <unistd.h>          // for access, isatty, R_OK, STDERR_FILENO
34

35
namespace adapterremoval {
36

37
namespace {
38

39
/** Preamble shown at the top of the command-line help text */
const std::string_view HELPTEXT =
  "AdapterRemoval searches for and removes remnant adapter sequences, "
  "poly-X tails and low-quality base from FASTQ reads. "
  "See `man adapterremoval3` or https://adapterremoval.readthedocs.io/ "
  "for more information\n\n"
  "For comments, suggestions, or other feedback please use\n"
  "  https://github.com/MikkelSchubert/adapterremoval/issues/new\n\n"
  "If you use this program, then please cite Schubert et al. 2016:\n"
  "  https://doi.org/10.1186/s13104-016-1900-2\n";
49

50
////////////////////////////////////////////////////////////////////////////////
51
// Helper functions
52

53
std::pair<unsigned, unsigned>
54
parse_trim_argument(const string_vec& values)
×
55
{
56
  unsigned mate_1 = 0;
×
57
  unsigned mate_2 = 0;
×
58

59
  switch (values.size()) {
×
60
    case 1:
×
61
      mate_1 = str_to_u32(values.front());
×
62
      mate_2 = mate_1;
×
63
      break;
×
64

65
    case 2:
×
66
      mate_1 = str_to_u32(values.front());
×
67
      mate_2 = str_to_u32(values.back());
×
68
      break;
×
69

70
    default:
×
71
      throw std::invalid_argument("please specify exactly one or two values");
×
72
  }
73

74
  return { mate_1, mate_2 };
×
75
}
76

77
bool
78
parse_poly_x_option(const std::string& key,
×
79
                    const string_vec& values,
80
                    std::string& out)
81
{
82
  out.clear();
×
83
  if (values.empty()) {
×
84
    out = "ACGT";
×
85
    return true;
86
  }
87

88
  std::array<bool, ACGT::indices> enabled = {};
×
89
  for (const auto& value : values) {
×
90
    for (const auto nuc : to_upper(value)) {
×
91
      switch (nuc) {
×
92
        case 'A':
×
93
        case 'C':
×
94
        case 'G':
×
95
        case 'T':
×
96
          enabled.at(ACGT::to_index(nuc)) = true;
×
97
          break;
×
98

99
        default:
×
100
          log::error() << "Option " << key << " called with invalid value "
×
101
                       << shell_escape(value) << ". Only A, C, G, and T are "
×
102
                       << "permitted!";
×
103

104
          return false;
×
105
      }
106
    }
107
  }
108

109
  for (const auto nuc : ACGT::values) {
×
110
    if (enabled.at(ACGT::to_index(nuc))) {
×
111
      out.push_back(nuc);
×
112
    }
113
  }
114

115
  return true;
116
}
117

118
bool
119
parse_counts(const argparse::parser& args,
×
120
             const std::string& key,
121
             uint64_t& out)
122
{
123
  const auto sink = args.value(std::string{ key });
×
124
  if (sink.empty()) {
×
125
    return true;
126
  }
127

128
  uint64_t unit = 1;
×
129
  std::string sink_without_unit = sink;
×
130
  if (sink.back() < '0' || sink.back() > '9') {
×
131
    switch (sink.back()) {
×
132
      case 'k':
133
      case 'K':
134
        unit = 1000;
135
        break;
136

137
      case 'm':
×
138
      case 'M':
×
139
        unit = 1000'000;
×
140
        break;
×
141

142
      case 'g':
×
143
      case 'G':
×
144
        unit = 1000'000'000;
×
145
        break;
×
146

147
      default:
×
148
        log::error() << "Invalid unit in command-line option " << key
×
149
                     << shell_escape(sink);
×
150
        return false;
×
151
    }
152

153
    sink_without_unit.pop_back();
×
154
  }
155

156
  try {
×
157
    // This should not be able to overflow as log2(2^32 * 1e9) ~= 62,
158
    // but will need to be changed if we want to allow large raw numbers
159
    out = static_cast<uint64_t>(str_to_u32(sink_without_unit)) * unit;
×
160
  } catch (const std::invalid_argument&) {
×
161
    log::error() << "Invalid value in command-line option --sink "
×
162
                 << shell_escape(sink);
×
163
    return false;
×
164
  }
×
165

166
  return true;
×
167
}
168

169
bool
170
check_no_clobber(const std::string& label,
×
171
                 const string_vec& in_files,
172
                 const output_file& out_file)
173
{
174
  for (const auto& in_file : in_files) {
×
175
    if (in_file == out_file.name && in_file != DEV_NULL) {
×
176
      log::error() << "Input file would be overwritten: " << label << " "
×
177
                   << in_file;
×
178
      return false;
×
179
    }
180
  }
181

182
  return true;
×
183
}
184

185
/** Replace the STDIN pseudo-filename with the device path */
186
void
187
normalize_input_file(std::string& filename)
×
188
{
189
  if (filename == DEV_PIPE) {
×
190
    filename = DEV_STDIN;
×
191
  }
192
}
193

194
/** Replace the STDIN pseudo-filename with the device path */
195
void
196
normalize_output_file(std::string& filename)
×
197
{
198
  if (filename == DEV_PIPE) {
×
199
    filename = DEV_STDOUT;
×
200
  }
201
}
202

203
void
204
append_normalized_input_files(string_pair_vec& out, const string_vec& filenames)
×
205
{
206
  for (const auto& filename : filenames) {
×
207
    try {
×
208
      out.emplace_back(std::filesystem::weakly_canonical(filename), filename);
×
209
    } catch (const std::filesystem::filesystem_error&) {
×
210
      // Permission errors are handled by the explicit access checks below
211
      out.emplace_back(filename, filename);
×
212
    }
×
213
  }
214
}
215

216
bool
217
check_input_files(const string_vec& filenames_1, const string_vec& filenames_2)
×
218
{
219
  string_pair_vec filenames;
×
220
  append_normalized_input_files(filenames, filenames_1);
×
221
  append_normalized_input_files(filenames, filenames_2);
×
222
  std::sort(filenames.begin(), filenames.end());
×
223

224
  bool any_errors = false;
225
  for (size_t i = 1; i < filenames.size(); ++i) {
×
226
    const auto& it_0 = filenames.at(i - 1);
×
227
    const auto& it_1 = filenames.at(i);
×
228

229
    if (it_0.second == it_1.second) {
×
230
      log::error() << "Input file " << log_escape(it_0.second)
×
231
                   << " has been specified multiple times using --in-file1 "
232
                      "and/or --in-file2";
×
233
      any_errors = true;
×
234
    } else if (it_0.first == it_1.first) {
×
235
      log::error() << "The path of input file " << log_escape(it_0.second)
×
236
                   << " and the path of input " << "file "
×
237
                   << log_escape(it_1.second) << " both point to the file "
×
238
                   << log_escape(it_0.first);
×
239
      any_errors = true;
×
240
    }
241
  }
242

243
  for (const auto& it : filenames) {
×
244
    if (access(it.second.c_str(), R_OK)) {
×
245
      log::error() << "Cannot access input file " << log_escape(it.second)
×
246
                   << ": " << std::strerror(errno);
×
247
      any_errors = true;
×
248
    }
249
  }
250

251
  return !any_errors;
×
252
}
253

254
bool
255
check_output_files(const std::string& label,
×
256
                   const string_vec& filenames,
257
                   const output_files& output_files)
258
{
259

260
  if (!check_no_clobber(label, filenames, output_files.unidentified_1)) {
×
261
    return false;
262
  }
263

264
  if (!check_no_clobber(label, filenames, output_files.unidentified_2)) {
×
265
    return false;
266
  }
267

268
  for (const auto& sample : output_files.samples()) {
×
269
    for (size_t i = 0; i < sample.size(); ++i) {
×
270
      if (!check_no_clobber(label, filenames, sample.file(i))) {
×
271
        return false;
×
272
      }
273
    }
274
  }
275

276
  return true;
×
277
}
278

279
/**
280
 * Tries to parse a simple command-line argument while ignoring the validity
281
 * of the overall command-line. This is only intended to make pre-configured
282
 * logging output consistent with post-configured output if possible.
283
 */
284
std::string
285
try_parse_argument(const string_vec& args,
×
286
                   const std::string& key,
287
                   const std::string& fallback)
288
{
289
  auto it = std::find(args.begin(), args.end(), key);
×
290
  if (it != args.end() && (it + 1) != args.end()) {
×
291
    return *(it + 1);
×
292
  }
293

294
  return fallback;
×
295
}
296

297
/** Returns vector of keys for output files that have been set by the user. */
298
string_vec
299
user_supplied_keys(const argparse::parser& argparser, const string_vec& keys)
×
300
{
301
  string_vec result;
×
302
  for (const auto& key : keys) {
×
303
    if (argparser.is_set(key)) {
×
304
      result.push_back(key);
×
305
    }
306
  }
307

308
  return result;
×
309
}
×
310

311
////////////////////////////////////////////////////////////////////////////////
312

313
/**
 * Returns true if colors and similar terminal features may be used: STDERR
 * must be a terminal, the NO_COLOR environment variable must be unset or
 * empty, and TERM must not be "dumb".
 */
bool
fancy_output_allowed()
{
  if (!::isatty(STDERR_FILENO)) {
    return false;
  }

  // NO_COLOR is checked as suggested by https://no-color.org/
  const char* no_color = std::getenv("NO_COLOR");
  if (no_color && no_color[0] != '\0') {
    return false;
  }

  const char* term = std::getenv("TERM");
  const bool dumb_terminal = term && std::strcmp(term, "dumb") == 0;

  return !dumb_terminal;
}
327

328
void
329
configure_log_levels(const std::string& value, bool fallible = false)
×
330
{
331
  const auto log_level = to_lower(value);
×
332

333
  if (log_level == "debug") {
×
334
    log::set_level(log::level::debug);
×
335
  } else if (log_level == "info") {
×
336
    log::set_level(log::level::info);
×
337
  } else if (log_level == "warning") {
×
338
    log::set_level(log::level::warning);
×
339
  } else if (log_level == "error") {
×
340
    log::set_level(log::level::error);
×
341
  } else {
342
    AR_REQUIRE(fallible, "unhandled log_level value");
×
343
  }
344
}
345

346
void
347
configure_log_colors(const std::string& colors, bool fallible = false)
×
348
{
349
  if (colors == "always") {
×
350
    log::set_colors(true);
×
351
  } else if (colors == "never") {
×
352
    log::set_colors(false);
×
353
  } else if (colors == "auto") {
×
354
    log::set_colors(fancy_output_allowed());
×
355
  } else {
356
    AR_REQUIRE(fallible, "unhandled log_colors value");
×
357
  }
358
}
359

360
/**
 * Maps the name of a progress reporting mode to the corresponding
 * progress_type: "never", "spin", "log", or "auto". The "auto" mode selects
 * the spinner if `fancy_output_allowed` returns true and simple reports
 * otherwise. Any other value triggers AR_FAIL.
 */
progress_type
configure_log_progress(const std::string& progress)
{
  if (progress == "never") {
    return progress_type::none;
  }

  if (progress == "spin") {
    return progress_type::spinner;
  }

  if (progress == "log") {
    return progress_type::simple;
  }

  if (progress == "auto") {
    return fancy_output_allowed() ? progress_type::spinner
                                  : progress_type::simple;
  }

  AR_FAIL("unhandled log_progress value");
}
379

380
/**
 * Builds a fastq_encoding from the name of a quality score format ("33",
 * "64", "solexa", or "sam") and the given settings for degenerate bases and
 * uracils. Any other name triggers AR_FAIL.
 */
fastq_encoding
configure_encoding(const std::string& value,
                   degenerate_encoding degenerate,
                   uracil_encoding uracils)
{
  quality_encoding quality{};

  if (value == "33") {
    quality = quality_encoding::phred_33;
  } else if (value == "64") {
    quality = quality_encoding::phred_64;
  } else if (value == "solexa") {
    quality = quality_encoding::solexa;
  } else if (value == "sam") {
    quality = quality_encoding::sam;
  } else {
    AR_FAIL("unhandled qualitybase value");
  }

  return fastq_encoding{ quality, degenerate, uracils };
}
397

398
bool
399
parse_output_formats(const argparse::parser& argparser,
×
400
                     output_format& file_format,
401
                     output_format& stdout_format)
402
{
403
  if (argparser.is_set("--gzip")) {
×
404
    file_format = stdout_format = output_format::fastq_gzip;
×
405
    return true;
×
406
  }
407

408
  auto format_s = argparser.value("--out-format");
×
409
  if (!output_files::parse_format(format_s, file_format)) {
×
410
    log::error() << "Invalid output format " + log_escape(format_s);
×
411
    return false;
×
412
  }
413

414
  // Default to writing uncompressed output to STDOUT
415
  if (!argparser.is_set("--stdout-format")) {
×
416
    switch (file_format) {
×
417
      case output_format::fastq:
×
418
      case output_format::fastq_gzip:
×
419
        stdout_format = output_format::fastq;
×
420
        return true;
×
421
      case output_format::sam:
×
422
      case output_format::sam_gzip:
×
423
        stdout_format = output_format::sam;
×
424
        return true;
×
425
      case output_format::bam:
×
426
      case output_format::ubam:
×
427
        stdout_format = output_format::ubam;
×
428
        return true;
×
429
      default:
×
430
        AR_FAIL("invalid output format");
×
431
    }
432
  }
433

434
  format_s = argparser.value("--stdout-format");
×
435
  if (!output_files::parse_format(format_s, stdout_format)) {
×
436
    log::error() << "Invalid output format " + log_escape(format_s);
×
437
    return false;
×
438
  }
439

440
  return true;
441
}
442

443
} // namespace
444

445
////////////////////////////////////////////////////////////////////////////////
446
// Implementations for `userconfig`
447

448
// Definition of the static `start_time` member; the value is produced once,
// when this static is initialized, by calling `timestamp` with the format
// "%FT%T%z" (presumably strftime-style, i.e. date, time, and UTC offset --
// TODO confirm against the declaration of `timestamp`)
std::string userconfig::start_time = timestamp("%FT%T%z");
449

450
userconfig::userconfig()
×
NEW
451
  : samples(std::make_unique<sample_set>())
×
NEW
452
  , m_argparser(std::make_unique<argparse::parser>())
×
453
{
NEW
454
  auto& argparser = *m_argparser;
×
455
  argparser.set_name(NAME);
×
456
  argparser.set_version(VERSION);
×
457
  argparser.set_preamble(HELPTEXT);
×
458
  argparser.set_licenses(LICENSES);
×
459
  argparser.set_terminal_width(log::get_terminal_width());
×
460

461
  //////////////////////////////////////////////////////////////////////////////
462
  argparser.add("--threads", "N")
×
463
    .help("Maximum number of threads")
×
464
    .bind_u32(&max_threads)
×
465
    .with_default(2)
×
466
    .with_minimum(1);
×
467

468
  {
×
469
    std::vector<std::string> choices;
×
470
    for (const auto is : simd::supported()) {
×
471
      choices.emplace_back(simd::name(is));
×
472
    }
473

474
    AR_REQUIRE(!choices.empty());
×
475
    argparser.add("--simd", "NAME")
×
476
      .help("SIMD instruction set to use; defaults to the most advanced "
×
477
            "instruction set supported by this computer")
478
      .bind_str(nullptr)
×
479
      .with_choices(choices)
×
480
      .with_default(choices.back());
×
481
  }
482

483
  argparser.add("--benchmark")
×
484
    .help("Carry out benchmarking of AdapterRemoval sub-systems")
×
485
    .conflicts_with("--demultiplex-only")
×
486
    .conflicts_with("--report-only")
×
487
    .conflicts_with("--interleaved")
×
488
    .conflicts_with("--interleaved-input")
×
489
#if !defined(DEBUG)
490
    .hidden()
×
491
#endif
492
    .bind_vec(&benchmarks)
×
493
    .with_min_values(0);
×
494

495
  //////////////////////////////////////////////////////////////////////////////
496
  argparser.add_header("INPUT FILES:");
×
497

498
  argparser.add("--in-file1", "FILE")
×
499
    .help("One or more input files containing mate 1 reads [REQUIRED]")
×
500
    .deprecated_alias("--file1")
×
501
    .bind_vec(&input_files_1)
×
502
    .with_preprocessor(normalize_input_file);
×
503
  argparser.add("--in-file2", "FILE")
×
504
    .help("Input files containing mate 2 reads; if used, then the same number "
×
505
          "of files as --in-file1 must be listed [OPTIONAL]")
506
    .deprecated_alias("--file2")
×
507
    .bind_vec(&input_files_2)
×
508
    .with_preprocessor(normalize_input_file);
×
509
  argparser.add("--head", "N")
×
510
    .help("Process only the first N reads in single-end mode or the first N "
×
511
          "read-pairs in paired-end mode. Accepts suffixes K (thousands), M "
512
          "(millions), and G (billions) [default: all reads]")
513
    .bind_str(nullptr);
×
514

515
  //////////////////////////////////////////////////////////////////////////////
516
  argparser.add_header("OUTPUT FILES:");
×
517

518
  argparser.add("--out-prefix", "PREFIX")
×
519
    .help("Prefix for output files for which the corresponding --out option "
×
520
          "was not set [default: not set]")
521
    .deprecated_alias("--basename")
×
522
    .bind_str(&out_prefix)
×
523
    .with_default(DEV_NULL);
×
524

525
  argparser.add_separator();
×
526
  argparser.add("--out-file1", "FILE")
×
527
    .help("Output file containing trimmed mate 1 reads. Setting this value in "
×
528
          "in demultiplexing mode overrides --out-prefix for this file")
529
    .deprecated_alias("--output1")
×
530
    .bind_str(nullptr)
×
531
    .with_default("{prefix}[.sample].r1.fastq")
×
532
    .with_preprocessor(normalize_output_file);
×
533
  argparser.add("--out-file2", "FILE")
×
534
    .help("Output file containing trimmed mate 2 reads. Setting this value in "
×
535
          "in demultiplexing mode overrides --out-prefix for this file")
536
    .deprecated_alias("--output2")
×
537
    .bind_str(nullptr)
×
538
    .with_default("{prefix}[.sample].r2.fastq")
×
539
    .with_preprocessor(normalize_output_file);
×
540
  argparser.add("--out-merged", "FILE")
×
541
    .help("Output file that, if --merge is set, contains overlapping "
×
542
          "read-pairs that have been merged into a single read (PE mode only). "
543
          "Setting this value in demultiplexing mode overrides --out-prefix "
544
          "for this file")
545
    .deprecated_alias("--outputcollapsed")
×
546
    .bind_str(nullptr)
×
547
    .with_default("{prefix}[.sample].merged.fastq")
×
548
    .with_preprocessor(normalize_output_file);
×
549
  argparser.add("--out-singleton", "FILE")
×
550
    .help("Output file containing paired reads for which the mate "
×
551
          "has been discarded. This file is only created if filtering is "
552
          "enabled. Setting this value in demultiplexing mode overrides "
553
          "--out-prefix for this file")
554
    .deprecated_alias("--singleton")
×
555
    .bind_str(nullptr)
×
556
    .with_default("{prefix}[.sample].singleton.fastq")
×
557
    .with_preprocessor(normalize_output_file);
×
558

559
  argparser.add_separator();
×
560
  argparser.add("--out-unidentified1", "FILE")
×
561
    .help("In demultiplexing mode, contains mate 1 reads that could not be "
×
562
          "assigned to a single sample")
563
    .bind_str(nullptr)
×
564
    .with_default("{prefix}.unidentified.r1.fastq")
×
565
    .with_preprocessor(normalize_output_file);
×
566
  argparser.add("--out-unidentified2", "FILE")
×
567
    .help("In demultiplexing mode, contains mate 2 reads that could not be "
×
568
          "assigned to a single sample")
569
    .bind_str(nullptr)
×
570
    .with_default("{prefix}.unidentified.r2.fastq")
×
571
    .with_preprocessor(normalize_output_file);
×
572
  argparser.add("--out-discarded", "FILE")
×
573
    .help("Output file containing filtered reads. Setting this value in "
×
574
          "demultiplexing mode overrides --out-prefix for this file [default: "
575
          "not saved]")
576
    .deprecated_alias("--discarded")
×
577
    .bind_str(nullptr)
×
578
    .with_preprocessor(normalize_output_file);
×
579

580
  argparser.add_separator();
×
581
  argparser.add("--out-json", "FILE")
×
582
    .help("Output file containing statistics about input files, trimming, "
×
583
          "merging, and more in JSON format")
584
    .bind_str(nullptr)
×
585
    .with_default("{prefix}.json")
×
586
    .with_preprocessor(normalize_output_file);
×
587
  argparser.add("--out-html", "FILE")
×
588
    .help("Output file containing statistics about input files, trimming, "
×
589
          "merging, and more in HTML format")
590
    .bind_str(nullptr)
×
591
    .with_default("{prefix}.html")
×
592
    .with_preprocessor(normalize_output_file);
×
593

594
  //////////////////////////////////////////////////////////////////////////////
595
  argparser.add_header("FASTQ OPTIONS:");
×
596

597
  argparser.add("--quality-format", "N")
×
598
    .help("Format used to encode Phred scores in input")
×
599
    .deprecated_alias("--qualitybase")
×
600
    .bind_str(&quality_input_base)
×
601
    .with_choices({ "33", "64", "solexa", "sam" })
×
602
    .with_default("33");
×
603
  argparser.add("--mate-separator", "CHAR")
×
604
    .help("Character separating the mate number (1 or 2) from the read name in "
×
605
          "FASTQ records. Will be determined automatically if not specified")
606
    .bind_str(&mate_separator_str);
×
607

608
  argparser.add("--interleaved-input")
×
609
    .help("The (single) input file provided contains both the mate 1 and mate "
×
610
          "2 reads, one pair after the other, with one mate 1 reads followed "
611
          "by one mate 2 read. This option is implied by the --interleaved "
612
          "option")
613
    .conflicts_with("--in-file2")
×
614
    .bind_bool(&interleaved_input);
×
615
  argparser.add("--interleaved-output")
×
616
    .help("If set, trimmed paired-end reads are written to a single file "
×
617
          "containing mate 1 and mate 2 reads, one pair after the other. This "
618
          "option is implied by the --interleaved option")
619
    .conflicts_with("--out-file2")
×
620
    .bind_bool(&interleaved_output);
×
621
  argparser.add("--interleaved")
×
622
    .help("This option enables both the --interleaved-input option and the "
×
623
          "--interleaved-output option")
624
    .conflicts_with("--in-file2")
×
625
    .conflicts_with("--out-file2")
×
626
    .bind_bool(&interleaved);
×
627

628
  argparser.add("--mask-degenerate-bases")
×
629
    .help("Mask degenerate/ambiguous bases (B/D/H/K/M/N/R/S/V/W/Y) in the "
×
630
          "input by replacing them with an 'N'; if this option is not used, "
631
          "AdapterRemoval will abort upon encountering degenerate bases");
632
  argparser.add("--convert-uracils")
×
633
    .help("Convert uracils (U) to thymine (T) in input reads; if this option "
×
634
          "is not used, AdapterRemoval will abort upon encountering uracils");
635

636
  //////////////////////////////////////////////////////////////////////////////
637
  argparser.add_header("OUTPUT FORMAT:");
×
638

639
  argparser.add("--gzip")
×
640
    .hidden()
×
641
    .deprecated()
×
642
    .conflicts_with("--out-format")
×
643
    .conflicts_with("--stdout-format");
×
644
  argparser.add("--out-format", "X")
×
645
    .help("Selects the default output format; either 'fastq' for uncompressed "
×
646
          "FASTQ reads, 'fastq.gz' for gzip compressed FASTQ reads, 'sam' for "
647
          "uncompressed SAM records, 'sam.gz' for gzip compressed SAM records, "
648
          "'bam' for BGZF compressed BAM records, and 'ubam' for uncompressed "
649
          "BAM records. Setting an `--out-*` option overrides this option "
650
          "based on the filename used (except .ubam)")
651
    .bind_str(nullptr)
×
652
    .with_choices({ "fastq", "fastq.gz", "sam", "sam.gz", "bam", "ubam" })
×
653
    .with_default("fastq.gz");
×
654
  argparser.add("--stdout-format", "X")
×
655
    .help("Selects the output format for data written to STDOUT; choices are "
×
656
          "the same as for --out-format [default: the same format as "
657
          "--out-format, but uncompressed]")
658
    .bind_str(nullptr)
×
659
    .with_choices({ "fastq", "fastq.gz", "sam", "sam.gz", "bam", "ubam" });
×
660
  argparser.add("--read-group", "RG")
×
661
    .help("Add read-group to SAM/BAM output. Takes zero or more arguments in "
×
662
          "the form 'tag:value' where tag consists of two alphanumerical "
663
          "characters and where value is one or more characters. An argument "
664
          "may also contain multiple, tab-separated tag/value pairs. An ID tag "
665
          "is automatically generated if no ID tag is specified")
666
    .bind_vec(&read_group)
×
667
    .with_min_values(0);
×
668
  argparser.add("--compression-level", "N")
×
669
    .help(
×
670
      "Sets the compression level for compressed output. Valid values are 0 to "
671
      "13: Level 0 is uncompressed but includes gzip headers/checksums, level "
672
      "1 is streamed for SAM/FASTQ output (this may be required in rare cases "
673
      "for compatibility), and levels 2 to 13 are block compressed using the "
674
      "BGZF format")
675
    .deprecated_alias("--gzip-level")
×
676
    .bind_u32(&compression_level)
×
677
    .with_maximum(13)
×
678
    .with_default(5);
×
679

680
  //////////////////////////////////////////////////////////////////////////////
681
  argparser.add_header("PROCESSING:");
×
682

683
  argparser.add("--adapter1", "SEQ")
×
684
    .help("Adapter sequence expected to be found in mate 1 reads. Any 'N' in "
×
685
          "this sequence is treated as a wildcard")
686
    .bind_str(&adapter_1)
×
687
    .with_default("AGATCGGAAGAGCACACGTCTGAACTCCAGTCA");
×
688
  argparser.add("--adapter2", "SEQ")
×
689
    .help("Adapter sequence expected to be found in mate 2 reads. Any 'N' in "
×
690
          "this sequence is treated as a wildcard")
691
    .bind_str(&adapter_2)
×
692
    .with_default("AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT");
×
693
  argparser.add("--adapter-list", "FILE")
×
694
    .help("Read adapter pairs from the first two columns of a white-space "
×
695
          "separated table. AdapterRemoval will then select the best matching "
696
          "adapter pair for each pair of input reads when trimming. Only the "
697
          "first column is required for single-end trimming")
698
    .conflicts_with("--adapter1")
×
699
    .conflicts_with("--adapter2")
×
700
    .bind_str(&adapter_list);
×
701

702
  argparser.add_separator();
×
703
  argparser.add("--min-adapter-overlap", "N")
×
704
    .help("In single-end mode, reads are only trimmed if the overlap between "
×
705
          "read and the adapter is at least X bases long, not counting "
706
          "ambiguous nucleotides (Ns)")
707
    .deprecated_alias("--minadapteroverlap")
×
708
    .bind_u32(&min_adapter_overlap)
×
709
    .with_default(1)
×
710
    .with_minimum(1);
×
711
  argparser.add("--mismatch-rate", "X")
×
712
    .help("Max error-rate when aligning reads and/or adapters. If > 1, the max "
×
713
          "error-rate is set to 1 / X; if < 0, the defaults are used, "
714
          "otherwise the user-supplied value is used directly [default: 1/6 "
715
          "for trimming; 1/10 when identifying adapters]")
716
    .deprecated_alias("--mm")
×
717
    .bind_double(&mismatch_threshold)
×
718
    .with_default(-1.0);
×
719
  argparser.add("--shift", "N")
×
720
    .help("Consider alignments where up to N nucleotides are missing from the "
×
721
          "5' termini")
722
    .bind_u32(&shift)
×
723
    .with_default(2);
×
724

725
  argparser.add_separator();
×
726
  argparser.add("--merge")
×
727
    .help("When set, paired ended read alignments of --merge-threshold or "
×
728
          "more bases are merged into a single consensus sequence. Merged "
729
          "reads are written to prefix.merged by default. Has no effect "
730
          "in single-end mode")
731
    .deprecated_alias("--collapse");
×
732
  argparser.add("--merge-threshold", "N")
×
733
    .help("Paired reads must overlap at least this many bases to be considered "
×
734
          "overlapping for the purpose of read merging. Overlapping bases "
735
          "where one or both bases are ambiguous (N) are not counted")
736
    .deprecated_alias("--minalignmentlength")
×
737
    .bind_u32(&merge_threshold)
×
738
    .with_default(11);
×
739
  argparser.add("--merge-strategy", "X")
×
740
    .help(
×
741
      "The 'maximum' strategy uses Q=max(Q1,Q2) for matches while the "
742
      "'additive' strategy uses Q=Q1+Q2. Both strategies use Q=abs(Q1-Q2) for "
743
      "mismatches and picks the highest quality base, unless the qualities are "
744
      "the same in which case 'N' is used. Setting this option implies --merge")
745
    .bind_str(nullptr)
×
746
    .with_choices({ "maximum", "additive" })
×
747
    .with_default("maximum");
×
748
  argparser.add("--merge-quality-max", "N")
×
749
    .help("Sets the maximum Phred score for re-calculated quality scores when "
×
750
          "read merging is enabled with the 'additive' merging strategy. The "
751
          "value must be in the range 0 to 93, corresponding to Phred+33 "
752
          "encoded values of '!' to '~'")
753
    .deprecated_alias("--qualitymax")
×
754
    .bind_u32(&merge_quality_max)
×
755
    .with_maximum(PHRED_SCORE_MAX)
×
756
    .with_default(41);
×
757
  argparser.add("--collapse-deterministic")
×
758
    .conflicts_with("--collapse-conservatively")
×
759
    .conflicts_with("--merge-strategy")
×
760
    .deprecated();
×
761
  argparser.add("--collapse-conservatively")
×
762
    .conflicts_with("--collapse-deterministic")
×
763
    .conflicts_with("--merge-strategy")
×
764
    .deprecated();
×
765

766
  argparser.add_separator();
×
767
  argparser.add("--prefix-read1", "X")
×
768
    .help("Adds the specified prefix to read 1 names [default: no prefix]")
×
769
    .bind_str(&prefix_read_1);
×
770
  argparser.add("--prefix-read2", "X")
×
771
    .help("Adds the specified prefix to read 2 names [default: no prefix]")
×
772
    .bind_str(&prefix_read_2);
×
773
  argparser.add("--prefix-merged", "X")
×
774
    .help("Adds the specified prefix to merged read names [default: no prefix]")
×
775
    .bind_str(&prefix_merged);
×
776

777
  //////////////////////////////////////////////////////////////////////////////
778
  argparser.add_header("QUALITY TRIMMING:");
×
779

780
#ifdef PRE_TRIM_5P
781
  argparser.add("--pre-trim5p", "N")
782
    .help("Trim the 5' of reads by a fixed amount after demultiplexing (if "
783
          "enabled) but before trimming adapters and low quality bases. "
784
          "Specify one value to trim mate 1 and mate 2 reads the same amount, "
785
          "or two values separated by a space to trim each mate a different "
786
          "amount [default: no trimming]")
787
    .bind_vec(&pre_trim5p)
788
    .with_max_values(2);
789
#endif
790
  argparser.add("--pre-trim3p", "N")
×
791
    .help("Trim the 3' of reads by a fixed amount after demultiplexing (if "
×
792
          "enabled) but before trimming adapters and low quality bases. "
793
          "Specify one value to trim mate 1 and mate 2 reads the same amount, "
794
          "or two values separated by a space to trim each mate a different "
795
          "amount [default: no trimming]")
796
    .bind_vec(&pre_trim3p)
×
797
    .with_max_values(2);
×
798

799
  argparser.add("--post-trim5p", "N")
×
800
    .help("Trim the 5' by a fixed amount after removing adapters, but before "
×
801
          "carrying out quality based trimming [default: no trimming]")
802
    .deprecated_alias("--trim5p")
×
803
    .bind_vec(&post_trim5p)
×
804
    .with_max_values(2);
×
805
  argparser.add("--post-trim3p", "N")
×
806
    .deprecated_alias("--trim3p")
×
807
    .help("Trim the 3' by a fixed amount after removing adapters, but before "
×
808
          "carrying out quality based trimming [default: no trimming]")
809
    .bind_vec(&post_trim3p)
×
810
    .with_max_values(2);
×
811

812
  argparser.add_separator();
×
813
  argparser.add("--quality-trimming", "method")
×
814
    .help("Strategy for trimming low quality bases: 'mott' for the modified "
×
815
          "Mott's algorithm; 'window' for window based trimming; 'per-base' "
816
          "for a per-base trimming of low quality base; and 'none' for no "
817
          "trimming of low quality bases")
818
    .deprecated_alias("--trim-strategy") // name used during v3 alpha 1
×
819
    .bind_str(nullptr)
×
820
    .with_choices({ "mott", "window", "per-base", "none" })
×
821
    .with_default("mott");
×
822

823
  argparser.add("--trim-mott-quality", "N")
×
824
    .help("The inclusive threshold value used when performing quality based "
×
825
          "trimming using the modified Mott's algorithm. The value must be in "
826
          "the range 0 to 93, corresponding to Phred+33 encoded values of '!' "
827
          "to '~'")
828
    .deprecated_alias("--trim-mott-rate")
×
829
    .conflicts_with("--trim-windows")
×
830
    .conflicts_with("--trim-ns")
×
831
    .conflicts_with("--trim-qualities")
×
832
    .conflicts_with("--trim-min-quality")
×
833
    .bind_double(&trim_mott_rate)
×
834
    .with_minimum(0)
×
835
    .with_maximum(93)
×
836
    .with_default(13);
×
837
  argparser.add("--trim-windows", "X")
×
838
    .help("Specifies the size of the window used for '--quality-trimming "
×
839
          "window': If >= 1, this value will be used as the window size; if "
840
          "the value is < 1, window size is the read length times this value. "
841
          "If the resulting window size is 0 or larger than the read length, "
842
          "the read length is used as the window size")
843
    .deprecated_alias("--trimwindows")
×
844
    .conflicts_with("--trim-mott-quality")
×
845
    .conflicts_with("--trim-qualities")
×
846
    .bind_double(&trim_window_length)
×
847
    .with_minimum(0.0)
×
848
    .with_default(0.1);
×
849
  argparser.add("--trim-min-quality", "N")
×
850
    .help("Inclusive minimum quality used when trimming low-quality bases with "
×
851
          "--quality-trimming options 'window' and 'per-base'. The value must "
852
          "be in the range 0 to 93, corresponding to Phred+33 encoded values "
853
          "of '!' to '~'")
854
    .deprecated_alias("--minquality")
×
855
    .conflicts_with("--trim-mott-quality")
×
856
    .bind_u32(&trim_quality_score)
×
857
    .with_maximum(PHRED_SCORE_MAX)
×
858
    .with_default(2);
×
859
  argparser.add("--trim-ns")
×
860
    .help("If set, trim ambiguous bases (N) at 5'/3' termini when using the "
×
861
          "'window' or the 'per-base' trimming strategy")
862
    .conflicts_with("--trim-mott-quality")
×
863
    .deprecated_alias("--trimns")
×
864
    .bind_bool(&trim_ambiguous_bases);
×
865
  argparser.add("--trim-qualities")
×
866
    .help("If set, trim low-quality bases (< --trim-min-quality) when using "
×
867
          "the 'per-base' trimming strategy")
868
    .deprecated_alias("--trimqualities")
×
869
    .conflicts_with("--trim-mott-quality")
×
870
    .conflicts_with("--trim-windows")
×
871
    .bind_bool(&trim_low_quality_bases);
×
872

873
  argparser.add_separator();
×
874
  argparser.add("--pre-trim-polyx", "X")
×
875
    .help("Enable trimming of poly-X tails prior to read alignment and adapter "
×
876
          "trimming. Zero or more nucleotides (A, C, G, T) may be specified. "
877
          "Zero or more nucleotides may be specified after the option "
878
          "separated by spaces, with zero nucleotides corresponding to all of "
879
          "A, C, G, and T")
880
    .bind_vec(&pre_trim_poly_x_sink)
×
881
    .with_min_values(0);
×
882
  argparser.add("--post-trim-polyx", "X")
×
883
    .help("Enable trimming of poly-X tails after read alignment and adapter "
×
884
          "trimming/merging, but before trimming of low-quality bases. Merged "
885
          "reads are not trimmed by this option (both ends are 5'). Zero or "
886
          "more nucleotides (A, C, G, T) may be specified. Zero or more "
887
          "nucleotides may be specified after the option separated by spaces, "
888
          "with zero nucleotides corresponding to all of A, C, G, and T")
889
    .bind_vec(&post_trim_poly_x_sink)
×
890
    .with_min_values(0);
×
891
  argparser.add("--trim-polyx-threshold", "N")
×
892
    .help("The minimum number of bases in a poly-X tail")
×
893
    .bind_u32(&trim_poly_x_threshold)
×
894
    .with_default(10);
×
895

896
  argparser.add_separator();
×
897
  argparser.add("--preserve5p")
×
898
    .help("If set, bases at the 5p will not be trimmed by when performing "
×
899
          "quality based trimming of reads. Merged reads will not be quality "
900
          "trimmed when this option is enabled [default: 5p bases are trimmed]")
901
    .bind_bool(&preserve5p);
×
902

903
  //////////////////////////////////////////////////////////////////////////////
904
  argparser.add_header("FILTERING:");
×
905

906
  argparser.add("--max-ns", "N")
×
907
    .help("Reads containing more ambiguous bases (N) than this number after "
×
908
          "trimming are discarded [default: no maximum]")
909
    .deprecated_alias("--maxns")
×
910
    .bind_u32(&max_ambiguous_bases)
×
911
    .with_default(std::numeric_limits<uint32_t>::max());
×
912

913
  argparser.add("--min-length", "N")
×
914
    .help("Reads shorter than this length following trimming are discarded")
×
915
    .deprecated_alias("--minlength")
×
916
    .bind_u32(&min_genomic_length)
×
917
    .with_default(15);
×
918
  argparser.add("--max-length", "N")
×
919
    .help("Reads longer than this length following trimming are discarded "
×
920
          "[default: no maximum]")
921
    .deprecated_alias("--maxlength")
×
922
    .bind_u32(&max_genomic_length)
×
923
    .with_default(std::numeric_limits<uint32_t>::max());
×
924

925
  argparser.add("--min-mean-quality", "N")
×
926
    .help("Reads with a mean Phred quality score less than this value "
×
927
          "following trimming are discarded. The value must be in the range 0 "
928
          "to 93, corresponding to Phred+33 encoded values of '!' to '~' "
929
          "[default: no minimum]")
930
    .bind_double(&min_mean_quality)
×
931
    .with_minimum(0.0)
×
932
    .with_maximum(PHRED_SCORE_MAX)
×
933
    .with_default(0.0);
×
934

935
  argparser.add("--min-complexity", "X")
×
936
    .help(
×
937
      "Filter reads with a complexity score less than this value. Complexity "
938
      "is measured as the fraction of positions that differ from the previous "
939
      "position. A suggested value is 0.3 [default: no minimum]")
940
    .bind_double(&min_complexity)
×
941
    .with_minimum(0.0)
×
942
    .with_maximum(1.0)
×
943
    .with_default(0);
×
944

945
  //////////////////////////////////////////////////////////////////////////////
946
  argparser.add_header("DEMULTIPLEXING:");
×
947

948
  argparser.add("--barcode-list", "FILE")
×
949
    .help("List of barcodes or barcode pairs for single or double-indexed "
×
950
          "demultiplexing. Note that both indexes should be specified for "
951
          "both single-end and paired-end trimming, if double-indexed "
952
          "multiplexing was used, in order to ensure that the demultiplexed "
953
          "reads can be trimmed correctly")
954
    .bind_str(&barcode_list);
×
955
  argparser.add("--multiple-barcodes")
×
956
    .help("Allow for more than one barcode (pair) for each sample. If this "
×
957
          "option is not specified, AdapterRemoval will abort if multiple "
958
          "barcodes/barcode pairs identify the same sample");
959
  argparser.add("--barcode-orientation", "X")
×
960
    .help("Detect barcodes in both the barcode1-insert-barcode2 (forward) "
×
961
          "orientation and barcode2-insert-barcode1 (reverse) orientation. "
962
          "Takes an optional argument specifying the orientation of the "
963
          "barcodes in the `--barcode-list`, defaulting to `forward`")
964
    .deprecated_alias("--reversible-barcodes")
×
965
    .depends_on("--barcode-list")
×
966
    .bind_str(nullptr)
×
967
    .with_default("unspecified")
×
968
    .with_implicit_argument("forward")
×
969
    .with_choices({ "unspecified", "forward", "reverse", "explicit" });
×
970
  argparser.add("--normalize-orientation")
×
971
    .help("Reverse complement merged reads found to be in the reverse "
×
972
          "orientation, based on barcodes")
973
    .depends_on("--barcode-orientation")
×
974
    .depends_on("--merge")
×
975
    .bind_bool(&normalize_orientation);
×
976

977
  argparser.add_separator();
×
978
  argparser.add("--barcode-mm", "N")
×
979
    .help("Maximum number of mismatches allowed when counting mismatches in "
×
980
          "both the mate 1 and the mate 2 barcode for paired reads")
981
    .bind_u32(&barcode_mm)
×
982
    .with_default(0);
×
983
  argparser.add("--barcode-mm-r1", "N")
×
984
    .help("Maximum number of mismatches allowed for the mate 1 barcode. "
×
985
          "Cannot be higher than the --barcode-mm value [default: same value "
986
          "as --barcode-mm]")
987
    .bind_u32(&barcode_mm_r1)
×
988
    .with_default(0);
×
989
  argparser.add("--barcode-mm-r2", "N")
×
990
    .help("Maximum number of mismatches allowed for the mate 2 barcode. "
×
991
          "Cannot be higher than the --barcode-mm value [default: same value "
992
          "as --barcode-mm]")
993
    .bind_u32(&barcode_mm_r2)
×
994
    .with_default(0);
×
995
  argparser.add("--demultiplex-only")
×
996
    .help("Only carry out demultiplexing using the list of barcodes "
×
997
          "supplied with --barcode-list. No other processing is done")
998
    .depends_on("--barcode-list")
×
999
    .conflicts_with("--report-only");
×
1000

1001
  //////////////////////////////////////////////////////////////////////////////
1002
  argparser.add_header("REPORTS:");
×
1003

1004
  argparser.add("--report-only")
×
1005
    .help("Write a report of the input data without performing any processing "
×
1006
          "of the FASTQ reads. Adapter sequence inference is performed for PE "
1007
          "data based on overlapping mate reads. A report including read "
1008
          "processing, but without output, can be generated by setting "
1009
          "--output options to /dev/null")
1010
    .deprecated_alias("--identify-adapters")
×
1011
    .conflicts_with("--barcode-list")
×
1012
    .conflicts_with("--benchmark")
×
1013
    .conflicts_with("--demultiplex-only");
×
1014

1015
  argparser.add("--report-title", "X")
×
1016
    .help("Title used for HTML report")
×
1017
    .bind_str(&report_title)
×
1018
    .with_default(NAME + " " + VERSION);
×
1019
  argparser.add("--report-sample-rate", "X")
×
1020
    .help("Fraction of reads to use when generating base quality/composition "
×
1021
          "curves for trimming reports. Using all data (--report-sample-nth "
1022
          "1.0) results in an 10-30% decrease in throughput")
1023
    .bind_double(&report_sample_rate)
×
1024
    .with_minimum(0.0)
×
1025
    .with_maximum(1.0)
×
1026
    .with_default(0.1);
×
1027
  argparser.add("--report-duplication", "N")
×
1028
    .help("FastQC based duplicate detection, based on the frequency of the "
×
1029
          "first N unique sequences observed. If no value is given, an N of "
1030
          "100k is used, corresponding to FastQC defaults; a value of 0 "
1031
          "disables the analysis. Accepts suffixes K, M, and G")
1032
    .bind_str(nullptr)
×
1033
    .with_implicit_argument("100k");
×
1034

1035
  //////////////////////////////////////////////////////////////////////////////
1036
  argparser.add_header("LOGGING:");
×
1037

1038
  argparser.add("--log-level", "X")
×
1039
    .help("The minimum severity of messages to be written to STDERR")
×
1040
    .bind_str(&log_level)
×
1041
    .with_choices({ "debug", "info", "warning", "error" })
×
1042
    .with_default("info");
×
1043

1044
  argparser.add("--log-colors", "X")
×
1045
    .help("Enable/disable the use of colors when writing log messages. If set "
×
1046
          "to auto, colors will only be enabled if STDERR is a terminal and "
1047
          "the NO_COLORS is environmental variable is not set")
1048
    .bind_str(&log_color)
×
1049
    .with_choices({ "auto", "always", "never" })
×
1050
    .with_default("auto");
×
1051
  argparser.add("--log-progress", "X")
×
1052
    .help("Specify the type of progress reports used. If set to auto, then a "
×
1053
          "spinner will be used if STDERR is a terminal and the NO_COLORS "
1054
          "environmental variable is not set, otherwise logging will be used")
1055
    .bind_str(nullptr)
×
1056
    .with_choices({ "auto", "log", "spin", "never" })
×
1057
    .with_default("auto");
×
1058
}
1059

1060
// The destructor is defaulted here, in the implementation file, instead of
// being left implicit: this must be done out of line for unique ptrs.
// NOTE(review): presumably the class holds std::unique_ptr members (e.g.
// m_argparser) to types that are only forward-declared in the header, and
// whose definitions are visible at this point -- confirm against the header.
NEW
1061
userconfig::~userconfig() = default;
×
1062

1063
// Parses the command-line arguments in `argvec` and derives the remaining
// configuration from the parsed values.
//
// The steps are, in order: store the arguments; print the help text and fail
// if there is at most one element; pre-configure logging; run the argument
// parser; then post-process the bound values (logging, encoding, mate
// separator, run type, quality trimming, input files, merging, adapters,
// barcodes, read groups, mismatch threshold, SIMD, fixed trimming, output
// formats/prefix, poly-X trimming, --head, and --report-duplication).
//
// Returns the result of argparser.parse_args() if it is anything other than
// `ok`, `parse_result::error` if any of the subsequent checks fail (an error
// is logged in those cases), and `parse_result::ok` otherwise.
argparse::parse_result
1064
userconfig::parse_args(const string_vec& argvec)
×
1065
{
1066
  args = argvec;
×
NEW
1067
  auto& argparser = *m_argparser;
×
1068
  // With at most one element in `args` there are no options to parse
  if (args.size() <= 1) {
×
1069
    argparser.print_help();
×
1070
    return argparse::parse_result::error;
1071
  }
1072

1073
  // ad-hoc arg parsing to make argparse output consistent with rest of run
  // NOTE(review): the key looked up here is "--log-color", but the option is
  // registered as "--log-colors" earlier in this file; confirm that
  // try_parse_argument also matches the registered spelling.
1074
  configure_log_colors(try_parse_argument(args, "--log-color", "auto"), true);
×
1075
  configure_log_levels(try_parse_argument(args, "--log-level", "info"), true);
×
1076

1077
  const argparse::parse_result result = argparser.parse_args(args);
×
1078
  if (result != argparse::parse_result::ok) {
×
1079
    return result;
1080
  }
1081

1082
  // Apply the logging settings again, now using the values bound by the parser
  configure_log_colors(log_color);
×
1083
  configure_log_levels(log_level);
×
1084
  log_progress = configure_log_progress(argparser.value("--log-progress"));
×
1085

1086
  {
×
1087
    const auto degenerate = argparser.is_set("--mask-degenerate-bases")
×
1088
                              ? degenerate_encoding::mask
×
1089
                              : degenerate_encoding::reject;
×
1090
    const auto uracils = argparser.is_set("--convert-uracils")
×
1091
                           ? uracil_encoding::convert
×
1092
                           : uracil_encoding::reject;
×
1093

1094
    io_encoding = configure_encoding(quality_input_base, degenerate, uracils);
×
1095
  }
1096

1097
  // The separator is bound as a string, but is stored as a single character
  if (argparser.is_set("--mate-separator")) {
×
1098
    if (mate_separator_str.size() != 1) {
×
1099
      log::error() << "The argument for --mate-separator must be "
×
1100
                      "exactly one character long, not "
×
1101
                   << mate_separator_str.size() << " characters!";
×
1102
      return argparse::parse_result::error;
×
1103
    } else {
1104
      mate_separator = mate_separator_str.at(0);
×
1105
    }
1106
  }
1107

1108
  // run_type is left unchanged if none of these options are set
  if (argparser.is_set("--demultiplex-only")) {
×
1109
    run_type = ar_command::demultiplex_only;
×
1110
  } else if (argparser.is_set("--report-only")) {
×
1111
    run_type = ar_command::report_only;
×
1112
  } else if (argparser.is_set("--benchmark")) {
×
1113
    run_type = ar_command::benchmark;
×
1114
  }
1115

1116
  {
×
1117
    const auto strategy = argparser.value("--quality-trimming");
×
1118
    if (strategy == "mott") {
×
1119
      trim = trimming_strategy::mott;
×
1120
      // The bound value is replaced by the result of phred_to_p
      trim_mott_rate = fastq_encoding::phred_to_p(trim_mott_rate);
×
1121
    } else if (strategy == "window") {
×
1122
      trim = trimming_strategy::window;
×
1123
    } else if (strategy == "per-base") {
×
1124
      trim = trimming_strategy::per_base;
×
1125

1126
      // Per-base trimming requires at least one of --trim-qualities/--trim-ns
      if (!trim_low_quality_bases && !trim_ambiguous_bases) {
×
1127
        log::error() << "The per-base quality trimming strategy is enabled, "
×
1128
                     << "but neither trimming of low-quality bases (via "
×
1129
                     << "--trim-qualities) nor trimming of Ns (via --trim-ns) "
×
1130
                     << "is enabled.";
×
1131
        return argparse::parse_result::error;
×
1132
      }
1133
    } else if (strategy == "none") {
×
1134
      trim = trimming_strategy::none;
×
1135
    } else {
1136
      AR_FAIL(shell_escape(strategy));
×
1137
    }
1138
  }
1139

1140
  // Check for invalid combinations of settings
1141
  if (input_files_1.empty() && input_files_2.empty()) {
×
1142
    log::error()
×
1143
      << "No input files (--in-file1 / --in-file2) specified.\n"
×
1144
      << "Please specify at least one input file using --in-file1 FILENAME.";
×
1145

1146
    return argparse::parse_result::error;
×
1147
  } else if (!input_files_2.empty() &&
×
1148
             (input_files_1.size() != input_files_2.size())) {
×
1149
    log::error()
×
1150
      << "Different number of files specified for --in-file1 and --in-file2.";
×
1151

1152
    return argparse::parse_result::error;
×
1153
  } else if (!input_files_2.empty()) {
×
1154
    paired_ended_mode = true;
×
1155
  }
1156

1157
  // `interleaved` enables both interleaved input and interleaved output
  interleaved_input |= interleaved;
×
1158
  interleaved_output |= interleaved;
×
1159

1160
  if (interleaved_input) {
×
1161
    // Enable paired end mode .. other than the FASTQ reader, all other
1162
    // parts of the pipeline simply run in paired-end mode.
1163
    paired_ended_mode = true;
×
1164
  }
1165

1166
  if (paired_ended_mode) {
×
1167
    // The minimum adapter overlap is reset in paired-end mode
    min_adapter_overlap = 0;
×
1168

1169
    // merge related options implies --merge
1170
    if (argparser.is_set("--collapse-deterministic")) {
×
1171
      merge = merge_strategy::additive;
×
1172
    } else if (argparser.is_set("--collapse-conservatively")) {
×
1173
      merge = merge_strategy::maximum;
×
1174
    } else if (argparser.is_set("--merge") ||
×
1175
               argparser.is_set("--merge-strategy")) {
×
1176
      const auto strategy = argparser.value("--merge-strategy");
×
1177
      if (strategy == "maximum") {
×
1178
        merge = merge_strategy::maximum;
×
1179
      } else if (strategy == "additive") {
×
1180
        merge = merge_strategy::additive;
×
1181
      } else {
1182
        AR_FAIL(strategy);
×
1183
      }
1184
    }
1185
  }
1186

1187
  // (Optionally) read adapters from file and validate
1188
  if (!setup_adapter_sequences()) {
×
1189
    return argparse::parse_result::error;
1190
  }
1191

1192
  // (Optionally) read barcodes from file and validate
1193
  if (!setup_demultiplexing()) {
×
1194
    return argparse::parse_result::error;
1195
  }
1196

1197
  if (argparser.is_set("--read-group")) {
×
1198
    // The individual --read-group values are joined using tabs
    auto merged_tags = join_text(read_group, "\t");
×
1199

1200
    try {
×
NEW
1201
      samples->set_read_group(merged_tags);
×
1202
    } catch (const std::invalid_argument& error) {
×
1203
      log::error() << "Invalid argument --read-group "
×
1204
                   << log_escape(merged_tags) << ": " << error.what();
×
1205

1206
      return argparse::parse_result::error;
×
1207
    }
×
1208
  }
1209

1210
  // Set mismatch threshold
  // Values above 1 are replaced by their reciprocal; negative values are
  // replaced by a default that depends on the run type. Values in the range
  // 0 to 1 are used as is.
1211
  if (mismatch_threshold > 1) {
×
1212
    mismatch_threshold = 1.0 / mismatch_threshold;
×
1213
  } else if (mismatch_threshold < 0) {
×
1214
    if (run_type == ar_command::report_only) {
×
1215
      mismatch_threshold = 1.0 / 10.0;
×
1216
    } else {
1217
      // Defaults for PE / SE trimming (changed in v3)
1218
      mismatch_threshold = 1.0 / 6.0;
×
1219
    }
1220
  }
1221

1222
  {
×
1223
    // Map the --simd value to the supported instruction set with that name
    bool found = false;
×
1224
    const auto simd_choice = argparser.value("--simd");
×
1225
    for (const auto is : simd::supported()) {
×
1226
      if (simd_choice == simd::name(is)) {
×
1227
        simd = is;
×
1228
        found = true;
×
1229
        break;
×
1230
      }
1231
    }
1232

1233
    AR_REQUIRE(found);
×
1234
  }
1235

1236
  // Each entry holds (option name, raw values, destination pair)
  using fixed_trimming =
×
1237
    std::tuple<const char*, const string_vec&, std::pair<unsigned, unsigned>&>;
1238

1239
  const std::vector<fixed_trimming> fixed_trimming_options = {
×
1240
#ifdef PRE_TRIM_5P
1241
    { "--pre-trim5p", pre_trim5p, pre_trim_fixed_5p },
1242
#endif
1243
    { "--pre-trim3p", pre_trim3p, pre_trim_fixed_3p },
×
1244
    { "--post-trim5p", post_trim5p, post_trim_fixed_5p },
×
1245
    { "--post-trim3p", post_trim3p, post_trim_fixed_3p },
×
1246
  };
1247

1248
  // Destinations are only updated for options that were set
  for (const auto& it : fixed_trimming_options) {
×
1249
    try {
×
1250
      if (argparser.is_set(std::get<0>(it))) {
×
1251
        std::get<2>(it) = parse_trim_argument(std::get<1>(it));
×
1252
      }
1253
    } catch (const std::invalid_argument& error) {
×
1254
      log::error() << "Could not parse " << std::get<0>(it)
×
1255
                   << " argument(s): " << error.what();
×
1256

1257
      return argparse::parse_result::error;
×
1258
    }
×
1259
  }
1260

1261
  if (!parse_output_formats(argparser, out_file_format, out_stdout_format)) {
×
1262
    return argparse::parse_result::error;
1263
  }
1264

1265
  // An empty prefix or directory would results in the creation of dot-files
1266
  if (out_prefix.empty()) {
×
1267
    log::error() << "--out-prefix must be a non-empty value.";
×
1268

1269
    return argparse::parse_result::error;
×
1270
  } else if (out_prefix.back() == '/') {
×
1271
    log::error() << "--out-prefix must not be a directory: "
×
1272
                 << shell_escape(out_prefix);
×
1273

1274
    return argparse::parse_result::error;
×
1275
  } else if (out_prefix == DEV_NULL && run_type != ar_command::benchmark) {
×
1276
    // Relevant output options depend on input files and other settings
1277
    const std::vector<std::pair<std::string, bool>> output_keys = {
×
1278
      { "--out-file1",
1279
        is_adapter_trimming_enabled() || is_demultiplexing_enabled() },
×
1280
      { "--out-file2",
1281
        is_adapter_trimming_enabled() || is_demultiplexing_enabled() },
×
1282
      { "--out-singleton", is_any_filtering_enabled() },
×
1283
      { "--out-merged", is_read_merging_enabled() },
×
1284
      { "--out-discarded", is_any_filtering_enabled() },
×
1285
      { "--out-unidentified1", is_demultiplexing_enabled() },
×
1286
      { "--out-unidentified2", is_demultiplexing_enabled() },
×
1287
      { "--out-json", true },
1288
      { "--out-html", true },
1289
      { "--out-prefix", true },
1290
    };
1291

1292
    // Keep only the options flagged as relevant above
    string_vec required_keys;
×
1293
    for (const auto& it : output_keys) {
×
1294
      if (it.second) {
×
1295
        required_keys.push_back(it.first);
×
1296
      }
1297
    }
1298

1299
    const auto user_keys = user_supplied_keys(argparser, required_keys);
×
1300
    if (user_keys.empty()) {
×
1301
      auto error = log::error();
×
1302
      error << "No output would be generated; at least one of the options "
×
1303
            << join_text(required_keys, ", ", ", or ")
×
1304
            << " must be used. The --out-prefix option automatically enables "
1305
               "all relevant --out options.";
×
1306

1307
      return argparse::parse_result::error;
×
1308
    }
1309
  }
1310

1311
  {
×
1312
    const std::string key = "--pre-trim-polyx";
×
1313
    if (argparser.is_set(key) &&
×
1314
        !parse_poly_x_option(key, pre_trim_poly_x_sink, pre_trim_poly_x)) {
×
1315
      return argparse::parse_result::error;
×
1316
    }
1317
  }
1318

1319
  {
×
1320
    const std::string key = "--post-trim-polyx";
×
1321
    if (argparser.is_set(key) &&
×
1322
        !parse_poly_x_option(key, post_trim_poly_x_sink, post_trim_poly_x)) {
×
1323
      return argparse::parse_result::error;
×
1324
    }
1325
  }
1326

1327
  // A minimum length of zero is permitted, but triggers a warning
  if (!min_genomic_length) {
×
1328
    log::warn() << "--min-length is set to 0. This may produce FASTQ files "
×
1329
                   "that are incompatible with some tools!";
×
1330
  }
1331

1332
  // Default to all reads, but don't print value with --help
1333
  head = std::numeric_limits<uint64_t>::max();
×
1334
  if (!parse_counts(argparser, "--head", head)) {
×
1335
    return argparse::parse_result::error;
1336
  }
1337

1338
  if (!parse_counts(argparser, "--report-duplication", report_duplication)) {
×
1339
    return argparse::parse_result::error;
1340
  }
1341

1342
  return argparse::parse_result::ok;
1343
}
1344

1345
bool
1346
userconfig::is_good_alignment(const alignment_info& alignment) const
×
1347
{
1348
  if (!alignment.length || alignment.score() <= 0) {
×
1349
    return false;
×
1350
  }
1351

1352
  // Only pairs of called bases are considered part of the alignment
1353
  const size_t n_aligned = alignment.length - alignment.n_ambiguous;
×
1354
  if (n_aligned < min_adapter_overlap && !paired_ended_mode) {
×
1355
    return false;
1356
  }
1357

1358
  auto mm_threshold = static_cast<size_t>(mismatch_threshold * n_aligned);
×
1359
  if (n_aligned < 6) {
×
1360
    mm_threshold = 0;
×
1361
  } else if (n_aligned < 10) {
×
1362
    // Allow at most 1 mismatch, possibly set to 0 by the user
1363
    mm_threshold = std::min<size_t>(1, mm_threshold);
×
1364
  }
1365

1366
  return alignment.n_mismatches <= mm_threshold;
×
1367
}
1368

1369
bool
1370
userconfig::can_merge_alignment(const alignment_info& alignment) const
×
1371
{
1372
  if (alignment.length < alignment.n_ambiguous) {
×
1373
    throw std::invalid_argument("#ambiguous bases > read length");
×
1374
  }
1375

1376
  return alignment.length - alignment.n_ambiguous >= merge_threshold;
×
1377
}
1378

1379
// Determines the output format to use for `filename`: out_stdout_format is
// returned for DEV_STDOUT; for any other filename the result starts out as
// out_file_format and may be overridden by output_files::parse_extension.
output_format
userconfig::infer_output_format(const std::string& filename) const
{
  if (filename != DEV_STDOUT) {
    output_format format = out_file_format;
    // The return value is deliberately ignored: on failure to parse the
    // extension, the --out-format default assigned above is used
    output_files::parse_extension(filename, format);

    return format;
  }

  return out_stdout_format;
}
1392

1393
// Builds the set of output files for the current configuration: the names of
// the JSON and HTML reports, the files for unidentified reads (only when
// demultiplexing is enabled), and one sample_output_files entry per sample.
//
// With interleaved output, mate 2 reads share the file used for mate 1 reads
// and no ".r1"/".r2" component is included in the generated names.
output_files
userconfig::get_output_filenames() const
{
  output_files result;

  result.settings_json = new_output_file("--out-json", {}, {}, ".json").name;
  result.settings_html = new_output_file("--out-html", {}, {}, ".html").name;

  const auto extension = output_files::file_extension(out_file_format);

  // Per-mate filename components; empty when both mates share a file
  std::string_view mate_1_key = ".r1";
  std::string_view mate_2_key = ".r2";
  if (interleaved_output) {
    mate_1_key = "";
    mate_2_key = "";
  }

  if (is_demultiplexing_enabled()) {
    result.unidentified_1 = new_output_file("--out-unidentified1",
                                            {},
                                            { ".unidentified", mate_1_key },
                                            extension);

    if (paired_ended_mode && interleaved_output) {
      result.unidentified_2 = result.unidentified_1;
    } else if (paired_ended_mode) {
      result.unidentified_2 = new_output_file("--out-unidentified2",
                                              {},
                                              { ".unidentified", mate_2_key },
                                              extension);
    }
  }

  // These settings do not depend on the sample being processed
  const bool trimming = run_type == ar_command::trim_adapters;
  const bool write_discarded = trimming && is_any_filtering_enabled();
  const bool write_singleton = write_discarded && paired_ended_mode;
  const bool write_merged =
    trimming && paired_ended_mode && is_read_merging_enabled();

  for (const auto& current : *samples) {
    const auto& sample_name = current.name();
    sample_output_files sample_files;

    const auto mate_1_file =
      new_output_file("--out-file1", sample_name, { mate_1_key }, extension);
    sample_files.set_file(read_file::mate_1, mate_1_file);

    if (paired_ended_mode && interleaved_output) {
      sample_files.set_file(read_file::mate_2, mate_1_file);
    } else if (paired_ended_mode) {
      sample_files.set_file(
        read_file::mate_2,
        new_output_file("--out-file2", sample_name, { mate_2_key }, extension));
    }

    if (write_discarded) {
      sample_files.set_file(
        read_file::discarded,
        new_output_file(
          "--out-discarded", sample_name, { ".discarded" }, extension));
    }

    if (write_singleton) {
      sample_files.set_file(
        read_file::singleton,
        new_output_file(
          "--out-singleton", sample_name, { ".singleton" }, extension));
    }

    if (write_merged) {
      sample_files.set_file(
        read_file::merged,
        new_output_file("--out-merged", sample_name, { ".merged" }, extension));
    }

    result.add_sample(std::move(sample_files));
  }

  return result;
}
×
1466

1467
// Constructs the output file for the option `key`.
//
// If the user set `key`, that value is used as the base filename, and it is
// returned unmodified for global files (empty `sample`). Otherwise the base is
// DEV_NULL for --out-discarded when the default format is (gzipped) FASTQ,
// and out_prefix in all other cases. A base of DEV_NULL is returned as is,
// using the FASTQ format.
//
// In the remaining cases the final name is built by appending `sample` (if
// non-empty), the components in `keys`, and `ext`, each separated by a dot
// unless the component itself starts with one. For non-FASTQ default formats
// the last component in `keys` is dropped. `ext` must not be empty.
output_file
userconfig::new_output_file(const std::string& key,
                            std::string_view sample,
                            std::vector<std::string_view> keys,
                            std::string_view ext) const
{
  AR_REQUIRE(!ext.empty());
  const bool fastq_by_default = out_file_format == output_format::fastq ||
                                out_file_format == output_format::fastq_gzip;
  const bool user_supplied = m_argparser->is_set(key);

  std::string filename;
  if (user_supplied) {
    filename = m_argparser->value(key);
  } else if (fastq_by_default && key == "--out-discarded") {
    // Discarded reads are dropped by default for non-archival formats
    filename = DEV_NULL;
  } else {
    filename = out_prefix;
  }

  // global files, e.g. reports and unidentified reads
  if (user_supplied && sample.empty()) {
    return output_file{ filename, infer_output_format(filename) };
  }

  if (filename == DEV_NULL) {
    return output_file{ filename, output_format::fastq };
  }

  // SAM/BAM files are combined by default
  if (!fastq_by_default && !keys.empty()) {
    keys.pop_back();
  }

  if (!sample.empty()) {
    keys.insert(keys.begin(), sample);
  }

  keys.emplace_back(ext);

  for (const auto& component : keys) {
    const bool needs_separator =
      !component.empty() && component.front() != '.';
    if (needs_separator) {
      filename.push_back('.');
    }

    filename.append(component);
  }

  return output_file{ filename, infer_output_format(filename) };
}
1517

1518
bool
1519
check_and_set_barcode_mm(const argparse::parser& argparser,
×
1520
                         const std::string& key,
1521
                         uint32_t barcode_mm,
1522
                         uint32_t& dst)
1523
{
1524
  if (!argparser.is_set(key)) {
×
1525
    dst = barcode_mm;
×
1526
  } else if (dst > barcode_mm) {
×
1527
    log::error()
×
1528
      << "The maximum number of errors for " << key
×
1529
      << " is set \n"
1530
         "to a higher value than the total number of mismatches allowed\n"
1531
         "for barcodes (--barcode-mm). Please correct these settings.";
×
1532
    return false;
×
1533
  }
1534

1535
  return true;
1536
}
1537

1538
bool
1539
userconfig::is_adapter_trimming_enabled() const
×
1540
{
1541
  return run_type == ar_command::trim_adapters;
×
1542
}
1543

1544
bool
1545
userconfig::is_demultiplexing_enabled() const
×
1546
{
1547
  return !barcode_list.empty();
×
1548
}
1549

1550
bool
1551
userconfig::is_read_merging_enabled() const
×
1552
{
1553
  return is_adapter_trimming_enabled() && merge != merge_strategy::none;
×
1554
}
1555

1556
bool
1557
userconfig::is_any_quality_trimming_enabled() const
×
1558
{
1559
  return is_adapter_trimming_enabled() &&
×
1560
         (is_low_quality_trimming_enabled() ||
×
1561
          is_terminal_base_pre_trimming_enabled() ||
×
1562
          is_terminal_base_post_trimming_enabled() ||
×
1563
          is_poly_x_tail_pre_trimming_enabled() ||
×
1564
          is_poly_x_tail_post_trimming_enabled());
×
1565
}
1566

1567
bool
1568
userconfig::is_low_quality_trimming_enabled() const
×
1569
{
1570
  return trim != trimming_strategy::none;
×
1571
}
1572

1573
bool
1574
userconfig::is_terminal_base_pre_trimming_enabled() const
×
1575
{
1576
  return
×
1577
#ifdef PRE_TRIM_5P
1578
    pre_trim_fixed_5p.first || pre_trim_fixed_5p.second ||
1579
#endif
1580
    pre_trim_fixed_3p.first || pre_trim_fixed_3p.second;
×
1581
}
1582

1583
bool
1584
userconfig::is_terminal_base_post_trimming_enabled() const
×
1585
{
1586
  return post_trim_fixed_5p.first || post_trim_fixed_5p.second ||
×
1587
         post_trim_fixed_3p.first || post_trim_fixed_3p.second;
×
1588
}
1589

1590
bool
1591
userconfig::is_poly_x_tail_pre_trimming_enabled() const
×
1592
{
1593
  return !pre_trim_poly_x.empty();
×
1594
}
1595

1596
bool
1597
userconfig::is_poly_x_tail_post_trimming_enabled() const
×
1598
{
1599
  return !post_trim_poly_x.empty();
×
1600
}
1601

1602
bool
1603
userconfig::is_any_filtering_enabled() const
×
1604
{
1605
  return is_adapter_trimming_enabled() &&
×
1606
         (is_short_read_filtering_enabled() ||
×
1607
          is_long_read_filtering_enabled() ||
×
1608
          is_ambiguous_base_filtering_enabled() ||
×
1609
          is_mean_quality_filtering_enabled() ||
×
1610
          is_low_complexity_filtering_enabled());
×
1611
}
1612

1613
bool
1614
userconfig::is_short_read_filtering_enabled() const
×
1615
{
1616
  return min_genomic_length > 0;
×
1617
}
1618

1619
bool
1620
userconfig::is_long_read_filtering_enabled() const
×
1621
{
1622
  return max_genomic_length !=
×
1623
         std::numeric_limits<decltype(max_genomic_length)>::max();
×
1624
}
1625

1626
bool
1627
userconfig::is_ambiguous_base_filtering_enabled() const
×
1628
{
1629
  return max_ambiguous_bases !=
×
1630
         std::numeric_limits<decltype(max_ambiguous_bases)>::max();
×
1631
}
1632

1633
bool
1634
userconfig::is_mean_quality_filtering_enabled() const
×
1635
{
1636
  return min_mean_quality > 0;
×
1637
}
1638

1639
bool
1640
userconfig::is_low_complexity_filtering_enabled() const
×
1641
{
1642
  return min_complexity > 0;
×
1643
}
1644

1645
bool
1646
userconfig::setup_adapter_sequences()
×
1647
{
1648
  adapter_set adapters;
×
NEW
1649
  if (m_argparser->is_set("--adapter-list")) {
×
1650
    try {
×
1651
      adapters.load(adapter_list, paired_ended_mode);
×
1652
    } catch (const std::exception& error) {
×
1653
      log::error() << "Error reading adapters from " << log_escape(adapter_list)
×
1654
                   << ": " << error.what();
×
1655
      return false;
×
1656
    }
×
1657

1658
    log::info() << "Read " << adapters.size()
×
1659
                << " adapters / adapter pairs from '" << adapter_list << "'";
×
1660
  } else {
1661
    try {
×
1662
      adapters.add(dna_sequence{ adapter_1 }, dna_sequence{ adapter_2 });
×
1663
    } catch (const fastq_error& error) {
×
1664
      log::error() << "Error parsing adapter sequence(s):\n"
×
1665
                   << "   " << error.what();
×
1666

1667
      return false;
×
1668
    }
×
1669
  }
1670

NEW
1671
  samples->set_adapters(std::move(adapters));
×
1672

1673
  return true;
×
1674
}
1675

1676
bool
1677
userconfig::setup_demultiplexing()
×
1678
{
NEW
1679
  auto& argparser = *m_argparser;
×
1680
  if (!argparser.is_set("--barcode-mm")) {
×
1681
    barcode_mm = barcode_mm_r1 + barcode_mm_r2;
×
1682
  }
1683

1684
  if (!check_and_set_barcode_mm(argparser,
×
1685
                                "--barcode-mm-r1",
1686
                                barcode_mm,
1687
                                barcode_mm_r1)) {
×
1688
    return false;
1689
  }
1690

1691
  if (!check_and_set_barcode_mm(argparser,
×
1692
                                "--barcode-mm-r2",
1693
                                barcode_mm,
1694
                                barcode_mm_r2)) {
×
1695
    return false;
1696
  }
1697

1698
  if (argparser.is_set("--barcode-list")) {
×
1699
    const auto orientation =
×
1700
      parse_table_orientation(argparser.value("--barcode-orientation"));
×
1701

1702
    barcode_config config;
×
1703
    config.paired_end_mode(paired_ended_mode)
×
1704
      .allow_multiple_barcodes(argparser.is_set("--multiple-barcodes"))
×
1705
      .orientation(orientation);
×
1706

1707
    try {
×
NEW
1708
      samples->load(barcode_list, config);
×
1709
    } catch (const std::exception& error) {
×
1710
      log::error() << "Error reading barcodes from " << log_escape(barcode_list)
×
1711
                   << ": " << error.what();
×
1712
      return false;
×
1713
    }
×
1714

NEW
1715
    log::info() << "Read " << samples->size() << " sets of barcodes from "
×
1716
                << shell_escape(barcode_list);
×
1717
  }
1718

1719
  const auto& output_files = get_output_filenames();
×
1720

1721
  return check_input_files(input_files_1, input_files_2) &&
×
1722
         check_output_files("--in-file1", input_files_1, output_files) &&
×
1723
         check_output_files("--in-file2", input_files_2, output_files);
×
1724
}
1725

1726
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc