• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #80

06 Apr 2025 08:21PM UTC coverage: 27.836% (+0.1%) from 27.703%
#80

push

travis-ci

web-flow
improvements to utility functions (#104)

* move is_ascii_* helperfunction to strutils
* update join_text to support any containers
* use std::device_random to generate RNG seeds
* add function for getting an underlying enum value
* addition of / improvements to / tweaks to related tests

20 of 30 new or added lines in 9 files covered. (66.67%)

1 existing line in 1 file now uncovered.

2719 of 9768 relevant lines covered (27.84%)

4127.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

52.9
/src/sequence_sets.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2011 Stinus Lindgreen <stinus@binf.ku.dk>
3
// SPDX-FileCopyrightText: 2014 Mikkel Schubert <mikkelsch@gmail.com>
4
#include "sequence_sets.hpp" // declarations
5
#include "debug.hpp"         // for AR_REQUIRE
6
#include "errors.hpp"        // for fastq_error
7
#include "linereader.hpp"    // for line_reader
8
#include "sequence.hpp"      // for dna_sequence
9
#include "strutils.hpp"      // for string_vec, indent_lines
10
#include "table_reader.hpp"  // for table_reader
11
#include <algorithm>         // for max, sort, find
12
#include <sstream>           // for operator<<, basic_ostream, ostringstream
13
#include <stdexcept>         // for invalid_argument
14
#include <string_view>       // for string_view
15
#include <utility>           // for pair
16
#include <vector>            // for vector, vector<>::const_iterator
17

18
namespace adapterremoval {
19

20
namespace {
21

22
void
23
validate_sample_name(std::string_view name)
158✔
24
{
25
  if (name.empty()) {
158✔
26
    throw parsing_error("Sample names must not be empty");
×
27
  } else if (name == "unidentified") {
158✔
28
    throw parsing_error("The sample name 'unidentified' is a reserved "
×
29
                        "name, and cannot be used!");
×
30
  }
31

32
  for (const char c : name) {
1,394✔
33
    if (!is_ascii_letter_or_digit(c) && (c != '_')) {
1,078✔
UNCOV
34
      std::ostringstream error;
×
35
      error << "The sample name " << log_escape(name)
×
36
            << " is not a valid sample name; only letters ('a' to 'z' and 'A' "
37
               "to 'Z'), numbers (0 to 9) and underscores (_) are allowed.";
×
38

39
      throw parsing_error(error.str());
×
40
    }
×
41
  }
42
}
158✔
43

44
void
45
validate_barcode_sequence(const dna_sequence& sequence,
153✔
46
                          const size_t expected_length,
47
                          const int mate)
48
{
49
  std::string_view seq = sequence;
153✔
50

51
  if (seq.find('N') != std::string::npos) {
153✔
52
    std::ostringstream error;
×
53
    error << "Degenerate base (N) found in mate " << mate << " barcode "
×
54
          << "sequence " << log_escape(seq) << ". Degenerate bases are not "
×
55
          << "supported for demultiplexing; please remove before continuing!";
×
56

57
    throw parsing_error(error.str());
×
58
  }
×
59

60
  if (sequence.length() != expected_length) {
306✔
61
    std::ostringstream error;
6✔
62
    error << "Inconsistent mate " << mate << " barcode lengths found: Last "
6✔
63
          << "barcode was " << expected_length << " base-pairs long, but "
6✔
64
          << "barcode " << log_escape(seq) << " is " << seq.length() << " "
12✔
65
          << "base-pairs long. Variable length barcodes are not supported";
6✔
66

67
    throw parsing_error(error.str());
12✔
68
  }
6✔
69
}
147✔
70

71
bool
72
check_barcodes_sequences(const std::vector<sample>& samples,
39✔
73
                         const std::string& source,
74
                         bool paired_end = false)
75
{
76
  if (samples.empty()) {
39✔
77
    throw parsing_error("No samples/barcodes provided");
×
78
  }
79

80
  auto mate_1_len = static_cast<size_t>(-1);
39✔
81
  auto mate_2_len = static_cast<size_t>(-1);
39✔
82

83
  std::vector<std::pair<std::string_view, std::string_view>> sequences;
39✔
84
  for (const auto& sample : samples) {
190✔
85
    validate_sample_name(sample.name());
158✔
86

87
    for (const auto& it : sample) {
310✔
88
      if (mate_1_len == static_cast<size_t>(-1)) {
79✔
89
        mate_1_len = it.barcode_1.length();
39✔
90
        mate_2_len = it.barcode_2.length();
39✔
91

92
        if (!mate_1_len) {
39✔
93
          throw parsing_error("Empty barcode 1 sequence for " + sample.name());
×
94
        }
95
      }
96

97
      validate_barcode_sequence(it.barcode_1, mate_1_len, 1);
79✔
98
      validate_barcode_sequence(it.barcode_2, mate_2_len, 2);
74✔
99

100
      if (paired_end) {
73✔
101
        sequences.emplace_back(it.barcode_1, it.barcode_2);
73✔
102
      } else {
103
        sequences.emplace_back(it.barcode_1, dna_sequence{});
×
104
      }
105
    }
106
  }
107

108
  std::sort(sequences.begin(), sequences.end());
99✔
109
  for (size_t i = 1; i < sequences.size(); ++i) {
65✔
110
    if (sequences.at(i - 1) == sequences.at(i)) {
34✔
111
      const auto& it = sequences.at(i);
2✔
112

113
      if (paired_end) {
2✔
114
        std::ostringstream error;
2✔
115
        error << "Duplicate barcode pairs found in " << log_escape(source)
4✔
116
              << " with barcodes " << log_escape(it.first) << " and "
4✔
117
              << log_escape(it.second) << ". please verify correctness of "
4✔
118
              << "the barcode table and remove any duplicate entries!";
8✔
119

120
        throw parsing_error(error.str());
4✔
121
      } else {
2✔
122
        std::ostringstream error;
×
123
        error << "Duplicate mate 1 barcodes found in " << log_escape(source)
×
124
              << ": " << log_escape(it.first) << ". Even if these are "
×
125
              << "associated with different mate 2 barcodes, it is not "
126
                 "possible to distinguish between these in single-end mode!";
×
127

128
        throw parsing_error(error.str());
×
129
      }
×
130
    }
131
  }
132

133
  return true;
31✔
134
}
39✔
135

136
} // namespace
137

138
///////////////////////////////////////////////////////////////////////////////
139
// Implementations for `read_group`
140

141
read_group::read_group()
162✔
142
  : m_header{ "@RG\tID:1" }
324✔
143
  , m_id{ "1" }
324✔
144
{
145
}
162✔
146

147
read_group::read_group(std::string_view value)
18✔
148
  : m_header{ "@RG" }
50✔
149
{
150
  using invalid = std::invalid_argument;
18✔
151

152
  // It's not unreasonable to except users to try to specify a full @RG line
153
  if (starts_with(value, "@RG\t")) {
18✔
154
    value = value.substr(4);
5✔
155
  }
156

157
  if (!value.empty()) {
18✔
158
    for (const auto& field : split_text(value, '\t')) {
62✔
159
      for (const auto c : field) {
109✔
160
        if (c < ' ' || c > '~') {
92✔
161
          throw invalid("only characters in the range ' ' to '~' are allowed");
1✔
162
        }
163
      }
164

165
      if (field.size() < 4) {
34✔
166
        throw invalid("tags must be at least 4 characters long");
1✔
167
      } else if (!is_ascii_letter(field.at(0))) {
16✔
168
        throw invalid("first character in tag name must be a letter");
2✔
169
      } else if (!is_ascii_letter_or_digit(field.at(1))) {
14✔
170
        throw invalid(
1✔
171
          "second character in tag name must be a letter or number");
1✔
172
      } else if (field.at(2) != ':') {
13✔
173
        throw invalid("third character in tag must be a colon");
1✔
174
      } else if (starts_with(field, "ID:")) {
12✔
175
        if (!m_id.empty()) {
12✔
176
          throw invalid("multiple ID tags found");
1✔
177
        }
178

179
        m_id = field.substr(3);
10✔
180
      }
181
    }
17✔
182
  }
183

184
  // Generic read-group key if the user did not supply one
185
  if (m_id.empty()) {
22✔
186
    m_id = "1";
7✔
187
    m_header += "\tID:1";
7✔
188
  }
189

190
  if (!value.empty()) {
11✔
191
    m_header += "\t";
10✔
192
    m_header += value;
10✔
193
  }
194
}
32✔
195

196
void
197
read_group::set_id(std::string_view id)
4✔
198
{
199
  AR_REQUIRE(!id.empty());
4✔
200
  update_tag("ID", id);
4✔
201
  m_id = id;
4✔
202
}
4✔
203

204
void
205
read_group::update_tag(std::string_view key, std::string_view value)
55✔
206
{
207
  AR_REQUIRE(!key.empty());
55✔
208
  std::string cache;
55✔
209
  cache.push_back('\t');
55✔
210
  cache.append(key);
55✔
211
  cache.push_back(':');
55✔
212

213
  auto index = m_header.find(cache);
55✔
214
  if (index != std::string::npos) {
55✔
215
    if (value.empty()) {
12✔
216
      cache = m_header.substr(0, index);
8✔
217
    } else {
218
      cache = m_header.substr(0, index + cache.size());
24✔
219
      cache.append(value);
8✔
220
    }
221

222
    index = m_header.find('\t', index + 1);
12✔
223
    if (index != std::string::npos) {
12✔
224
      cache.append(m_header.substr(index));
2✔
225
    }
226

227
    m_header = cache;
24✔
228
  } else if (!value.empty()) {
43✔
229
    m_header.append(cache);
43✔
230
    m_header.append(value);
43✔
231
  }
232
}
55✔
233

234
///////////////////////////////////////////////////////////////////////////////
235
// Implementations for 'adapters' class
236

237
adapter_set::adapter_set(std::initializer_list<string_view_pair> args)
117✔
238
{
239
  for (const auto& pair : args) {
243✔
240
    add(std::string{ pair.first }, std::string{ pair.second });
378✔
241
  }
242
}
117✔
243

244
void
245
adapter_set::add(dna_sequence adapter1, dna_sequence adapter2)
126✔
246
{
247
  m_adapters.emplace_back(adapter1, adapter2.reverse_complement());
252✔
248
}
126✔
249

250
void
251
adapter_set::add(std::string adapter1, const std::string adapter2)
126✔
252
{
253
  add(dna_sequence{ adapter1 }, dna_sequence{ adapter2 });
630✔
254
}
126✔
255

256
adapter_set
257
adapter_set::add_barcodes(const dna_sequence& barcode1,
×
258
                          const dna_sequence& barcode2) const
259
{
260
  adapter_set adapters;
×
261
  for (const auto& it : m_adapters) {
×
262
    // Add sequences directly in alignment orientation
263
    adapters.m_adapters.emplace_back(barcode2.reverse_complement() + it.first,
×
264
                                     it.second + barcode1);
×
265
  }
266

267
  return adapters;
×
268
}
×
269

270
void
271
adapter_set::load(const std::string& filename, bool paired_end)
×
272
{
273
  line_reader reader(filename);
×
274
  const auto adapters = table_reader()
×
275
                          .with_comment_char('#')
×
276
                          .with_min_columns(1 + paired_end)
×
277
                          .with_max_columns(2)
×
278
                          .parse(reader);
×
279

280
  for (const auto& row : adapters) {
×
281
    dna_sequence adapter_1{ row.at(0) };
×
282
    dna_sequence adapter_2;
×
283
    if (row.size() > 1) {
×
284
      adapter_2 = dna_sequence{ row.at(1) };
×
285
    }
286

287
    // Convert from read to alignment orientation
288
    m_adapters.emplace_back(adapter_1, adapter_2.reverse_complement());
×
289
  }
290
}
291

292
sequence_pair_vec
293
adapter_set::to_read_orientation() const
×
294
{
295
  sequence_pair_vec adapters;
×
296
  for (const auto& it : m_adapters) {
×
297
    adapters.emplace_back(it.first, it.second.reverse_complement());
×
298
  }
299

300
  return adapters;
×
301
}
×
302

303
///////////////////////////////////////////////////////////////////////////////
304
// Implementations for 'sample' class
305

306
void
307
sample::add(dna_sequence barcode1, dna_sequence barcode2)
118✔
308
{
309
  m_barcodes.emplace_back(std::move(barcode1), std::move(barcode2));
118✔
310
}
118✔
311

312
void
313
sample::add(std::string barcode1, std::string barcode2)
×
314
{
315
  add(dna_sequence(barcode1), dna_sequence(barcode2));
×
316
}
317

318
void
319
sample::set_adapters(const adapter_set& adapters)
×
320
{
321
  for (auto& it : m_barcodes) {
×
322
    it.adapters = adapters.add_barcodes(it.barcode_1, it.barcode_2);
×
323
  }
324
}
325

326
void
327
sample::set_read_group(const read_group& info)
39✔
328
{
329
  for (auto it = m_barcodes.begin(); it != m_barcodes.end(); ++it) {
195✔
330
    it->info = info;
39✔
331
    it->has_read_group = true;
39✔
332

333
    if (!m_name.empty()) {
78✔
334
      it->info.set_sample(m_name);
×
335

336
      if (m_barcodes.size() > 1) {
×
337
        std::string id = m_name;
×
338
        id.push_back('.');
×
339
        id.append(std::to_string((it - m_barcodes.begin()) + 1));
×
340

341
        it->info.set_id(id);
×
342
      } else {
×
343
        it->info.set_id(m_name);
×
344
      }
345
    }
346

347
    if (it->barcode_1.length() || it->barcode_2.length()) {
156✔
348
      std::string barcodes;
×
349
      barcodes.append(it->barcode_1);
×
350
      if (it->barcode_2.length()) {
×
351
        barcodes.push_back('-');
×
352
        barcodes.append(it->barcode_2);
×
353
      }
354

355
      it->info.set_barcodes(barcodes);
×
356
    }
357
  }
358
}
39✔
359

360
///////////////////////////////////////////////////////////////////////////////
361
// Implementations for 'sample_set' class
362

363
sample_set::sample_set()
×
364
  : m_samples{ sample{} }
×
365
  , m_unidentified("", dna_sequence{}, dna_sequence{})
×
366
{
367
  set_unidentified_read_group(m_read_group);
×
368
}
369

370
sample_set::sample_set(std::initializer_list<sample> args)
39✔
371
  : m_samples(args)
86✔
372
  , m_unidentified("", dna_sequence{}, dna_sequence{})
258✔
373
{
374
  set_unidentified_read_group(m_read_group);
78✔
375

376
  std::sort(m_samples.begin(),
78✔
377
            m_samples.end(),
39✔
378
            [](const auto& a, const auto& b) { return a.name() < b.name(); });
240✔
379

380
  std::string_view name;
39✔
381
  for (const auto& sample : m_samples) {
196✔
382
    validate_sample_name(sample.name());
158✔
383
    if (sample.name() == name) {
158✔
384
      throw parsing_error("duplicate sample name: " + sample.name());
×
385
    }
386

387
    name = sample.name();
158✔
388
  }
389

390
  check_barcodes_sequences(m_samples, "initializer_list", true);
94✔
391
}
87✔
392

393
void
394
sample_set::set_adapters(adapter_set adapters)
×
395
{
396
  m_adapters = std::move(adapters);
×
397
  for (auto& sample : m_samples) {
×
398
    sample.set_adapters(m_adapters);
×
399
  }
400

401
  m_unidentified.set_adapters(m_adapters);
×
402
}
403

404
void
405
sample_set::set_read_group(std::string_view value)
×
406
{
407
  m_read_group = read_group(value);
×
408
  for (auto& sample : m_samples) {
×
409
    sample.set_read_group(m_read_group);
×
410
  }
411

412
  set_unidentified_read_group(m_read_group);
×
413
}
414

415
void
416
sample_set::load(const std::string& filename, const barcode_config& config)
×
417
{
418
  line_reader reader(filename);
×
419
  auto barcodes = table_reader()
×
420
                    .with_comment_char('#')
×
421
                    .with_min_columns(2 + !config.m_unidirectional_barcodes)
×
422
                    .with_max_columns(3)
×
423
                    .parse(reader);
×
424

425
  std::sort(barcodes.begin(), barcodes.end(), [](const auto& a, const auto& b) {
×
426
    return a.at(0) < b.at(0);
×
427
  });
428

429
  std::vector<sample> samples{};
×
430
  for (const auto& row : barcodes) {
×
431
    const auto& name = row.at(0);
×
432
    dna_sequence barcode_1{ row.at(1) };
×
433
    dna_sequence barcode_2;
×
434

435
    if (row.size() > 2) {
×
436
      barcode_2 = dna_sequence{ row.at(2) };
×
437
    }
438

439
    if (samples.empty() || samples.back().name() != name) {
×
440
      samples.emplace_back(name, barcode_1, barcode_2);
×
441
    } else if (config.m_allow_multiple_barcodes) {
×
442
      samples.back().add(barcode_1, barcode_2);
×
443
    } else {
444
      std::ostringstream error;
×
445
      error << "Duplicate sample name " << log_escape(name)
×
446
            << "; multiple barcodes per samples is not enabled. Either ensure "
447
               "that all sample names are unique or use --multiple-barcodes to "
448
               "map multiple barcodes to a single sample";
×
449

450
      throw parsing_error(error.str());
×
451
    }
×
452
  }
453

454
  // Check before adding reversed barcodes, to prevent misleading error messages
455
  check_barcodes_sequences(samples, filename, config.m_paired_end_mode);
×
456

457
  std::swap(m_samples, samples);
×
458
  if (!config.m_unidirectional_barcodes) {
×
459
    add_reversed_barcodes(config);
×
460
  }
461

462
  for (auto& s : m_samples) {
×
463
    s.set_read_group(m_read_group);
×
464
    s.set_adapters(m_adapters);
×
465
  }
466
}
467

468
void
469
sample_set::set_unidentified_read_group(read_group tmpl)
39✔
470
{
471
  // Unidentified reads lack a SM tag, so add a comment instead
472
  tmpl.set_comment("unidentified");
39✔
473
  m_unidentified.set_read_group(tmpl);
39✔
474
}
39✔
475

476
void
477
sample_set::add_reversed_barcodes(const barcode_config& config)
×
478
{
479
  for (auto& sample : m_samples) {
×
480
    // Original number of barcodes
481
    const size_t count = sample.size();
×
482

483
    for (size_t i = 0; i < count; ++i) {
×
484
      const auto& sequences = sample.at(i);
×
485
      auto barcode_1 = sequences.barcode_2.reverse_complement();
×
486
      auto barcode_2 = sequences.barcode_1.reverse_complement();
×
487
      AR_REQUIRE(barcode_1.length() && barcode_2.length());
×
488

489
      bool reverse_found = false;
×
490
      for (const auto& it : sample) {
×
491
        if (it.barcode_1 == barcode_1 && it.barcode_2 == barcode_2) {
×
492
          reverse_found = true;
493
          break;
494
        }
495
      }
496

497
      if (!reverse_found) {
×
498
        sample.add(std::move(barcode_1), std::move(barcode_2));
×
499
      }
500
    }
501
  }
502

503
  check_barcodes_sequences(m_samples,
×
504
                           "reversed barcodes",
505
                           config.m_paired_end_mode);
×
506
}
507

508
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc