• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #91

13 Apr 2025 11:50AM UTC coverage: 27.932% (+0.8%) from 27.089%
#91

push

travis-ci

web-flow
rework initializer_list constructor for sample_set (#118)

This to avoid having two ways to construct sets, one of which was only
used for testing purposes. By using the same code everywhere, test
coverage is naturally increased

12 of 15 new or added lines in 3 files covered. (80.0%)

3 existing lines in 1 file now uncovered.

2808 of 10053 relevant lines covered (27.93%)

4011.69 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

67.53
/src/sequence_sets.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2011 Stinus Lindgreen <stinus@binf.ku.dk>
3
// SPDX-FileCopyrightText: 2014 Mikkel Schubert <mikkelsch@gmail.com>
4
#include "sequence_sets.hpp" // declarations
5
#include "debug.hpp"         // for AR_REQUIRE
6
#include "errors.hpp"        // for fastq_error
7
#include "linereader.hpp"    // for line_reader
8
#include "sequence.hpp"      // for dna_sequence
9
#include "strutils.hpp"      // for string_vec, indent_lines
10
#include "table_reader.hpp"  // for table_reader
11
#include <algorithm>         // for max, sort, find
12
#include <sstream>           // for operator<<, basic_ostream, ostringstream
13
#include <stdexcept>         // for invalid_argument
14
#include <string_view>       // for string_view
15
#include <utility>           // for pair
16
#include <vector>            // for vector, vector<>::const_iterator
17

18
namespace adapterremoval {
19

20
namespace {
21

22
void
23
validate_sample_name(std::string_view name)
85✔
24
{
25
  if (name.empty()) {
85✔
26
    throw parsing_error("Sample names must not be empty");
×
27
  } else if (name == "unidentified") {
85✔
28
    throw parsing_error("The sample name 'unidentified' is a reserved "
×
29
                        "name, and cannot be used!");
×
30
  }
31

32
  for (const char c : name) {
751✔
33
    if (!is_ascii_letter_or_digit(c) && (c != '_')) {
581✔
34
      std::ostringstream error;
×
35
      error << "The sample name " << log_escape(name)
×
36
            << " is not a valid sample name; only letters ('a' to 'z' and 'A' "
37
               "to 'Z'), numbers (0 to 9) and underscores (_) are allowed.";
×
38

39
      throw parsing_error(error.str());
×
40
    }
×
41
  }
42
}
85✔
43

44
void
45
validate_barcode_sequence(const dna_sequence& sequence,
165✔
46
                          const size_t expected_length,
47
                          const int mate)
48
{
49
  std::string_view seq = sequence;
165✔
50

51
  if (seq.find('N') != std::string::npos) {
165✔
52
    std::ostringstream error;
×
53
    error << "Degenerate base (N) found in mate " << mate << " barcode "
×
54
          << "sequence " << log_escape(seq) << ". Degenerate bases are not "
×
55
          << "supported for demultiplexing; please remove before continuing!";
×
56

57
    throw parsing_error(error.str());
×
58
  }
×
59

60
  if (sequence.length() != expected_length) {
330✔
61
    std::ostringstream error;
6✔
62
    error << "Inconsistent mate " << mate << " barcode lengths found: Last "
6✔
63
          << "barcode was " << expected_length << " base-pairs long, but "
6✔
64
          << "barcode " << log_escape(seq) << " is " << seq.length() << " "
12✔
65
          << "base-pairs long. Variable length barcodes are not supported";
6✔
66

67
    throw parsing_error(error.str());
12✔
68
  }
6✔
69
}
159✔
70

71
bool
72
check_barcodes_sequences(const std::vector<sample>& samples,
42✔
73
                         const std::string& source,
74
                         bool paired_end = false)
75
{
76
  if (samples.empty()) {
42✔
77
    throw parsing_error("No samples/barcodes provided");
×
78
  }
79

80
  auto mate_1_len = static_cast<size_t>(-1);
42✔
81
  auto mate_2_len = static_cast<size_t>(-1);
42✔
82

83
  std::vector<std::pair<std::string_view, std::string_view>> sequences;
42✔
84
  for (const auto& sample : samples) {
205✔
85
    validate_sample_name(sample.name());
170✔
86

87
    for (const auto& it : sample) {
334✔
88
      if (mate_1_len == static_cast<size_t>(-1)) {
85✔
89
        mate_1_len = it.barcode_1.length();
42✔
90
        mate_2_len = it.barcode_2.length();
42✔
91

92
        if (!mate_1_len) {
42✔
93
          throw parsing_error("Empty barcode 1 sequence for " + sample.name());
×
94
        }
95
      }
96

97
      validate_barcode_sequence(it.barcode_1, mate_1_len, 1);
85✔
98
      validate_barcode_sequence(it.barcode_2, mate_2_len, 2);
80✔
99

100
      if (paired_end) {
79✔
101
        sequences.emplace_back(it.barcode_1, it.barcode_2);
21✔
102
      } else {
103
        sequences.emplace_back(it.barcode_1, dna_sequence{});
174✔
104
      }
105
    }
106
  }
107

108
  std::sort(sequences.begin(), sequences.end());
108✔
109
  for (size_t i = 1; i < sequences.size(); ++i) {
69✔
110
    if (sequences.at(i - 1) == sequences.at(i)) {
37✔
111
      const auto& it = sequences.at(i);
4✔
112

113
      if (paired_end) {
4✔
114
        std::ostringstream error;
2✔
115
        error << "Duplicate barcode pairs found in " << log_escape(source)
4✔
116
              << " with barcodes " << log_escape(it.first) << " and "
4✔
117
              << log_escape(it.second) << ". please verify correctness of "
4✔
118
              << "the barcode table and remove any duplicate entries!";
8✔
119

120
        throw parsing_error(error.str());
4✔
121
      } else {
2✔
122
        std::ostringstream error;
2✔
123
        error << "Duplicate mate 1 barcodes found in " << log_escape(source)
4✔
124
              << ": " << log_escape(it.first) << ". Even if these are "
4✔
125
              << "associated with different mate 2 barcodes, it is not "
126
                 "possible to distinguish between these in single-end mode!";
6✔
127

128
        throw parsing_error(error.str());
4✔
129
      }
2✔
130
    }
131
  }
132

133
  return true;
32✔
134
}
42✔
135

136
} // namespace
137

138
///////////////////////////////////////////////////////////////////////////////
139
// Implementations for `read_group`
140

141
read_group::read_group()
174✔
142
  : m_header{ "@RG\tID:1" }
348✔
143
  , m_id{ "1" }
348✔
144
{
145
}
174✔
146

147
read_group::read_group(std::string_view value)
18✔
148
  : m_header{ "@RG" }
50✔
149
{
150
  using invalid = std::invalid_argument;
18✔
151

152
  // It's not unreasonable to except users to try to specify a full @RG line
153
  if (starts_with(value, "@RG\t")) {
18✔
154
    value = value.substr(4);
5✔
155
  }
156

157
  if (!value.empty()) {
18✔
158
    for (const auto& field : split_text(value, '\t')) {
62✔
159
      for (const auto c : field) {
109✔
160
        if (c < ' ' || c > '~') {
92✔
161
          throw invalid("only characters in the range ' ' to '~' are allowed");
1✔
162
        }
163
      }
164

165
      if (field.size() < 4) {
34✔
166
        throw invalid("tags must be at least 4 characters long");
1✔
167
      } else if (!is_ascii_letter(field.at(0))) {
16✔
168
        throw invalid("first character in tag name must be a letter");
2✔
169
      } else if (!is_ascii_letter_or_digit(field.at(1))) {
14✔
170
        throw invalid(
1✔
171
          "second character in tag name must be a letter or number");
1✔
172
      } else if (field.at(2) != ':') {
13✔
173
        throw invalid("third character in tag must be a colon");
1✔
174
      } else if (starts_with(field, "ID:")) {
12✔
175
        if (!m_id.empty()) {
12✔
176
          throw invalid("multiple ID tags found");
1✔
177
        }
178

179
        m_id = field.substr(3);
10✔
180
      }
181
    }
17✔
182
  }
183

184
  // Generic read-group key if the user did not supply one
185
  if (m_id.empty()) {
22✔
186
    m_id = "1";
7✔
187
    m_header += "\tID:1";
7✔
188
  }
189

190
  if (!value.empty()) {
11✔
191
    m_header += "\t";
10✔
192
    m_header += value;
10✔
193
  }
194
}
32✔
195

196
void
197
read_group::set_id(std::string_view id)
69✔
198
{
199
  AR_REQUIRE(!id.empty());
69✔
200
  update_tag("ID", id);
69✔
201
  m_id = id;
69✔
202
}
69✔
203

204
void
205
read_group::update_tag(std::string_view key, std::string_view value)
211✔
206
{
207
  AR_REQUIRE(!key.empty());
211✔
208
  std::string cache;
211✔
209
  cache.push_back('\t');
211✔
210
  cache.append(key);
211✔
211
  cache.push_back(':');
211✔
212

213
  auto index = m_header.find(cache);
211✔
214
  if (index != std::string::npos) {
211✔
215
    if (value.empty()) {
77✔
216
      cache = m_header.substr(0, index);
8✔
217
    } else {
218
      cache = m_header.substr(0, index + cache.size());
219✔
219
      cache.append(value);
73✔
220
    }
221

222
    index = m_header.find('\t', index + 1);
77✔
223
    if (index != std::string::npos) {
77✔
224
      cache.append(m_header.substr(index));
132✔
225
    }
226

227
    m_header = cache;
154✔
228
  } else if (!value.empty()) {
134✔
229
    m_header.append(cache);
134✔
230
    m_header.append(value);
134✔
231
  }
232
}
211✔
233

234
///////////////////////////////////////////////////////////////////////////////
235
// Implementations for 'adapters' class
236

237
adapter_set::adapter_set(std::initializer_list<string_view_pair> args)
117✔
238
{
239
  for (const auto& pair : args) {
243✔
240
    add(std::string{ pair.first }, std::string{ pair.second });
378✔
241
  }
242
}
117✔
243

244
void
245
adapter_set::add(dna_sequence adapter1, dna_sequence adapter2)
126✔
246
{
247
  m_adapters.emplace_back(adapter1, adapter2.reverse_complement());
252✔
248
}
126✔
249

250
void
251
adapter_set::add(std::string adapter1, const std::string adapter2)
126✔
252
{
253
  add(dna_sequence{ adapter1 }, dna_sequence{ adapter2 });
630✔
254
}
126✔
255

256
adapter_set
257
adapter_set::add_barcodes(const dna_sequence& barcode1,
65✔
258
                          const dna_sequence& barcode2) const
259
{
260
  adapter_set adapters;
65✔
261
  for (const auto& it : m_adapters) {
195✔
262
    // Add sequences directly in alignment orientation
263
    adapters.m_adapters.emplace_back(barcode2.reverse_complement() + it.first,
×
264
                                     it.second + barcode1);
×
265
  }
266

267
  return adapters;
65✔
268
}
×
269

270
void
271
adapter_set::load(const std::string& filename, bool paired_end)
×
272
{
273
  line_reader reader(filename);
×
274
  const auto adapters = table_reader()
×
275
                          .with_comment_char('#')
×
276
                          .with_min_columns(1 + paired_end)
×
277
                          .with_max_columns(2)
×
278
                          .parse(reader);
×
279

280
  for (const auto& row : adapters) {
×
281
    dna_sequence adapter_1{ row.at(0) };
×
282
    dna_sequence adapter_2;
×
283
    if (row.size() > 1) {
×
284
      adapter_2 = dna_sequence{ row.at(1) };
×
285
    }
286

287
    // Convert from read to alignment orientation
288
    m_adapters.emplace_back(adapter_1, adapter_2.reverse_complement());
×
289
  }
290
}
291

292
sequence_pair_vec
293
adapter_set::to_read_orientation() const
×
294
{
295
  sequence_pair_vec adapters;
×
296
  for (const auto& it : m_adapters) {
×
297
    adapters.emplace_back(it.first, it.second.reverse_complement());
×
298
  }
299

300
  return adapters;
×
301
}
×
302

303
///////////////////////////////////////////////////////////////////////////////
304
// Implementations for 'sample' class
305

306
void
307
sample::add(dna_sequence barcode1, dna_sequence barcode2)
127✔
308
{
309
  m_barcodes.emplace_back(std::move(barcode1), std::move(barcode2));
127✔
310
}
127✔
311

312
void
313
sample::add(std::string barcode1, std::string barcode2)
×
314
{
315
  add(dna_sequence(barcode1), dna_sequence(barcode2));
×
316
}
317

318
void
319
sample::set_adapters(const adapter_set& adapters)
65✔
320
{
321
  for (auto& it : m_barcodes) {
260✔
322
    it.adapters = adapters.add_barcodes(it.barcode_1, it.barcode_2);
195✔
323
  }
324
}
65✔
325

326
void
327
sample::set_read_group(const read_group& info)
65✔
328
{
329
  for (auto it = m_barcodes.begin(); it != m_barcodes.end(); ++it) {
325✔
330
    it->info = info;
65✔
331
    it->has_read_group = true;
65✔
332

333
    if (!m_name.empty()) {
130✔
334
      it->info.set_sample(m_name);
65✔
335

336
      if (m_barcodes.size() > 1) {
65✔
337
        std::string id = m_name;
×
338
        id.push_back('.');
×
339
        id.append(std::to_string((it - m_barcodes.begin()) + 1));
×
340

341
        it->info.set_id(id);
×
342
      } else {
×
343
        it->info.set_id(m_name);
65✔
344
      }
345
    }
346

347
    if (it->barcode_1.length() || it->barcode_2.length()) {
130✔
348
      std::string barcodes;
65✔
349
      barcodes.append(it->barcode_1);
65✔
350
      if (it->barcode_2.length()) {
130✔
351
        barcodes.push_back('-');
38✔
352
        barcodes.append(it->barcode_2);
38✔
353
      }
354

355
      it->info.set_barcodes(barcodes);
65✔
356
    }
65✔
357
  }
358
}
65✔
359

360
///////////////////////////////////////////////////////////////////////////////
361
// Implementations for 'sample_set' class
362

363
sample_set::sample_set()
×
364
  : m_samples{ sample{} }
×
365
  , m_unidentified("", dna_sequence{}, dna_sequence{})
×
366
{
367
  set_unidentified_read_group(m_read_group);
×
368
}
369

370
sample_set::sample_set(std::initializer_list<std::string_view> lines,
42✔
371
                       barcode_config config)
124✔
372
{
373
  vec_reader reader(lines);
42✔
374
  load(reader, config, "initializer_list");
104✔
375
}
144✔
376

377
void
378
sample_set::set_adapters(adapter_set adapters)
×
379
{
380
  m_adapters = std::move(adapters);
×
381
  for (auto& sample : m_samples) {
×
382
    sample.set_adapters(m_adapters);
×
383
  }
384

385
  m_unidentified.set_adapters(m_adapters);
×
386
}
387

388
void
389
sample_set::set_read_group(std::string_view value)
×
390
{
391
  m_read_group = read_group(value);
×
392
  for (auto& sample : m_samples) {
×
393
    sample.set_read_group(m_read_group);
×
394
  }
395

396
  set_unidentified_read_group(m_read_group);
×
397
}
398

399
void
400
sample_set::load(const std::string& filename, const barcode_config& config)
×
401
{
402
  line_reader reader(filename);
×
NEW
403
  load(reader, config, filename);
×
404
}
405

406
void
407
sample_set::load(line_reader_base& reader,
42✔
408
                 const barcode_config& config,
409
                 const std::string& filename)
410
{
411
  auto barcodes = table_reader()
126✔
412
                    .with_comment_char('#')
84✔
413
                    .with_min_columns(2 + !config.m_unidirectional_barcodes)
42✔
414
                    .with_max_columns(3)
42✔
415
                    .parse(reader);
42✔
416

417
  std::sort(barcodes.begin(), barcodes.end(), [](const auto& a, const auto& b) {
126✔
418
    return a.at(0) < b.at(0);
258✔
419
  });
420

421
  std::vector<sample> samples{};
42✔
422
  for (const auto& row : barcodes) {
211✔
423
    const auto& name = row.at(0);
170✔
424
    dna_sequence barcode_1{ row.at(1) };
255✔
425
    dna_sequence barcode_2;
85✔
426

427
    if (row.size() > 2) {
170✔
428
      barcode_2 = dna_sequence{ row.at(2) };
250✔
429
    }
430

431
    if (samples.empty() || samples.back().name() != name) {
128✔
432
      samples.emplace_back(name, barcode_1, barcode_2);
85✔
433
    } else if (config.m_allow_multiple_barcodes) {
×
434
      samples.back().add(barcode_1, barcode_2);
×
435
    } else {
436
      std::ostringstream error;
×
437
      error << "Duplicate sample name " << log_escape(name)
×
438
            << "; multiple barcodes per samples is not enabled. Either ensure "
439
               "that all sample names are unique or use --multiple-barcodes to "
440
               "map multiple barcodes to a single sample";
×
441

442
      throw parsing_error(error.str());
×
443
    }
×
444
  }
170✔
445

446
  // Check before adding reversed barcodes, to prevent misleading error messages
447
  check_barcodes_sequences(samples, filename, config.m_paired_end_mode);
42✔
448

449
  std::swap(m_samples, samples);
32✔
450
  if (!config.m_unidirectional_barcodes) {
32✔
451
    add_reversed_barcodes(config);
×
452
  }
453

454
  for (auto& s : m_samples) {
161✔
455
    s.set_read_group(m_read_group);
65✔
456
    s.set_adapters(m_adapters);
65✔
457
  }
458
}
84✔
459

460
void
UNCOV
461
sample_set::set_unidentified_read_group(read_group tmpl)
×
462
{
463
  // Unidentified reads lack a SM tag, so add a description instead
UNCOV
464
  tmpl.set_description("unidentified");
×
UNCOV
465
  m_unidentified.set_read_group(tmpl);
×
466
}
467

468
void
469
sample_set::add_reversed_barcodes(const barcode_config& config)
×
470
{
471
  for (auto& sample : m_samples) {
×
472
    // Original number of barcodes
473
    const size_t count = sample.size();
×
474

475
    for (size_t i = 0; i < count; ++i) {
×
476
      const auto& sequences = sample.at(i);
×
477
      auto barcode_1 = sequences.barcode_2.reverse_complement();
×
478
      auto barcode_2 = sequences.barcode_1.reverse_complement();
×
479
      AR_REQUIRE(barcode_1.length() && barcode_2.length());
×
480

481
      bool reverse_found = false;
×
482
      for (const auto& it : sample) {
×
483
        if (it.barcode_1 == barcode_1 && it.barcode_2 == barcode_2) {
×
484
          reverse_found = true;
485
          break;
486
        }
487
      }
488

489
      if (!reverse_found) {
×
490
        sample.add(std::move(barcode_1), std::move(barcode_2));
×
491
      }
492
    }
493
  }
494

495
  check_barcodes_sequences(m_samples,
×
496
                           "reversed barcodes",
497
                           config.m_paired_end_mode);
×
498
}
499

500
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc