• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #73

22 Mar 2025 10:19PM UTC coverage: 27.088% (-0.002%) from 27.09%
#73

push

travis-ci

web-flow
updates to formating and licensing headers (#95)

* use SPDX headers for licenses

This reduces verbosity and works around an issue with clang-format where
some formatting would not be applied due to the \***\ headers.

* set AllowAllArgumentsOnNextLine and InsertBraces

This results in more consistent formatting using clang-format

18 of 61 new or added lines in 12 files covered. (29.51%)

343 existing lines in 3 files now uncovered.

2601 of 9602 relevant lines covered (27.09%)

4259.01 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

53.9
/src/sequence_sets.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2011 Stinus Lindgreen <stinus@binf.ku.dk>
3
// SPDX-FileCopyrightText: 2014 Mikkel Schubert <mikkelsch@gmail.com>
4
#include "sequence_sets.hpp" // declarations
5
#include "debug.hpp"         // for AR_REQUIRE
6
#include "errors.hpp"        // for fastq_error
7
#include "linereader.hpp"    // for line_reader
8
#include "sequence.hpp"      // for dna_sequence
9
#include "strutils.hpp"      // for string_vec, indent_lines
10
#include "table_reader.hpp"  // for table_reader
11
#include <algorithm>         // for max, sort, find
12
#include <sstream>           // for operator<<, basic_ostream, ostringstream
13
#include <stdexcept>         // for invalid_argument
14
#include <string_view>       // for string_view
15
#include <utility>           // for pair
16
#include <vector>            // for vector, vector<>::const_iterator
17

18
namespace adapterremoval {
19

20
namespace {
21

22
void
23
validate_sample_name(std::string_view name)
158✔
24
{
25
  if (name.empty()) {
158✔
26
    throw parsing_error("Sample names must not be empty");
×
27
  } else if (name == "unidentified") {
316✔
28
    throw parsing_error("The sample name 'unidentified' is a reserved "
×
29
                        "name, and cannot be used!");
×
30
  }
31

32
  for (const char c : name) {
1,552✔
33
    if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
1,078✔
34
          (c >= '0' && c <= '9') || (c == '_'))) {
130✔
35
      std::ostringstream error;
×
36
      error << "The sample name " << log_escape(name)
×
37
            << " is not a valid sample name; only letters ('a' to 'z' and 'A' "
38
               "to 'Z'), numbers (0 to 9) and underscores (_) are allowed.";
×
39

40
      throw parsing_error(error.str());
×
41
    }
×
42
  }
43
}
158✔
44

45
void
46
validate_barcode_sequence(const dna_sequence& sequence,
153✔
47
                          const size_t expected_length,
48
                          const int mate)
49
{
50
  std::string_view seq = sequence;
153✔
51

52
  if (seq.find('N') != std::string::npos) {
153✔
53
    std::ostringstream error;
×
54
    error << "Degenerate base (N) found in mate " << mate << " barcode "
×
55
          << "sequence " << log_escape(seq) << ". Degenerate bases are not "
×
56
          << "supported for demultiplexing; please remove before continuing!";
×
57

58
    throw parsing_error(error.str());
×
59
  }
×
60

61
  if (sequence.length() != expected_length) {
306✔
62
    std::ostringstream error;
6✔
63
    error << "Inconsistent mate " << mate << " barcode lengths found: Last "
6✔
64
          << "barcode was " << expected_length << " base-pairs long, but "
6✔
65
          << "barcode " << log_escape(seq) << " is " << seq.length() << " "
18✔
66
          << "base-pairs long. Variable length barcodes are not supported";
6✔
67

68
    throw parsing_error(error.str());
24✔
69
  }
6✔
70
}
147✔
71

72
bool
73
check_barcodes_sequences(const std::vector<sample>& samples,
39✔
74
                         const std::string& source,
75
                         bool paired_end = false)
76
{
77
  if (samples.empty()) {
78✔
78
    throw parsing_error("No samples/barcodes provided");
×
79
  }
80

81
  auto mate_1_len = static_cast<size_t>(-1);
39✔
82
  auto mate_2_len = static_cast<size_t>(-1);
39✔
83

84
  std::vector<std::pair<std::string_view, std::string_view>> sequences;
39✔
85
  for (const auto& sample : samples) {
454✔
86
    validate_sample_name(sample.name());
237✔
87

88
    for (const auto& it : sample) {
614✔
89
      if (mate_1_len == static_cast<size_t>(-1)) {
79✔
90
        mate_1_len = it.barcode_1.length();
39✔
91
        mate_2_len = it.barcode_2.length();
39✔
92

93
        if (!mate_1_len) {
39✔
94
          throw parsing_error("Empty barcode 1 sequence for " + sample.name());
×
95
        }
96
      }
97

98
      validate_barcode_sequence(it.barcode_1, mate_1_len, 1);
79✔
99
      validate_barcode_sequence(it.barcode_2, mate_2_len, 2);
74✔
100

101
      if (paired_end) {
73✔
102
        sequences.emplace_back(it.barcode_1, it.barcode_2);
73✔
103
      } else {
104
        sequences.emplace_back(it.barcode_1, dna_sequence{});
×
105
      }
106
    }
107
  }
108

109
  std::sort(sequences.begin(), sequences.end());
99✔
110
  for (size_t i = 1; i < sequences.size(); ++i) {
130✔
111
    if (sequences.at(i - 1) == sequences.at(i)) {
34✔
112
      const auto& it = sequences.at(i);
2✔
113

114
      if (paired_end) {
2✔
115
        std::ostringstream error;
2✔
116
        error << "Duplicate barcode pairs found in " << log_escape(source)
6✔
117
              << " with barcodes " << log_escape(it.first) << " and "
4✔
118
              << log_escape(it.second) << ". please verify correctness of "
4✔
119
              << "the barcode table and remove any duplicate entries!";
8✔
120

121
        throw parsing_error(error.str());
8✔
122
      } else {
2✔
123
        std::ostringstream error;
×
124
        error << "Duplicate mate 1 barcodes found in " << log_escape(source)
×
125
              << ": " << log_escape(it.first) << ". Even if these are "
×
126
              << "associated with different mate 2 barcodes, it is not "
127
                 "possible to distinguish between these in single-end mode!";
×
128

129
        throw parsing_error(error.str());
×
130
      }
×
131
    }
132
  }
133

134
  return true;
62✔
135
}
39✔
136

137
constexpr bool
138
is_ascii_alpha(const char c)
30✔
139
{
140
  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
30✔
141
}
142

143
constexpr bool
144
is_ascii_alphanum(const char c)
14✔
145
{
146
  return is_ascii_alpha(c) || (c >= '0' && c <= '9');
14✔
147
}
148

149
} // namespace
150

151
///////////////////////////////////////////////////////////////////////////////
152
// Implementations for `read_group`
153

154
read_group::read_group()
162✔
155
  : m_header{ "@RG\tID:1" }
324✔
156
  , m_id{ "1" }
324✔
157
{
158
}
162✔
159

160
read_group::read_group(std::string_view value)
18✔
161
  : m_header{ "@RG" }
50✔
162
{
163
  using invalid = std::invalid_argument;
18✔
164

165
  // It's not unreasonable to except users to try to specify a full @RG line
166
  if (starts_with(value, "@RG\t")) {
36✔
167
    value = value.substr(4);
5✔
168
  }
169

170
  if (!value.empty()) {
18✔
171
    for (const auto& field : split_text(value, '\t')) {
119✔
172
      for (const auto c : field) {
437✔
173
        if (c < ' ' || c > '~') {
92✔
174
          throw invalid("only characters in the range ' ' to '~' are allowed");
1✔
175
        }
176
      }
177

178
      if (field.size() < 4) {
34✔
179
        throw invalid("tags must be at least 4 characters long");
1✔
180
      } else if (!is_ascii_alpha(field.at(0))) {
16✔
181
        throw invalid("first character in tag name must be a letter");
2✔
182
      } else if (!is_ascii_alphanum(field.at(1))) {
14✔
183
        throw invalid(
1✔
184
          "second character in tag name must be a letter or number");
2✔
185
      } else if (field.at(2) != ':') {
13✔
186
        throw invalid("third character in tag must be a colon");
1✔
187
      } else if (starts_with(field, "ID:")) {
36✔
188
        if (!m_id.empty()) {
12✔
189
          throw invalid("multiple ID tags found");
1✔
190
        }
191

192
        m_id = field.substr(3);
10✔
193
      }
194
    }
17✔
195
  }
196

197
  // Generic read-group key if the user did not supply one
198
  if (m_id.empty()) {
22✔
199
    m_id = "1";
7✔
200
    m_header += "\tID:1";
7✔
201
  }
202

203
  if (!value.empty()) {
11✔
204
    m_header += "\t";
10✔
205
    m_header += value;
10✔
206
  }
207
}
32✔
208

209
void
210
read_group::set_id(std::string_view id)
4✔
211
{
212
  AR_REQUIRE(!id.empty());
4✔
213
  update_tag("ID", id);
8✔
214
  m_id = id;
4✔
215
}
4✔
216

217
void
218
read_group::update_tag(std::string_view key, std::string_view value)
55✔
219
{
220
  AR_REQUIRE(!key.empty());
55✔
221
  std::string cache;
55✔
222
  cache.push_back('\t');
55✔
223
  cache.append(key);
55✔
224
  cache.push_back(':');
55✔
225

226
  auto index = m_header.find(cache);
55✔
227
  if (index != std::string::npos) {
55✔
228
    if (value.empty()) {
12✔
229
      cache = m_header.substr(0, index);
8✔
230
    } else {
231
      cache = m_header.substr(0, index + cache.size());
24✔
232
      cache.append(value);
8✔
233
    }
234

235
    index = m_header.find('\t', index + 1);
12✔
236
    if (index != std::string::npos) {
12✔
237
      cache.append(m_header.substr(index));
3✔
238
    }
239

240
    m_header = cache;
67✔
241
  } else if (!value.empty()) {
43✔
242
    m_header.append(cache);
43✔
243
    m_header.append(value);
43✔
244
  }
245
}
55✔
246

247
///////////////////////////////////////////////////////////////////////////////
248
// Implementations for 'adapters' class
249

250
adapter_set::adapter_set(std::initializer_list<string_view_pair> args)
117✔
251
{
252
  for (const auto& pair : args) {
360✔
253
    add(std::string{ pair.first }, std::string{ pair.second });
378✔
254
  }
255
}
117✔
256

257
void
258
adapter_set::add(dna_sequence adapter1, dna_sequence adapter2)
126✔
259
{
260
  m_adapters.emplace_back(adapter1, adapter2.reverse_complement());
252✔
261
}
126✔
262

263
void
264
adapter_set::add(std::string adapter1, const std::string adapter2)
126✔
265
{
266
  add(dna_sequence{ adapter1 }, dna_sequence{ adapter2 });
630✔
267
}
126✔
268

269
adapter_set
270
adapter_set::add_barcodes(const dna_sequence& barcode1,
×
271
                          const dna_sequence& barcode2) const
272
{
273
  adapter_set adapters;
×
274
  for (const auto& it : m_adapters) {
×
275
    // Add sequences directly in alignment orientation
276
    adapters.m_adapters.emplace_back(barcode2.reverse_complement() + it.first,
×
277
                                     it.second + barcode1);
×
278
  }
279

280
  return adapters;
×
281
}
×
282

283
void
284
adapter_set::load(const std::string& filename, bool paired_end)
×
285
{
286
  line_reader reader(filename);
×
287
  const auto adapters = table_reader()
×
288
                          .with_comment_char('#')
×
289
                          .with_min_columns(1 + paired_end)
×
290
                          .with_max_columns(2)
×
291
                          .parse(reader);
×
292

293
  for (const auto& row : adapters) {
×
294
    dna_sequence adapter_1{ row.at(0) };
×
295
    dna_sequence adapter_2;
×
296
    if (row.size() > 1) {
×
297
      adapter_2 = dna_sequence{ row.at(1) };
×
298
    }
299

300
    // Convert from read to alignment orientation
301
    m_adapters.emplace_back(adapter_1, adapter_2.reverse_complement());
×
302
  }
303
}
304

305
sequence_pair_vec
306
adapter_set::to_read_orientation() const
×
307
{
308
  sequence_pair_vec adapters;
×
309
  for (const auto& it : m_adapters) {
×
310
    adapters.emplace_back(it.first, it.second.reverse_complement());
×
311
  }
312

313
  return adapters;
×
314
}
×
315

316
///////////////////////////////////////////////////////////////////////////////
317
// Implementations for 'sample' class
318

319
void
320
sample::add(dna_sequence barcode1, dna_sequence barcode2)
118✔
321
{
322
  m_barcodes.emplace_back(std::move(barcode1), std::move(barcode2));
118✔
323
}
118✔
324

325
void
326
sample::add(std::string barcode1, std::string barcode2)
×
327
{
328
  add(dna_sequence(barcode1), dna_sequence(barcode2));
×
329
}
330

331
void
332
sample::set_adapters(const adapter_set& adapters)
×
333
{
334
  for (auto& it : m_barcodes) {
×
335
    it.adapters = adapters.add_barcodes(it.barcode_1, it.barcode_2);
×
336
  }
337
}
338

339
void
340
sample::set_read_group(const read_group& info)
39✔
341
{
342
  for (auto it = m_barcodes.begin(); it != m_barcodes.end(); ++it) {
312✔
343
    it->info = info;
39✔
344
    it->has_read_group = true;
39✔
345

346
    if (!m_name.empty()) {
78✔
347
      it->info.set_sample(m_name);
×
348

349
      if (m_barcodes.size() > 1) {
×
350
        std::string id = m_name;
×
351
        id.push_back('.');
×
352
        id.append(std::to_string((it - m_barcodes.begin()) + 1));
×
353

354
        it->info.set_id(id);
×
355
      } else {
×
356
        it->info.set_id(m_name);
×
357
      }
358
    }
359

360
    if (it->barcode_1.length() || it->barcode_2.length()) {
156✔
361
      std::string barcodes;
×
362
      barcodes.append(it->barcode_1);
×
363
      if (it->barcode_2.length()) {
×
364
        barcodes.push_back('-');
×
365
        barcodes.append(it->barcode_2);
×
366
      }
367

368
      it->info.set_barcodes(barcodes);
×
369
    }
370
  }
371
}
39✔
372

373
///////////////////////////////////////////////////////////////////////////////
374
// Implementations for 'sample_set' class
375

376
sample_set::sample_set()
×
377
  : m_samples{ sample{} }
×
378
  , m_unidentified("", dna_sequence{}, dna_sequence{})
×
379
{
380
  set_unidentified_read_group(m_read_group);
×
381
}
382

383
sample_set::sample_set(std::initializer_list<sample> args)
39✔
384
  : m_samples(args)
86✔
385
  , m_unidentified("", dna_sequence{}, dna_sequence{})
258✔
386
{
387
  set_unidentified_read_group(m_read_group);
78✔
388

389
  std::sort(m_samples.begin(),
78✔
390
            m_samples.end(),
39✔
391
            [](const auto& a, const auto& b) { return a.name() < b.name(); });
320✔
392

393
  std::string_view name;
39✔
394
  for (const auto& sample : m_samples) {
472✔
395
    validate_sample_name(sample.name());
237✔
396
    if (sample.name() == name) {
237✔
397
      throw parsing_error("duplicate sample name: " + sample.name());
×
398
    }
399

400
    name = sample.name();
237✔
401
  }
402

403
  check_barcodes_sequences(m_samples, "initializer_list", true);
86✔
404
}
87✔
405

406
void
407
sample_set::set_adapters(adapter_set adapters)
×
408
{
409
  m_adapters = std::move(adapters);
×
410
  for (auto& sample : m_samples) {
×
411
    sample.set_adapters(m_adapters);
×
412
  }
413

414
  m_unidentified.set_adapters(m_adapters);
×
415
}
416

417
void
418
sample_set::set_read_group(std::string_view value)
×
419
{
420
  m_read_group = read_group(value);
×
421
  for (auto& sample : m_samples) {
×
422
    sample.set_read_group(m_read_group);
×
423
  }
424

425
  set_unidentified_read_group(m_read_group);
×
426
}
427

428
void
429
sample_set::load(const std::string& filename, const barcode_config& config)
×
430
{
431
  line_reader reader(filename);
×
432
  auto barcodes = table_reader()
×
433
                    .with_comment_char('#')
×
434
                    .with_min_columns(2 + !config.m_unidirectional_barcodes)
×
435
                    .with_max_columns(3)
×
436
                    .parse(reader);
×
437

438
  std::sort(barcodes.begin(), barcodes.end(), [](const auto& a, const auto& b) {
×
439
    return a.at(0) < b.at(0);
×
440
  });
441

442
  std::vector<sample> samples{};
×
443
  for (const auto& row : barcodes) {
×
444
    const auto& name = row.at(0);
×
445
    dna_sequence barcode_1{ row.at(1) };
×
446
    dna_sequence barcode_2;
×
447

448
    if (row.size() > 2) {
×
449
      barcode_2 = dna_sequence{ row.at(2) };
×
450
    }
451

452
    if (samples.empty() || samples.back().name() != name) {
×
453
      samples.emplace_back(name, barcode_1, barcode_2);
×
454
    } else if (config.m_allow_multiple_barcodes) {
×
455
      samples.back().add(barcode_1, barcode_2);
×
456
    } else {
457
      std::ostringstream error;
×
458
      error << "Duplicate sample name " << log_escape(name)
×
459
            << "; multiple barcodes per samples is not enabled. Either ensure "
460
               "that all sample names are unique or use --multiple-barcodes to "
461
               "map multiple barcodes to a single sample";
×
462

463
      throw parsing_error(error.str());
×
464
    }
×
465
  }
466

467
  // Check before adding reversed barcodes, to prevent misleading error messages
468
  check_barcodes_sequences(samples, filename, config.m_paired_end_mode);
×
469

470
  std::swap(m_samples, samples);
×
471
  if (!config.m_unidirectional_barcodes) {
×
472
    add_reversed_barcodes(config);
×
473
  }
474

475
  for (auto& s : m_samples) {
×
476
    s.set_read_group(m_read_group);
×
477
    s.set_adapters(m_adapters);
×
478
  }
479
}
480

481
void
482
sample_set::set_unidentified_read_group(read_group tmpl)
39✔
483
{
484
  // Unidentified reads lack a SM tag, so add a comment instead
485
  tmpl.set_comment("unidentified");
78✔
486
  m_unidentified.set_read_group(tmpl);
39✔
487
}
39✔
488

489
void
490
sample_set::add_reversed_barcodes(const barcode_config& config)
×
491
{
492
  for (auto& sample : m_samples) {
×
493
    // Original number of barcodes
494
    const size_t count = sample.size();
×
495

496
    for (size_t i = 0; i < count; ++i) {
×
497
      const auto& sequences = sample.at(i);
×
498
      auto barcode_1 = sequences.barcode_2.reverse_complement();
×
499
      auto barcode_2 = sequences.barcode_1.reverse_complement();
×
500
      AR_REQUIRE(barcode_1.length() && barcode_2.length());
×
501

502
      bool reverse_found = false;
×
503
      for (const auto& it : sample) {
×
504
        if (it.barcode_1 == barcode_1 && it.barcode_2 == barcode_2) {
×
505
          reverse_found = true;
506
          break;
507
        }
508
      }
509

510
      if (!reverse_found) {
×
511
        sample.add(std::move(barcode_1), std::move(barcode_2));
×
512
      }
513
    }
514
  }
515

NEW
516
  check_barcodes_sequences(m_samples,
×
517
                           "reversed barcodes",
NEW
518
                           config.m_paired_end_mode);
×
519
}
520

521
} // namespace adapterremoval
49✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc