• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #64

16 Mar 2025 04:40PM UTC coverage: 27.111% (-0.04%) from 27.151%
#64

push

travis-ci

MikkelSchubert
add basic regression tests for SAM output

2597 of 9579 relevant lines covered (27.11%)

4267.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

53.9
/src/sequence_sets.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "sequence_sets.hpp" // declarations
21
#include "debug.hpp"         // for AR_REQUIRE
22
#include "errors.hpp"        // for fastq_error
23
#include "linereader.hpp"    // for line_reader
24
#include "sequence.hpp"      // for dna_sequence
25
#include "strutils.hpp"      // for string_vec, indent_lines
26
#include "table_reader.hpp"  // for table_reader
27
#include <algorithm>         // for max, sort, find
28
#include <sstream>           // for operator<<, basic_ostream, ostringstream
29
#include <stdexcept>         // for invalid_argument
30
#include <string_view>       // for string_view
31
#include <utility>           // for pair
32
#include <vector>            // for vector, vector<>::const_iterator
33

34
namespace adapterremoval {
35

36
namespace {
37

38
void
39
validate_sample_name(std::string_view name)
158✔
40
{
41
  if (name.empty()) {
158✔
42
    throw parsing_error("Sample names must not be empty");
×
43
  } else if (name == "unidentified") {
316✔
44
    throw parsing_error("The sample name 'unidentified' is a reserved "
×
45
                        "name, and cannot be used!");
×
46
  }
47

48
  for (const char c : name) {
1,552✔
49
    if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
1,078✔
50
          (c >= '0' && c <= '9') || (c == '_'))) {
130✔
51
      std::ostringstream error;
×
52
      error << "The sample name " << log_escape(name)
×
53
            << " is not a valid sample name; only letters ('a' to 'z' and 'A' "
54
               "to 'Z'), numbers (0 to 9) and underscores (_) are allowed.";
×
55

56
      throw parsing_error(error.str());
×
57
    }
×
58
  }
59
}
158✔
60

61
void
62
validate_barcode_sequence(const dna_sequence& sequence,
153✔
63
                          const size_t expected_length,
64
                          const int mate)
65
{
66
  std::string_view seq = sequence;
153✔
67

68
  if (seq.find('N') != std::string::npos) {
153✔
69
    std::ostringstream error;
×
70
    error << "Degenerate base (N) found in mate " << mate << " barcode "
×
71
          << "sequence " << log_escape(seq) << ". Degenerate bases are not "
×
72
          << "supported for demultiplexing; please remove before continuing!";
×
73

74
    throw parsing_error(error.str());
×
75
  }
×
76

77
  if (sequence.length() != expected_length) {
306✔
78
    std::ostringstream error;
6✔
79
    error << "Inconsistent mate " << mate << " barcode lengths found: Last "
6✔
80
          << "barcode was " << expected_length << " base-pairs long, but "
6✔
81
          << "barcode " << log_escape(seq) << " is " << seq.length() << " "
18✔
82
          << "base-pairs long. Variable length barcodes are not supported";
6✔
83

84
    throw parsing_error(error.str());
24✔
85
  }
6✔
86
}
147✔
87

88
bool
89
check_barcodes_sequences(const std::vector<sample>& samples,
39✔
90
                         const std::string& source,
91
                         bool paired_end = false)
92
{
93
  if (samples.empty()) {
78✔
94
    throw parsing_error("No samples/barcodes provided");
×
95
  }
96

97
  auto mate_1_len = static_cast<size_t>(-1);
39✔
98
  auto mate_2_len = static_cast<size_t>(-1);
39✔
99

100
  std::vector<std::pair<std::string_view, std::string_view>> sequences;
39✔
101
  for (const auto& sample : samples) {
454✔
102
    validate_sample_name(sample.name());
237✔
103

104
    for (const auto& it : sample) {
614✔
105
      if (mate_1_len == static_cast<size_t>(-1)) {
79✔
106
        mate_1_len = it.barcode_1.length();
39✔
107
        mate_2_len = it.barcode_2.length();
39✔
108

109
        if (!mate_1_len) {
39✔
110
          throw parsing_error("Empty barcode 1 sequence for " + sample.name());
×
111
        }
112
      }
113

114
      validate_barcode_sequence(it.barcode_1, mate_1_len, 1);
79✔
115
      validate_barcode_sequence(it.barcode_2, mate_2_len, 2);
74✔
116

117
      if (paired_end) {
73✔
118
        sequences.emplace_back(it.barcode_1, it.barcode_2);
73✔
119
      } else {
120
        sequences.emplace_back(it.barcode_1, dna_sequence{});
×
121
      }
122
    }
123
  }
124

125
  std::sort(sequences.begin(), sequences.end());
99✔
126
  for (size_t i = 1; i < sequences.size(); ++i) {
130✔
127
    if (sequences.at(i - 1) == sequences.at(i)) {
34✔
128
      const auto& it = sequences.at(i);
2✔
129

130
      if (paired_end) {
2✔
131
        std::ostringstream error;
2✔
132
        error << "Duplicate barcode pairs found in " << log_escape(source)
6✔
133
              << " with barcodes " << log_escape(it.first) << " and "
4✔
134
              << log_escape(it.second) << ". please verify correctness of "
4✔
135
              << "the barcode table and remove any duplicate entries!";
8✔
136

137
        throw parsing_error(error.str());
8✔
138
      } else {
2✔
139
        std::ostringstream error;
×
140
        error << "Duplicate mate 1 barcodes found in " << log_escape(source)
×
141
              << ": " << log_escape(it.first) << ". Even if these are "
×
142
              << "associated with different mate 2 barcodes, it is not "
143
                 "possible to distinguish between these in single-end mode!";
×
144

145
        throw parsing_error(error.str());
×
146
      }
×
147
    }
148
  }
149

150
  return true;
62✔
151
}
39✔
152

153
constexpr bool
154
is_ascii_alpha(const char c)
30✔
155
{
156
  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
30✔
157
}
158

159
constexpr bool
160
is_ascii_alphanum(const char c)
14✔
161
{
162
  return is_ascii_alpha(c) || (c >= '0' && c <= '9');
14✔
163
}
164

165
} // namespace
166

167
///////////////////////////////////////////////////////////////////////////////
168
// Implementations for `read_group`
169

170
read_group::read_group()
162✔
171
  : m_header{ "@RG\tID:1" }
324✔
172
  , m_id{ "1" }
324✔
173
{
174
}
162✔
175

176
read_group::read_group(std::string_view value)
18✔
177
  : m_header{ "@RG" }
50✔
178
{
179
  using invalid = std::invalid_argument;
18✔
180

181
  // It's not unreasonable to except users to try to specify a full @RG line
182
  if (starts_with(value, "@RG\t")) {
36✔
183
    value = value.substr(4);
5✔
184
  }
185

186
  if (!value.empty()) {
18✔
187
    for (const auto& field : split_text(value, '\t')) {
119✔
188
      for (const auto c : field) {
437✔
189
        if (c < ' ' || c > '~') {
92✔
190
          throw invalid("only characters in the range ' ' to '~' are allowed");
1✔
191
        }
192
      }
193

194
      if (field.size() < 4) {
34✔
195
        throw invalid("tags must be at least 4 characters long");
1✔
196
      } else if (!is_ascii_alpha(field.at(0))) {
16✔
197
        throw invalid("first character in tag name must be a letter");
2✔
198
      } else if (!is_ascii_alphanum(field.at(1))) {
14✔
199
        throw invalid(
1✔
200
          "second character in tag name must be a letter or number");
2✔
201
      } else if (field.at(2) != ':') {
13✔
202
        throw invalid("third character in tag must be a colon");
1✔
203
      } else if (starts_with(field, "ID:")) {
36✔
204
        if (!m_id.empty()) {
12✔
205
          throw invalid("multiple ID tags found");
1✔
206
        }
207

208
        m_id = field.substr(3);
10✔
209
      }
210
    }
17✔
211
  }
212

213
  // Generic read-group key if the user did not supply one
214
  if (m_id.empty()) {
22✔
215
    m_id = "1";
7✔
216
    m_header += "\tID:1";
7✔
217
  }
218

219
  if (!value.empty()) {
11✔
220
    m_header += "\t";
10✔
221
    m_header += value;
10✔
222
  }
223
}
32✔
224

225
void
226
read_group::set_id(std::string_view id)
4✔
227
{
228
  AR_REQUIRE(!id.empty());
4✔
229
  update_tag("ID", id);
8✔
230
  m_id = id;
4✔
231
}
4✔
232

233
void
234
read_group::update_tag(std::string_view key, std::string_view value)
55✔
235
{
236
  AR_REQUIRE(!key.empty());
55✔
237
  std::string cache;
55✔
238
  cache.push_back('\t');
55✔
239
  cache.append(key);
55✔
240
  cache.push_back(':');
55✔
241

242
  auto index = m_header.find(cache);
55✔
243
  if (index != std::string::npos) {
55✔
244
    if (value.empty()) {
12✔
245
      cache = m_header.substr(0, index);
8✔
246
    } else {
247
      cache = m_header.substr(0, index + cache.size());
24✔
248
      cache.append(value);
8✔
249
    }
250

251
    index = m_header.find('\t', index + 1);
12✔
252
    if (index != std::string::npos) {
12✔
253
      cache.append(m_header.substr(index));
3✔
254
    }
255

256
    m_header = cache;
67✔
257
  } else if (!value.empty()) {
43✔
258
    m_header.append(cache);
43✔
259
    m_header.append(value);
43✔
260
  }
261
}
55✔
262

263
///////////////////////////////////////////////////////////////////////////////
264
// Implementations for 'adapters' class
265

266
adapter_set::adapter_set(std::initializer_list<string_view_pair> args)
117✔
267
{
268
  for (const auto& pair : args) {
360✔
269
    add(std::string{ pair.first }, std::string{ pair.second });
378✔
270
  }
271
}
117✔
272

273
void
274
adapter_set::add(dna_sequence adapter1, dna_sequence adapter2)
126✔
275
{
276
  m_adapters.emplace_back(adapter1, adapter2.reverse_complement());
252✔
277
}
126✔
278

279
void
280
adapter_set::add(std::string adapter1, const std::string adapter2)
126✔
281
{
282
  add(dna_sequence{ adapter1 }, dna_sequence{ adapter2 });
630✔
283
}
126✔
284

285
adapter_set
286
adapter_set::add_barcodes(const dna_sequence& barcode1,
×
287
                          const dna_sequence& barcode2) const
288
{
289
  adapter_set adapters;
×
290
  for (const auto& it : m_adapters) {
×
291
    // Add sequences directly in alignment orientation
292
    adapters.m_adapters.emplace_back(barcode2.reverse_complement() + it.first,
×
293
                                     it.second + barcode1);
×
294
  }
295

296
  return adapters;
×
297
}
×
298

299
void
300
adapter_set::load(const std::string& filename, bool paired_end)
×
301
{
302
  line_reader reader(filename);
×
303
  const auto adapters = table_reader()
×
304
                          .with_comment_char('#')
×
305
                          .with_min_columns(1 + paired_end)
×
306
                          .with_max_columns(2)
×
307
                          .parse(reader);
×
308

309
  for (const auto& row : adapters) {
×
310
    dna_sequence adapter_1{ row.at(0) };
×
311
    dna_sequence adapter_2;
×
312
    if (row.size() > 1) {
×
313
      adapter_2 = dna_sequence{ row.at(1) };
×
314
    }
315

316
    // Convert from read to alignment orientation
317
    m_adapters.emplace_back(adapter_1, adapter_2.reverse_complement());
×
318
  }
319
}
320

321
sequence_pair_vec
322
adapter_set::to_read_orientation() const
×
323
{
324
  sequence_pair_vec adapters;
×
325
  for (const auto& it : m_adapters) {
×
326
    adapters.emplace_back(it.first, it.second.reverse_complement());
×
327
  }
328

329
  return adapters;
×
330
}
×
331

332
///////////////////////////////////////////////////////////////////////////////
333
// Implementations for 'sample' class
334

335
void
336
sample::add(dna_sequence barcode1, dna_sequence barcode2)
118✔
337
{
338
  m_barcodes.emplace_back(std::move(barcode1), std::move(barcode2));
118✔
339
}
118✔
340

341
void
342
sample::add(std::string barcode1, std::string barcode2)
×
343
{
344
  add(dna_sequence(barcode1), dna_sequence(barcode2));
×
345
}
346

347
void
348
sample::set_adapters(const adapter_set& adapters)
×
349
{
350
  for (auto& it : m_barcodes) {
×
351
    it.adapters = adapters.add_barcodes(it.barcode_1, it.barcode_2);
×
352
  }
353
}
354

355
void
356
sample::set_read_group(const read_group& info)
39✔
357
{
358
  for (auto it = m_barcodes.begin(); it != m_barcodes.end(); ++it) {
312✔
359
    it->info = info;
39✔
360
    it->has_read_group = true;
39✔
361

362
    if (!m_name.empty()) {
78✔
363
      it->info.set_sample(m_name);
×
364

365
      if (m_barcodes.size() > 1) {
×
366
        std::string id = m_name;
×
367
        id.push_back('.');
×
368
        id.append(std::to_string((it - m_barcodes.begin()) + 1));
×
369

370
        it->info.set_id(id);
×
371
      } else {
×
372
        it->info.set_id(m_name);
×
373
      }
374
    }
375

376
    if (it->barcode_1.length() || it->barcode_2.length()) {
156✔
377
      std::string barcodes;
×
378
      barcodes.append(it->barcode_1);
×
379
      if (it->barcode_2.length()) {
×
380
        barcodes.push_back('-');
×
381
        barcodes.append(it->barcode_2);
×
382
      }
383

384
      it->info.set_barcodes(barcodes);
×
385
    }
386
  }
387
}
39✔
388

389
///////////////////////////////////////////////////////////////////////////////
390
// Implementations for 'sample_set' class
391

392
sample_set::sample_set()
×
393
  : m_samples{ sample{} }
×
394
  , m_unidentified("", dna_sequence{}, dna_sequence{})
×
395
{
396
  set_unidentified_read_group(m_read_group);
×
397
}
398

399
sample_set::sample_set(std::initializer_list<sample> args)
39✔
400
  : m_samples(args)
86✔
401
  , m_unidentified("", dna_sequence{}, dna_sequence{})
258✔
402
{
403
  set_unidentified_read_group(m_read_group);
78✔
404

405
  std::sort(m_samples.begin(),
78✔
406
            m_samples.end(),
39✔
407
            [](const auto& a, const auto& b) { return a.name() < b.name(); });
320✔
408

409
  std::string_view name;
39✔
410
  for (const auto& sample : m_samples) {
472✔
411
    validate_sample_name(sample.name());
237✔
412
    if (sample.name() == name) {
237✔
413
      throw parsing_error("duplicate sample name: " + sample.name());
×
414
    }
415

416
    name = sample.name();
237✔
417
  }
418

419
  check_barcodes_sequences(m_samples, "initializer_list", true);
86✔
420
}
87✔
421

422
void
423
sample_set::set_adapters(adapter_set adapters)
×
424
{
425
  m_adapters = std::move(adapters);
×
426
  for (auto& sample : m_samples) {
×
427
    sample.set_adapters(m_adapters);
×
428
  }
429

430
  m_unidentified.set_adapters(m_adapters);
×
431
}
432

433
void
434
sample_set::set_read_group(std::string_view value)
×
435
{
436
  m_read_group = read_group(value);
×
437
  for (auto& sample : m_samples) {
×
438
    sample.set_read_group(m_read_group);
×
439
  }
440

441
  set_unidentified_read_group(m_read_group);
×
442
}
443

444
void
445
sample_set::load(const std::string& filename, const barcode_config& config)
×
446
{
447
  line_reader reader(filename);
×
448
  auto barcodes = table_reader()
×
449
                    .with_comment_char('#')
×
450
                    .with_min_columns(2 + !config.m_unidirectional_barcodes)
×
451
                    .with_max_columns(3)
×
452
                    .parse(reader);
×
453

454
  std::sort(barcodes.begin(), barcodes.end(), [](const auto& a, const auto& b) {
×
455
    return a.at(0) < b.at(0);
×
456
  });
457

458
  std::vector<sample> samples{};
×
459
  for (const auto& row : barcodes) {
×
460
    const auto& name = row.at(0);
×
461
    dna_sequence barcode_1{ row.at(1) };
×
462
    dna_sequence barcode_2;
×
463

464
    if (row.size() > 2) {
×
465
      barcode_2 = dna_sequence{ row.at(2) };
×
466
    }
467

468
    if (samples.empty() || samples.back().name() != name) {
×
469
      sample s{ name, barcode_1, barcode_2 };
×
470
      s.set_adapters(m_adapters);
×
471
      s.set_read_group(m_read_group);
×
472

473
      samples.push_back(std::move(s));
×
474
    } else if (config.m_allow_multiple_barcodes) {
×
475
      samples.back().add(barcode_1, barcode_2);
×
476
    } else {
477
      std::ostringstream error;
×
478
      error << "Duplicate sample name " << log_escape(name)
×
479
            << "; combining different barcodes for one sample is not "
480
               "supported. Please ensure that all sample names are unique!";
×
481

482
      throw parsing_error(error.str());
×
483
    }
×
484
  }
485

486
  // Check before adding reversed barcodes, to prevent misleading error messages
487
  check_barcodes_sequences(samples, filename, config.m_paired_end_mode);
×
488

489
  std::swap(m_samples, samples);
×
490
  if (!config.m_unidirectional_barcodes) {
×
491
    add_reversed_barcodes(config);
×
492
  }
493
}
494

495
void
496
sample_set::set_unidentified_read_group(read_group tmpl)
39✔
497
{
498
  // Unidentified reads lack a SM tag, so add a comment instead
499
  tmpl.set_comment("unidentified");
78✔
500
  m_unidentified.set_read_group(tmpl);
39✔
501
}
39✔
502

503
void
504
sample_set::add_reversed_barcodes(const barcode_config& config)
×
505
{
506
  for (auto& sample : m_samples) {
×
507
    // Original number of barcodes
508
    const size_t count = sample.size();
×
509

510
    for (size_t i = 0; i < count; ++i) {
×
511
      const auto& sequences = sample.at(i);
×
512
      auto barcode_1 = sequences.barcode_2.reverse_complement();
×
513
      auto barcode_2 = sequences.barcode_1.reverse_complement();
×
514
      AR_REQUIRE(barcode_1.length() && barcode_2.length());
×
515

516
      bool reverse_found = false;
×
517
      for (const auto& it : sample) {
×
518
        if (it.barcode_1 == barcode_1 && it.barcode_2 == barcode_2) {
×
519
          reverse_found = true;
520
          break;
521
        }
522
      }
523

524
      if (!reverse_found) {
×
525
        sample.add(std::move(barcode_1), std::move(barcode_2));
×
526
      }
527
    }
528
  }
529

530
  check_barcodes_sequences(
×
531
    m_samples, "reversed barcodes", config.m_paired_end_mode);
×
532
}
533

534
} // namespace adapterremoval
49✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc