• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #46

27 Nov 2024 03:10PM UTC coverage: 27.245% (+1.0%) from 26.244%
#46

push

travis-ci

MikkelSchubert
fix convenience executable make target

2609 of 9576 relevant lines covered (27.25%)

4268.73 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

53.26
/src/sequence_sets.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "sequence_sets.hpp" // declarations
21
#include "debug.hpp"         // for AR_REQUIRE
22
#include "errors.hpp"        // for fastq_error
23
#include "linereader.hpp"    // for line_reader
24
#include "sequence.hpp"      // for dna_sequence
25
#include "strutils.hpp"      // for string_vec, indent_lines
26
#include "table_reader.hpp"  // for table_reader
27
#include <algorithm>         // for max, sort, find
28
#include <sstream>           // for operator<<, basic_ostream, ostringstream
29
#include <stdexcept>         // for invalid_argument
30
#include <string_view>       // for string_view
31
#include <utility>           // for pair
32
#include <vector>            // for vector, vector<>::const_iterator
33

34
namespace adapterremoval {
35

36
namespace {
37

38
void
39
validate_sample_name(std::string_view name)
158✔
40
{
41
  if (name.empty()) {
158✔
42
    throw parsing_error("Sample names must not be empty");
×
43
  } else if (name == "unidentified") {
316✔
44
    throw parsing_error("The sample name 'unidentified' is a reserved "
×
45
                        "name, and cannot be used!");
×
46
  }
47

48
  for (const char c : name) {
1,552✔
49
    if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
1,078✔
50
          (c >= '0' && c <= '9') || (c == '_'))) {
130✔
51
      std::ostringstream error;
×
52
      error << "The sample name " << log_escape(name)
×
53
            << " is not a valid sample name; only letters ('a' to 'z' and 'A' "
54
               "to 'Z'), numbers (0 to 9) and underscores (_) are allowed.";
×
55

56
      throw parsing_error(error.str());
×
57
    }
×
58
  }
59
}
158✔
60

61
void
62
validate_barcode_sequence(const dna_sequence& sequence,
153✔
63
                          const size_t expected_length,
64
                          const int mate)
65
{
66
  std::string_view seq = sequence;
153✔
67

68
  if (seq.find('N') != std::string::npos) {
153✔
69
    std::ostringstream error;
×
70
    error << "Degenerate base (N) found in mate " << mate << " barcode "
×
71
          << "sequence " << log_escape(seq) << ". Degenerate bases are not "
×
72
          << "supported for demultiplexing; please remove before continuing!";
×
73

74
    throw parsing_error(error.str());
×
75
  }
×
76

77
  if (sequence.length() != expected_length) {
306✔
78
    std::ostringstream error;
6✔
79
    error << "Inconsistent mate " << mate << " barcode lengths found: Last "
6✔
80
          << "barcode was " << expected_length << " base-pairs long, but "
6✔
81
          << "barcode " << log_escape(seq) << " is " << seq.length() << " "
18✔
82
          << "base-pairs long. Variable length barcodes are not supported";
6✔
83

84
    throw parsing_error(error.str());
24✔
85
  }
6✔
86
}
147✔
87

88
bool
89
check_barcodes_sequences(const std::vector<sample>& samples,
39✔
90
                         const std::string& source,
91
                         bool paired_end = false)
92
{
93
  if (samples.empty()) {
78✔
94
    throw parsing_error("No samples/barcodes provided");
×
95
  }
96

97
  auto mate_1_len = static_cast<size_t>(-1);
39✔
98
  auto mate_2_len = static_cast<size_t>(-1);
39✔
99

100
  std::vector<std::pair<std::string_view, std::string_view>> sequences;
39✔
101
  for (const auto& sample : samples) {
454✔
102
    validate_sample_name(sample.name());
237✔
103

104
    for (const auto& it : sample) {
614✔
105
      if (mate_1_len == static_cast<size_t>(-1)) {
79✔
106
        mate_1_len = it.barcode_1.length();
39✔
107
        mate_2_len = it.barcode_2.length();
39✔
108

109
        if (!mate_1_len) {
39✔
110
          throw parsing_error("Empty barcode 1 sequence for " + sample.name());
×
111
        }
112
      }
113

114
      validate_barcode_sequence(it.barcode_1, mate_1_len, 1);
79✔
115
      validate_barcode_sequence(it.barcode_2, mate_2_len, 2);
74✔
116

117
      if (paired_end) {
73✔
118
        sequences.emplace_back(it.barcode_1, it.barcode_2);
73✔
119
      } else {
120
        sequences.emplace_back(it.barcode_1, dna_sequence{});
×
121
      }
122
    }
123
  }
124

125
  std::sort(sequences.begin(), sequences.end());
99✔
126
  for (size_t i = 1; i < sequences.size(); ++i) {
130✔
127
    if (sequences.at(i - 1) == sequences.at(i)) {
34✔
128
      const auto& it = sequences.at(i);
2✔
129

130
      if (paired_end) {
2✔
131
        std::ostringstream error;
2✔
132
        error << "Duplicate barcode pairs found in " << log_escape(source)
6✔
133
              << " with barcodes " << log_escape(it.first) << " and "
4✔
134
              << log_escape(it.second) << ". please verify correctness of "
4✔
135
              << "the barcode table and remove any duplicate entries!";
8✔
136

137
        throw parsing_error(error.str());
8✔
138
      } else {
2✔
139
        std::ostringstream error;
×
140
        error << "Duplicate mate 1 barcodes found in " << log_escape(source)
×
141
              << ": " << log_escape(it.first) << ". Even if these are "
×
142
              << "associated with different mate 2 barcodes, it is not "
143
                 "possible to distinguish between these in single-end mode!";
×
144

145
        throw parsing_error(error.str());
×
146
      }
×
147
    }
148
  }
149

150
  return true;
62✔
151
}
39✔
152

153
/** Unescapes the escape sequences "\\" and "\t" in a read-group string */
154
std::string
155
unescape_read_group(std::string_view value)
26✔
156
{
157
  std::string result;
26✔
158

159
  bool in_escape = false;
26✔
160
  for (auto c : value) {
277✔
161
    if (in_escape) {
200✔
162
      if (c == '\\') {
7✔
163
        result.push_back('\\');
1✔
164
      } else if (c == 't') {
6✔
165
        result.push_back('\t');
5✔
166
      } else {
167
        throw std::invalid_argument("invalid escape sequence " +
3✔
168
                                    std::string("'\\") + c + '\'');
5✔
169
      }
170

171
      in_escape = false;
172
    } else if (c == '\\') {
193✔
173
      in_escape = true;
174
    } else {
175
      result.push_back(c);
185✔
176
    }
177
  }
178

179
  if (in_escape) {
25✔
180
    throw std::invalid_argument("incomplete escape sequence at end of string");
1✔
181
  }
182

183
  return result;
24✔
184
}
2✔
185

186
constexpr bool
187
is_ascii_alpha(const char c)
42✔
188
{
189
  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
42✔
190
}
191

192
constexpr bool
193
is_ascii_alphanum(const char c)
20✔
194
{
195
  return is_ascii_alpha(c) || (c >= '0' && c <= '9');
20✔
196
}
197

198
} // namespace
199

200
///////////////////////////////////////////////////////////////////////////////
201
// Implementations for `read_group`
202

203
read_group::read_group()
160✔
204
  : m_header{ "@RG\tID:1\tPG:adapterremoval" }
320✔
205
  , m_id{ "1" }
320✔
206
{
207
}
160✔
208

209
read_group::read_group(std::string_view value_)
26✔
210
  : m_header{ "@RG" }
70✔
211
{
212
  using invalid = std::invalid_argument;
26✔
213
  auto value = unescape_read_group(value_);
26✔
214

215
  // It's not unreasonable to except users to try to specify a full @RG line
216
  if (starts_with(value, "@RG\t")) {
72✔
217
    value = value.substr(4);
20✔
218
  }
219

220
  bool has_pg_tag = false;
24✔
221
  if (!value.empty()) {
48✔
222
    for (const auto& field : split_text(value, '\t')) {
190✔
223
      for (const auto c : field) {
621✔
224
        if (c < ' ' || c > '~') {
132✔
225
          throw invalid("only characters in the range ' ' to '~' are allowed");
1✔
226
        }
227
      }
228

229
      if (field.size() < 4) {
46✔
230
        throw invalid("tags must be at least 4 characters long");
1✔
231
      } else if (!is_ascii_alpha(field.at(0))) {
22✔
232
        throw invalid("first character in tag name must be a letter");
2✔
233
      } else if (!is_ascii_alphanum(field.at(1))) {
20✔
234
        throw invalid(
1✔
235
          "second character in tag name must be a letter or number");
2✔
236
      } else if (field.at(2) != ':') {
19✔
237
        throw invalid("third character in tag must be a colon");
1✔
238
      } else if (starts_with(field, "ID:")) {
54✔
239
        if (!m_id.empty()) {
18✔
240
          throw invalid("multiple ID tags found");
1✔
241
        }
242

243
        m_id = field.substr(3);
16✔
244
      } else if (starts_with(field, "PG:")) {
27✔
245
        // Allow the user to override the PG field, just in case
246
        has_pg_tag = true;
3✔
247
      }
248
    }
23✔
249
  }
250

251
  // Generic read-group key if the user did not supply one
252
  if (m_id.empty()) {
34✔
253
    m_id = "1";
10✔
254
    m_header += "\tID:1";
10✔
255
  }
256

257
  if (!value.empty()) {
34✔
258
    m_header += "\t";
16✔
259
    m_header += value;
16✔
260
  }
261

262
  if (!has_pg_tag) {
17✔
263
    m_header.append("\tPG:adapterremoval");
14✔
264
  }
265
}
51✔
266

267
void
268
read_group::update_tag(std::string_view key, std::string_view value)
12✔
269
{
270
  AR_REQUIRE(!key.empty() && !value.empty());
24✔
271
  std::string cache;
12✔
272
  cache.push_back('\t');
12✔
273
  cache.append(key);
12✔
274
  cache.push_back(':');
12✔
275

276
  auto index = m_header.find(cache);
12✔
277
  if (index != std::string::npos) {
12✔
278
    cache = m_header.substr(0, index + cache.size());
36✔
279
    cache.append(value);
12✔
280

281
    index = m_header.find('\t', index + 1);
12✔
282
    if (index != std::string::npos) {
12✔
283
      cache.append(m_header.substr(index));
36✔
284
    }
285

286
    m_header = cache;
12✔
287
  } else {
288
    m_header.append(cache);
×
289
    m_header.append(value);
×
290
  }
291

292
  if (key == "ID") {
24✔
293
    m_id = value;
18✔
294
  }
295
}
12✔
296

297
///////////////////////////////////////////////////////////////////////////////
298
// Implementations for 'adapters' class
299

300
adapter_set::adapter_set(std::initializer_list<string_view_pair> args)
117✔
301
{
302
  for (const auto& pair : args) {
360✔
303
    add(std::string{ pair.first }, std::string{ pair.second });
378✔
304
  }
305
}
117✔
306

307
void
308
adapter_set::add(dna_sequence adapter1, dna_sequence adapter2)
126✔
309
{
310
  m_adapters.emplace_back(adapter1, adapter2.reverse_complement());
252✔
311
}
126✔
312

313
void
314
adapter_set::add(std::string adapter1, const std::string adapter2)
126✔
315
{
316
  add(dna_sequence{ adapter1 }, dna_sequence{ adapter2 });
630✔
317
}
126✔
318

319
adapter_set
320
adapter_set::add_barcodes(const dna_sequence& barcode1,
×
321
                          const dna_sequence& barcode2) const
322
{
323
  adapter_set adapters;
×
324
  for (const auto& it : m_adapters) {
×
325
    // Add sequences directly in alignment orientation
326
    adapters.m_adapters.emplace_back(barcode2.reverse_complement() + it.first,
×
327
                                     it.second + barcode1);
×
328
  }
329

330
  return adapters;
×
331
}
×
332

333
void
334
adapter_set::load(const std::string& filename, bool paired_end)
×
335
{
336
  line_reader reader(filename);
×
337
  const auto adapters = table_reader()
×
338
                          .with_comment_char('#')
×
339
                          .with_min_columns(1 + paired_end)
×
340
                          .with_max_columns(2)
×
341
                          .parse(reader);
×
342

343
  for (const auto& row : adapters) {
×
344
    dna_sequence adapter_1{ row.at(0) };
×
345
    dna_sequence adapter_2;
×
346
    if (row.size() > 1) {
×
347
      adapter_2 = dna_sequence{ row.at(1) };
×
348
    }
349

350
    // Convert from read to alignment orientation
351
    m_adapters.emplace_back(adapter_1, adapter_2.reverse_complement());
×
352
  }
353
}
354

355
sequence_pair_vec
356
adapter_set::to_read_orientation() const
×
357
{
358
  sequence_pair_vec adapters;
×
359
  for (const auto& it : m_adapters) {
×
360
    adapters.emplace_back(it.first, it.second.reverse_complement());
×
361
  }
362

363
  return adapters;
×
364
}
×
365

366
///////////////////////////////////////////////////////////////////////////////
367
// Implementations for 'sample' class
368

369
void
370
sample::add(dna_sequence barcode1, dna_sequence barcode2)
118✔
371
{
372
  m_barcodes.emplace_back(std::move(barcode1), std::move(barcode2));
118✔
373
}
118✔
374

375
void
376
sample::add(std::string barcode1, std::string barcode2)
×
377
{
378
  add(dna_sequence(barcode1), dna_sequence(barcode2));
×
379
}
380

381
void
382
sample::set_adapters(const adapter_set& adapters)
×
383
{
384
  for (auto& it : m_barcodes) {
×
385
    it.adapters = adapters.add_barcodes(it.barcode_1, it.barcode_2);
×
386
  }
387
}
388

389
void
390
sample::set_read_group(const read_group& info)
×
391
{
392
  for (auto it = m_barcodes.begin(); it != m_barcodes.end(); ++it) {
×
393
    it->info = info;
×
394

395
    if (!m_name.empty()) {
×
396
      it->info.set_sample(m_name);
×
397

398
      if (m_barcodes.size() > 1) {
×
399
        std::string id = m_name;
×
400
        id.push_back('.');
×
401
        id.append(std::to_string((it - m_barcodes.begin()) + 1));
×
402

403
        it->info.set_id(id);
×
404
      } else {
×
405
        it->info.set_id(m_name);
×
406
      }
407
    }
408

409
    if (it->barcode_1.length() || it->barcode_2.length()) {
×
410
      std::string barcodes;
×
411
      barcodes.append(it->barcode_1);
×
412
      if (it->barcode_2.length()) {
×
413
        barcodes.push_back('-');
×
414
        barcodes.append(it->barcode_2);
×
415
      }
416

417
      it->info.set_barcodes(barcodes);
×
418
    }
419
  }
420
}
421

422
///////////////////////////////////////////////////////////////////////////////
423
// Implementations for 'sample_set' class
424

425
sample_set::sample_set()
×
426
  : m_samples{ sample{} }
×
427
  , m_unidentified("unidentified", dna_sequence{}, dna_sequence{})
×
428
{
429
}
430

431
sample_set::sample_set(std::initializer_list<sample> args)
39✔
432
  : m_samples(args)
86✔
433
  , m_unidentified("unidentified", dna_sequence{}, dna_sequence{})
258✔
434
{
435
  std::sort(m_samples.begin(),
78✔
436
            m_samples.end(),
39✔
437
            [](const auto& a, const auto& b) { return a.name() < b.name(); });
320✔
438

439
  std::string_view name;
39✔
440
  for (const auto& sample : m_samples) {
472✔
441
    validate_sample_name(sample.name());
237✔
442
    if (sample.name() == name) {
237✔
443
      throw parsing_error("duplicate sample name: " + sample.name());
×
444
    }
445

446
    name = sample.name();
237✔
447
  }
448

449
  check_barcodes_sequences(m_samples, "initializer_list", true);
86✔
450
}
87✔
451

452
void
453
sample_set::set_adapters(adapter_set adapters)
×
454
{
455
  m_adapters = std::move(adapters);
×
456
  for (auto& sample : m_samples) {
×
457
    sample.set_adapters(m_adapters);
×
458
  }
459

460
  m_unidentified.set_adapters(m_adapters);
×
461
}
462

463
void
464
sample_set::set_read_group(std::string_view value)
×
465
{
466
  m_read_group = read_group(value);
×
467
  for (auto& sample : m_samples) {
×
468
    sample.set_read_group(m_read_group);
×
469
  }
470

471
  m_unidentified.set_read_group(m_read_group);
×
472
}
473

474
void
475
sample_set::load(const std::string& filename, const barcode_config& config)
×
476
{
477
  line_reader reader(filename);
×
478
  auto barcodes = table_reader()
×
479
                    .with_comment_char('#')
×
480
                    .with_min_columns(2 + !config.m_unidirectional_barcodes)
×
481
                    .with_max_columns(3)
×
482
                    .parse(reader);
×
483

484
  std::sort(barcodes.begin(), barcodes.end(), [](const auto& a, const auto& b) {
×
485
    return a.at(0) < b.at(0);
×
486
  });
487

488
  std::vector<sample> samples{};
×
489
  for (const auto& row : barcodes) {
×
490
    const auto& name = row.at(0);
×
491
    dna_sequence barcode_1{ row.at(1) };
×
492
    dna_sequence barcode_2;
×
493

494
    if (row.size() > 2) {
×
495
      barcode_2 = dna_sequence{ row.at(2) };
×
496
    }
497

498
    if (samples.empty() || samples.back().name() != name) {
×
499
      sample s{ name, barcode_1, barcode_2 };
×
500
      s.set_adapters(m_adapters);
×
501
      s.set_read_group(m_read_group);
×
502

503
      samples.push_back(std::move(s));
×
504
    } else if (config.m_allow_multiple_barcodes) {
×
505
      samples.back().add(barcode_1, barcode_2);
×
506
    } else {
507
      std::ostringstream error;
×
508
      error << "Duplicate sample name " << log_escape(name)
×
509
            << "; combining different barcodes for one sample is not "
510
               "supported. Please ensure that all sample names are unique!";
×
511

512
      throw parsing_error(error.str());
×
513
    }
×
514
  }
515

516
  // Check before adding reversed barcodes, to prevent misleading error messages
517
  check_barcodes_sequences(samples, filename, config.m_paired_end_mode);
×
518

519
  std::swap(m_samples, samples);
×
520
  if (!config.m_unidirectional_barcodes) {
×
521
    add_reversed_barcodes(config);
×
522
  }
523
}
524

525
void
526
sample_set::add_reversed_barcodes(const barcode_config& config)
×
527
{
528
  for (auto& sample : m_samples) {
×
529
    // Original number of barcodes
530
    const size_t count = sample.size();
×
531

532
    for (size_t i = 0; i < count; ++i) {
×
533
      const auto& sequences = sample.at(i);
×
534
      auto barcode_1 = sequences.barcode_2.reverse_complement();
×
535
      auto barcode_2 = sequences.barcode_1.reverse_complement();
×
536
      AR_REQUIRE(barcode_1.length() && barcode_2.length());
×
537

538
      bool reverse_found = false;
×
539
      for (const auto& it : sample) {
×
540
        if (it.barcode_1 == barcode_1 && it.barcode_2 == barcode_2) {
×
541
          reverse_found = true;
542
          break;
543
        }
544
      }
545

546
      if (!reverse_found) {
×
547
        sample.add(std::move(barcode_1), std::move(barcode_2));
×
548
      }
549
    }
550
  }
551

552
  check_barcodes_sequences(
×
553
    m_samples, "reversed barcodes", config.m_paired_end_mode);
×
554
}
555

556
} // namespace adapterremoval
49✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc