• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #36

22 Jul 2024 09:33AM UTC coverage: 87.26% (-12.7%) from 100.0%
#36

push

travis-ci

MikkelSchubert
remove duplicate tests

2185 of 2504 relevant lines covered (87.26%)

16293.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.07
/src/alignment.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "alignment.hpp"
21
#include "alignment_tables.hpp" // for DIFFERENT_NTS, IDENTICAL_NTS
22
#include "commontypes.hpp"      // for merge_strategy
23
#include "debug.hpp"            // for AR_REQUIRE
24
#include "fastq.hpp"            // for fastq, fastq_pair_vec
25
#include "simd.hpp"             // for size_t, get_compare_subsequences_func
26
#include <algorithm>            // for max, min
27
#include <array>                // for array
28
#include <string>               // for string, operator+
29
#include <utility>              // for swap, pair
30
#include <vector>               // for vector
31

32
namespace adapterremoval {
33

34
bool
35
sequence_aligner::pairwise_align_sequences(alignment_info& alignment,
123✔
36
                                           const char* seq1,
37
                                           const size_t seq1_len,
38
                                           const char* seq2,
39
                                           const size_t seq2_len,
40
                                           const int min_offset) const
41
{
42
  int offset =
123✔
43
    // The alignment must involve at least one base from seq2,
44
    std::max(std::max(min_offset, -static_cast<int>(seq2_len) + 1),
246✔
45
             // but there's no point aligning pairs too short to matter. This
46
             // currently only applies to --adapter-list mode.
47
             alignment.score() - static_cast<int>(seq2_len));
369✔
48
  int end_offset = static_cast<int>(seq1_len) - std::max(1, alignment.score());
369✔
49

50
  bool alignment_found = false;
123✔
51
  for (; offset <= end_offset; ++offset) {
984✔
52
    size_t initial_seq1_offset;
861✔
53
    size_t initial_seq2_offset;
861✔
54

55
    if (offset < 0) {
861✔
56
      initial_seq1_offset = 0;
96✔
57
      initial_seq2_offset = -offset;
96✔
58
    } else {
59
      initial_seq1_offset = offset;
765✔
60
      initial_seq2_offset = 0;
765✔
61
    }
62

63
    const auto length =
861✔
64
      std::min(seq1_len - initial_seq1_offset, seq2_len - initial_seq2_offset);
1,722✔
65

66
    alignment_info current;
861✔
67
    current.offset = offset;
861✔
68
    current.length = length;
861✔
69

70
    AR_REQUIRE(static_cast<int>(length) >= alignment.score());
1,722✔
71
    if (m_compare_func(current.n_mismatches,
1,722✔
72
                       current.n_ambiguous,
73
                       seq1 + initial_seq1_offset,
74
                       seq2 + initial_seq2_offset,
75
                       length,
76
                       length - alignment.score()) &&
2,583✔
77
        current.is_better_than(alignment)) {
1,040✔
78
      alignment = current;
126✔
79
      alignment_found = true;
126✔
80

81
      // Alignments involving fewer than `score` bases are not interesting
82
      end_offset = static_cast<int>(seq1_len) - alignment.score();
252✔
83
    }
84
  }
85

86
  return alignment_found;
123✔
87
}
88

89
struct phred_scores
90
{
91
  static phred_scores update(char qual_1, char qual_2)
38✔
92
  {
93
    AR_REQUIRE(qual_1 >= qual_2);
38✔
94

95
    const auto phred_1 = static_cast<size_t>(qual_1 - PHRED_OFFSET_MIN);
38✔
96
    const auto phred_2 = static_cast<size_t>(qual_2 - PHRED_OFFSET_MIN);
38✔
97
    const size_t index = (phred_1 * (PHRED_SCORE_MAX + 1)) + phred_2;
38✔
98

99
    return { IDENTICAL_NTS.at(index), DIFFERENT_NTS.at(index) };
114✔
100
  }
101

102
  //! Phred score to assign if the two nucleotides are identical
103
  char identical_nts;
104
  //! Phred score to assign if the two nucleotides differ
105
  char different_nts;
106
};
107

108
///////////////////////////////////////////////////////////////////////////////
109
// Public functions
110

111
bool
112
alignment_info::is_better_than(const alignment_info& other) const
179✔
113
{
114
  if (score() > other.score()) {
537✔
115
    return true;
116
  } else if (score() == other.score()) {
213✔
117
    if (length > other.length) {
69✔
118
      return true;
119
    } else if (length == other.length) {
51✔
120
      return n_ambiguous < other.n_ambiguous;
45✔
121
    }
122
  }
123

124
  return false;
125
}
126

127
void
128
alignment_info::truncate_single_end(fastq& read) const
48✔
129
{
130
  // Given a shift, the alignment of the adapter may start one or more
131
  // bases before the start of the sequence, leading to a negative offset
132
  const size_t len = std::max<int>(0, offset);
96✔
133

134
  return read.truncate(0, len);
48✔
135
}
136

137
size_t
138
alignment_info::truncate_paired_end(fastq& read1, fastq& read2) const
49✔
139
{
140
  size_t had_adapter = 0;
49✔
141
  const size_t isize = insert_size(read1, read2);
49✔
142

143
  if (offset >= 0) {
48✔
144
    // Read1 can potentially extend past read2, but by definition read2
145
    // cannot extend past read1 when the offset is not negative, so there
146
    // is no need to edit read2.
147
    had_adapter += isize < read1.length();
36✔
148
    read1.truncate(0, isize);
36✔
149
  } else {
150
    had_adapter += isize < read1.length();
12✔
151
    had_adapter += isize < read2.length();
12✔
152

153
    read1.truncate(0, isize);
12✔
154
    read2.truncate(read2.length() - isize);
24✔
155
  }
156

157
  return had_adapter;
48✔
158
}
159

160
size_t
161
alignment_info::insert_size(const fastq& read1, const fastq& read2) const
49✔
162
{
163
  AR_REQUIRE(offset <= static_cast<int>(read1.length()));
102✔
164

165
  return std::max<int>(0, static_cast<int>(read2.length()) + offset);
144✔
166
}
167

168
////////////////////////////////////////////////////////////////////////////////
169
// Implementations for `sequence_aligner`
170

171
sequence_aligner::sequence_aligner(const fastq_pair_vec& adapters,
114✔
172
                                   simd::instruction_set is)
114✔
173
  : m_adapters(adapters)
114✔
174
  , m_compare_func(simd::get_compare_subsequences_func(is))
114✔
175
  , m_padding(simd::padding(is))
114✔
176
  , m_max_adapter_len_1()
114✔
177
  , m_max_adapter_len_2()
114✔
178
{
179
  for (const auto& it : m_adapters) {
948✔
180
    m_max_adapter_len_1 = std::max(m_max_adapter_len_1, it.first.length());
369✔
181
    m_max_adapter_len_2 = std::max(m_max_adapter_len_2, it.second.length());
369✔
182
  }
183
}
114✔
184

185
alignment_info
186
sequence_aligner::align_single_end(const fastq& read, int max_shift) const
78✔
187
{
188
  int adapter_id = 0;
78✔
189
  alignment_info alignment;
78✔
190

191
  std::string buffer;
78✔
192
  buffer.reserve(m_max_adapter_len_1 + read.length() + 2 * m_padding);
156✔
193

194
  for (const auto& adapter_pair : m_adapters) {
573✔
195
    const auto& adapter = adapter_pair.first.sequence();
174✔
196

197
    buffer.clear();
87✔
198
    buffer += read.sequence();
174✔
199
    buffer.resize(buffer.size() + m_padding, 'N');
174✔
200
    buffer += adapter;
87✔
201
    buffer.resize(buffer.size() + m_padding, 'N');
174✔
202

203
    const char* read_data = buffer.data();
87✔
204
    const char* adapter_data = buffer.data() + read.length() + m_padding;
261✔
205

206
    if (pairwise_align_sequences(alignment,
174✔
207
                                 read_data,
208
                                 read.length(),
87✔
209
                                 adapter_data,
210
                                 adapter.length(),
87✔
211
                                 -max_shift)) {
212
      alignment.adapter_id = adapter_id;
66✔
213
    }
214

215
    ++adapter_id;
87✔
216
  }
217

218
  return alignment;
156✔
219
}
78✔
220

221
alignment_info
222
sequence_aligner::align_paired_end(const fastq& read1,
36✔
223
                                   const fastq& read2,
224
                                   int max_shift) const
225
{
226
  int adapter_id = 0;
36✔
227
  alignment_info alignment;
36✔
228

229
  std::string buffer;
36✔
230
  buffer.reserve(m_max_adapter_len_2 + read1.length() + m_padding +
108✔
231
                 m_max_adapter_len_1 + read2.length() + m_padding);
72✔
232

233
  for (const auto& adapter_pair : m_adapters) {
252✔
234
    const fastq& adapter1 = adapter_pair.first;
36✔
235
    const fastq& adapter2 = adapter_pair.second;
36✔
236

237
    buffer.clear();
36✔
238
    buffer += adapter2.sequence();
72✔
239
    buffer += read1.sequence();
72✔
240
    buffer.append(m_padding, 'N');
36✔
241
    buffer += read2.sequence();
72✔
242
    buffer += adapter1.sequence();
72✔
243
    buffer.append(m_padding, 'N');
36✔
244

245
    const char* sequence1 = buffer.data();
36✔
246
    const size_t sequence1_len = adapter2.length() + read1.length();
72✔
247
    const char* sequence2 = buffer.data() + sequence1_len + m_padding;
72✔
248
    const size_t sequence2_len = adapter1.length() + read2.length();
72✔
249

250
    // Only consider alignments where at least one nucleotide from each read
251
    // is aligned against the other, included shifted alignments to account
252
    // for missing bases at the 5' ends of the reads.
253
    const int min_offset = adapter2.length() - read2.length() - max_shift;
72✔
254

255
    if (pairwise_align_sequences(alignment,
36✔
256
                                 sequence1,
257
                                 sequence1_len,
258
                                 sequence2,
259
                                 sequence2_len,
260
                                 min_offset)) {
261
      alignment.adapter_id = adapter_id;
33✔
262
      // Convert the alignment into an alignment between read 1 & 2 only
263
      alignment.offset -= adapter2.length();
66✔
264
    }
265

266
    adapter_id++;
36✔
267
  }
268

269
  return alignment;
72✔
270
}
36✔
271

272
void
273
strip_mate_info(std::string& header, const char mate_sep)
33✔
274
{
275
  size_t pos = header.find_first_of(' ');
33✔
276
  if (pos == std::string::npos) {
33✔
277
    pos = header.length();
62✔
278
  }
279

280
  if (pos >= 2 && header.at(pos - 2) == mate_sep) {
66✔
281
    const char digit = header.at(pos - 1);
12✔
282

283
    if (digit == '1' || digit == '2') {
6✔
284
      header.erase(pos - 2, 2);
6✔
285
    }
286
  }
287
}
33✔
288

289
sequence_merger::sequence_merger()
35✔
290
  : m_merge_strategy(merge_strategy::conservative)
35✔
291
{
292
}
35✔
293

294
void
295
sequence_merger::set_mate_separator(char sep)
2✔
296
{
297
  m_mate_sep = sep;
2✔
298
}
2✔
299

300
void
301
sequence_merger::set_merge_strategy(merge_strategy strategy)
19✔
302
{
303
  m_merge_strategy = strategy;
19✔
304
}
19✔
305

306
void
307
sequence_merger::set_max_recalculated_score(char max)
×
308
{
309
  m_max_score = max + '!';
×
310
}
311

312
void
313
sequence_merger::set_rng(std::mt19937* rng)
2✔
314
{
315
  m_rng = rng;
2✔
316
}
2✔
317

318
void
319
sequence_merger::merge(const alignment_info& alignment,
35✔
320
                       fastq& read1,
321
                       const fastq& read2)
322
{
323
  AR_REQUIRE(m_merge_strategy != merge_strategy::none);
35✔
324
  AR_REQUIRE((m_merge_strategy == merge_strategy::original) == !!m_rng);
35✔
325

326
  // Gap between the two reads is not allowed
327
  AR_REQUIRE(alignment.offset <= static_cast<int>(read1.length()));
78✔
328

329
  // Offset to the first base overlapping read 2
330
  const auto read_1_offset = static_cast<size_t>(std::max(0, alignment.offset));
66✔
331
  // Offset to the last base overlapping read 1
332
  const size_t read_2_offset =
33✔
333
    static_cast<int>(read1.length()) - std::max(0, alignment.offset);
99✔
334

335
  AR_REQUIRE(read1.length() - read_1_offset == read_2_offset);
66✔
336

337
  // Produce draft by merging r1 and the parts of r2 that extend past r1
338
  read1.m_sequence.append(read2.sequence(), read_2_offset);
66✔
339
  read1.m_qualities.append(read2.qualities(), read_2_offset);
66✔
340
  AR_REQUIRE(read1.m_sequence.length() == read1.m_qualities.length());
99✔
341

342
  // Pick the best bases for the overlapping part of the reads
343
  for (size_t i = 0; i < read_2_offset; ++i) {
171✔
344
    char& nt_1 = read1.m_sequence.at(i + read_1_offset);
276✔
345
    char& qual_1 = read1.m_qualities.at(i + read_1_offset);
276✔
346
    const char nt_2 = read2.sequence().at(i);
414✔
347
    const char qual_2 = read2.qualities().at(i);
414✔
348

349
    if (m_merge_strategy == merge_strategy::conservative) {
138✔
350
      conservative_merge(nt_1, qual_1, nt_2, qual_2);
66✔
351
    } else {
352
      original_merge(nt_1, qual_1, nt_2, qual_2);
72✔
353
    }
354
  }
355

356
  m_reads_merged += 2;
33✔
357
  m_bases_merged += read_2_offset;
33✔
358

359
  // Remove mate number from read, if present
360
  if (m_mate_sep) {
33✔
361
    strip_mate_info(read1.m_header, m_mate_sep);
33✔
362
  }
363
}
33✔
364

365
void
366
sequence_merger::original_merge(char& nt_1,
72✔
367
                                char& qual_1,
368
                                char nt_2,
369
                                char qual_2)
370
{
371
  if (nt_1 == 'N' || nt_2 == 'N') {
72✔
372
    // If one of the bases are N, then we suppose that we just have (at
373
    // most) a single read at that site and choose that.
374
    if (nt_1 == 'N' && nt_2 == 'N') {
33✔
375
      qual_1 = PHRED_OFFSET_MIN;
2✔
376
    } else if (nt_1 == 'N') {
31✔
377
      nt_1 = nt_2;
4✔
378
      qual_1 = qual_2;
4✔
379
    }
380
  } else if (nt_1 != nt_2 && qual_1 == qual_2) {
39✔
381
    if (m_rng) {
3✔
382
      nt_1 = ((*m_rng)() & 1) ? nt_1 : nt_2;
2✔
383

384
      const phred_scores& new_scores = phred_scores::update(qual_1, qual_2);
2✔
385
      qual_1 = new_scores.different_nts;
2✔
386
    } else {
387
      nt_1 = 'N';
1✔
388
      qual_1 = PHRED_OFFSET_MIN;
1✔
389
    }
390

391
    m_mismatches_unresolved++;
3✔
392
  } else {
3✔
393
    // Ensure that nt_1 / qual_1 always contains the preferred nt / score
394
    // This is an assumption of the g_updated_phred_scores cache.
395
    if (qual_1 < qual_2) {
36✔
396
      std::swap(nt_1, nt_2);
8✔
397
      std::swap(qual_1, qual_2);
8✔
398
    }
399

400
    const phred_scores& new_scores = phred_scores::update(qual_1, qual_2);
36✔
401

402
    if (nt_1 == nt_2) {
36✔
403
      qual_1 = new_scores.identical_nts;
32✔
404
    } else {
405
      qual_1 = new_scores.different_nts;
4✔
406
      m_mismatches_resolved++;
4✔
407
    }
408

409
    qual_1 = std::min<char>(m_max_score, qual_1);
72✔
410
  }
411
}
72✔
412

413
void
414
sequence_merger::conservative_merge(char& nt_1,
66✔
415
                                    char& qual_1,
416
                                    const char nt_2,
417
                                    const char qual_2)
418
{
419
  if (nt_2 == 'N' || nt_1 == 'N') {
66✔
420
    if (nt_1 == 'N' && nt_2 == 'N') {
33✔
421
      qual_1 = PHRED_OFFSET_MIN;
2✔
422
    } else if (nt_1 == 'N') {
31✔
423
      nt_1 = nt_2;
4✔
424
      qual_1 = qual_2;
4✔
425
    }
426
  } else if (nt_1 == nt_2) {
33✔
427
    qual_1 = std::max(qual_1, qual_2);
56✔
428
  } else {
429
    if (qual_1 < qual_2) {
5✔
430
      nt_1 = nt_2;
2✔
431
      qual_1 = qual_2 - qual_1 + PHRED_OFFSET_MIN;
2✔
432
      m_mismatches_resolved++;
2✔
433
    } else if (qual_1 > qual_2) {
3✔
434
      qual_1 = qual_1 - qual_2 + PHRED_OFFSET_MIN;
2✔
435
      m_mismatches_resolved++;
2✔
436
    } else {
437
      // No way to reasonably pick a base
438
      nt_1 = 'N';
1✔
439
      qual_1 = PHRED_OFFSET_MIN;
1✔
440
      m_mismatches_unresolved++;
1✔
441
    }
442
  }
443
}
66✔
444

445
bool
446
extract_adapter_sequences(const alignment_info& alignment,
12✔
447
                          fastq& read1,
448
                          fastq& read2)
449
{
450
  AR_REQUIRE(alignment.offset <= static_cast<int>(read1.length()));
24✔
451
  const int template_length =
12✔
452
    std::max(0, static_cast<int>(read2.length()) + alignment.offset);
36✔
453

454
  read1.truncate(std::min<size_t>(read1.length(), template_length));
36✔
455
  read2.truncate(
12✔
456
    0, std::max<int>(0, static_cast<int>(read2.length()) - template_length));
36✔
457

458
  return read1.sequence().length() || read2.sequence().length();
57✔
459
}
460

461
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc