• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #45

20 Sep 2024 06:49PM UTC coverage: 26.244% (-49.2%) from 75.443%
#45

push

travis-ci

web-flow
attempt to fix coveralls run

2458 of 9366 relevant lines covered (26.24%)

4362.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.43
/src/alignment.hpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
4
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
5
 *                                                                       *
6
 * This program is free software: you can redistribute it and/or modify  *
7
 * it under the terms of the GNU General Public License as published by  *
8
 * the Free Software Foundation, either version 3 of the License, or     *
9
 * (at your option) any later version.                                   *
10
 *                                                                       *
11
 * This program is distributed in the hope that it will be useful,       *
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
14
 * GNU General Public License for more details.                          *
15
 *                                                                       *
16
 * You should have received a copy of the GNU General Public License     *
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
18
\*************************************************************************/
19
#pragma once
20

21
#include "fastq_enc.hpp" // for MATE_SEPARATOR
22
#include "simd.hpp"      // for size_t, compare_subsequences_func, instru...
23
#include <string>        // for string
24

25
namespace adapterremoval {
26

27
enum class merge_strategy;
28
class adapter_set;
29
class fastq;
30

31
/**
32
 * Summarizes an alignment.
33
 *
34
 * A single offset value is used to represent the alignment between two
35
 * sequences, with values ranging from -inf to seq1.len() - 1. The alignment
36
 * of the first base in each sequence against each other is defined as having
37
 * the offset 0, with each other offset defined as the relative position of
38
 * seq 2 base 0 to seq 1 base 0:
39
 *
40
 * Seq 1:           aaaaaaaaa
41
 * seq 2:           bbbbbbbbbbb
42
 * Offset: 0
43
 *
44
 * Seq 1:            aaaaaaaaa
45
 * Seq 2: bbbbbbbbbbb
46
 * Offset: -11
47
 *
48
 * Seq 1:           aaaaaaaaa
49
 * Seq 2:                   bbbbbbbbbbb
50
 * Offset: 8
51
 *
52
 * The meaning of the offset is slightly different in SE and PE mode; in SE
53
 * mode seq2 is the adapter sequence, and the offset therefore unambiguously
54
 * shows the starting position of the adapter, regardless of the size of the
55
 * adapter sequence.
56
 *
57
 * In PE mode, while the alignment is between seq1+PCR2 and PCR1+seq2, the
58
 * offset returned is relative to to seq1 and seq2 only. Thus, if '1'
59
 * represents PCR1 and '2' represents PCR2, the following alignment results
60
 * from PE mode (ie. offset = -9 rather than 3):
61
 *
62
 * Seq 1:    22222222222aaaaaaaaa
63
 * Seq 2:      bbbbbbbbbbb1111111111
64
 * Offset: -9
65
 *
66
 * Note that the offset can never be greater than len(read 1) - 1, but can be
67
 * less than -len(seq 2) + 1, if a positive shift is set in PE mode. In PE
68
 * mode, an offset <= -len(seq 2) indicates that no non-adapter sequence was
69
 * found, while an offset <= 0 indicates the same for SE mode. Offsets less
70
 * than -len(seq 2) for PE or less than 0 for SE indicates that bases have been
71
 * skipped during sequencing, and are discoverable if a shift is set:
72
 *
73
 * Read 1:           ...22222222222aaaaaaaaa
74
 * Read 2:             bbbbbbbbbbb1111111111...
75
 * Offset: -12
76
 *
77
 */
78
struct alignment_info
157✔
79
{
80
  /**
81
   * Returns true if this is a better alignment than other.
82
   *
83
   * When selecting among multiple alignments, the follow criteria are used:
84
   * 1. The alignment with the highest score is preferred.
85
   * 2. If score is equal, the longest alignment is preferred.
86
   * 3. If score and length is equal, the alignment with fewest Ns is preferred.
87
   */
88
  bool is_better_than(const alignment_info& other) const;
89

90
  /**
91
   * Truncates a SE read according to the alignment, such that the second read
92
   * used in the alignment (assumed to represent adapter sequence) is excluded
93
   * from the read passed to this function.
94
   */
95
  void truncate_single_end(fastq& read) const;
96

97
  /**
98
   * Truncate a pair of PE reads, such that any adapter sequence inferred from
99
   * the alignment is excluded from both mates.
100
   *
101
   * @return The number of sequences (0 .. 2) which contained adapter sequence.
102
   */
103
  size_t truncate_paired_end(fastq& read1, fastq& read2) const;
104

105
  /** Calculates the insert size given a pair of un-truncated reads. */
106
  size_t insert_size(const fastq& read1, const fastq& read2) const;
107

108
  /** Returns the score used to compare alignments */
109
  inline int score() const
2,834✔
110
  {
111
    return static_cast<int>(length) -
2,834✔
112
           static_cast<int>(n_ambiguous + 2 * n_mismatches);
2,834✔
113
  }
114

115
  //! 0-based ID of best matching adapter or a negative value if not set.
116
  int adapter_id = -1;
117
  //! Zero based id of the adapter which offered the best alignment. Is less
118
  //! than zero if no alignment was found.
119
  int offset = 0;
120
  //! The number of base-pairs included in the alignment. This number
121
  //! includes both bases aligned between the two mates (in PE mode) and the
122
  //! number of bases aligned between mates and adapter sequences.
123
  size_t length = 0;
124
  //! Number of positions in the alignment in which the two sequences were
125
  //! both called (not N) but differed
126
  size_t n_mismatches = 0;
127
  //! Number of positions in the alignment where one or both bases were N.
128
  size_t n_ambiguous = 0;
129
};
130

131
class sequence_aligner
360✔
132
{
133
public:
134
  explicit sequence_aligner(const adapter_set& adapters,
135
                            simd::instruction_set is);
136

137
  /**
138
   * Attempts to align adapters sequences against a SE read.
139
   *
140
   * @param read A read potentially containing adapter sequences
141
   * @param max_shift Allow up to this number of missing bases at the 5' end of
142
   *                  the read, when aligning the adapter.
143
   * @return The best alignment, or a length 0 alignment if not aligned.
144
   *
145
   * The best alignment is selected using alignment_info::is_better_than.
146
   */
147
  alignment_info align_single_end(const fastq& read, int max_shift);
148

149
  /**
150
   * Attempts to align PE mates, along with any adapter pairs.
151
   *
152
   * @param read1 A mate 1 read potentially containing adapter sequences
153
   * @param read2 A mate 2 read potentially containing adapter sequences
154
   * @param max_shift Allow up to this number of missing bases at the 5' end of
155
   *                  both mate reads.
156
   * @return The best alignment, or a length 0 alignment if not aligned.
157
   *
158
   * The alignment is carried out following the concatenation of adapter2 and
159
   * read1, and the concatenation of read2 and adapter1, resulting in this
160
   * alignment:
161
   *
162
   *    adapter2-read1
163
   *    read2-adapter1
164
   *
165
   * Note the returned offset is relative read1, not to adapter2 + read1,
166
   * and can be used to directly infer the alignment between read1 and read2.
167
   */
168
  alignment_info align_paired_end(const fastq& read1,
169
                                  const fastq& read2,
170
                                  int max_shift);
171

172
private:
173
  /**
174
   * Perform pairwise alignment between two sequences.
175
   *
176
   * @param alignment Current best alignment.
177
   * @param seq1 First sequence to align (mate 1).
178
   * @param seq1_len The (unpadded) length of seq1.
179
   * @param seq2 Second sequence to align (mate 2 or adapter).
180
   * @param seq2_len The (unpadded) length of seq2.
181
   * @param min_offset Search for alignments from this offset.
182
   * @param max_offset Search for alignments until and including this offset.
183
   * @return true if a better alignment was found
184
   */
185
  bool pairwise_align_sequences(alignment_info& alignment,
186
                                const char* seq1,
187
                                size_t seq1_len,
188
                                const char* seq2,
189
                                size_t seq2_len,
190
                                int min_offset,
191
                                int max_offset) const;
192

193
  /** Sets the user-facing adapter ID and prioritizes adapter sequences */
194
  void finalize_alignment(alignment_info& alignment, int max_offset);
195

196
  //! SIMD instruction set to use for sequence comparisons
197
  const simd::compare_subsequences_func m_compare_func;
198
  //! Padding required by chosen SIMD instructions
199
  const size_t m_padding;
200
  //! Internal buffer used to combine adapters and reads
201
  std::string m_buffer{};
202

203
  struct adapter_pair
204
  {
205
    //! The original adapter ID
206
    int adapter_id = 0;
207
    //! The number of best alignments that included this adapter
208
    int hits = 0;
209
    //! The forward adapter sequence
210
    std::string_view adapter1{};
211
    //! The reverse adapter sequence
212
    std::string_view adapter2{};
213
  };
214

215
  //! Vector of adapter IDs and number of matching alignments with that adapter
216
  std::vector<adapter_pair> m_adapters{};
217
};
218

219
/**
220
 * Class for merging two sequence fragments into a single sequence, either
221
 * picking the highest quality base and its associated quality score, or
222
 * recalculating the quality score of matching/mismatching bases using the
223
 * original AR methodology.
224
 */
225
class sequence_merger
226
{
227
public:
228
  sequence_merger();
229

230
  /**
231
   * Sets the expected mate separator. This is used to trim mate numbers from
232
   * merged reads.
233
   */
234
  void set_mate_separator(char sep = MATE_SEPARATOR);
235

236
  /** Set the strategy used when merging bases. */
237
  void set_merge_strategy(merge_strategy strategy);
238
  /** Sets the maximum base quality score for recalculated scores. */
239
  void set_max_recalculated_score(char max);
240

241
  /**
242
   * Merges two overlapping, trimmed reads into a single sequence. Bases and
243
   * quality scores are assigned based on the merge_strategy chosen.
244
   * The sequences are assumed to have been trimmed using the given alignment.
245
   * This function will produce undefined results if that is not the case!
246
   */
247
  void merge(const alignment_info& alignment, fastq& read1, const fastq& read2);
248

249
  /* Returns number of reads merged */
250
  size_t reads_merged() const { return m_reads_merged; }
×
251

252
  /* Returns number of bases merged */
253
  size_t bases_merged() const { return m_bases_merged; }
×
254

255
  /* Returns number of mismatches where a higher quality base was selected */
256
  size_t mismatches_resolved() const { return m_mismatches_resolved; }
257

258
  /* Returns number of mismatches where there was no higher quality base */
259
  size_t mismatches_unresolved() const { return m_mismatches_unresolved; }
260

261
private:
262
  //! Mate separator used in read names
263
  char m_mate_sep = MATE_SEPARATOR;
264
  //! Strategy used when merging reads
265
  merge_strategy m_merge_strategy;
266
  //! Maximum score when recalculating qualities in non-conservative mode
267
  char m_max_score = PHRED_OFFSET_MAX;
268

269
  //! Total number of reads merged
270
  size_t m_reads_merged = 0;
271
  //! The  number of bases merged
272
  size_t m_bases_merged = 0;
273
  //! The number of mismatches where a higher quality base was selected
274
  size_t m_mismatches_resolved = 0;
275
  //! The number of mismatches where there was no higher quality base
276
  size_t m_mismatches_unresolved = 0;
277
};
278

279
/**
280
 * Truncates reads such that only adapter sequence remains.
281
 *
282
 * @return True if either or both reads contained adapter sequence.
283
 *
284
 * Reads that do not contain any adapter sequence are completely truncated,
285
 * such no bases remain of the original sequence.
286
 */
287
bool
288
extract_adapter_sequences(const alignment_info& alignment,
289
                          fastq& read1,
290
                          fastq& read2);
291

292
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc