MikkelSchubert/adapterremoval | Build 45 | src/alignment.hpp | Coveralls

1

/*************************************************************************\

2

 * AdapterRemoval - cleaning next-generation sequencing reads            *

3

 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *

4

 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *

5

 *                                                                       *

6

 * This program is free software: you can redistribute it and/or modify  *

7

 * it under the terms of the GNU General Public License as published by  *

8

 * the Free Software Foundation, either version 3 of the License, or     *

9

 * (at your option) any later version.                                   *

10

 *                                                                       *

11

 * This program is distributed in the hope that it will be useful,       *

12

 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *

13

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *

14

 * GNU General Public License for more details.                          *

15

 *                                                                       *

16

 * You should have received a copy of the GNU General Public License     *

17

 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *

18

\*************************************************************************/

19

#pragma once

20

21

#include "fastq_enc.hpp" // for MATE_SEPARATOR

22

#include "simd.hpp"      // for size_t, compare_subsequences_func, instru...

23

#include <string>        // for string

24

25

namespace adapterremoval {

26

27

enum class merge_strategy;

28

class adapter_set;

29

class fastq;

30

31

/**

32

 * Summarizes an alignment.

33

34

 * A single offset value is used to represent the alignment between two

35

 * sequences, with values ranging from -inf to seq1.len() - 1. The alignment

36

 * of the first base in each sequence against each other is defined as having

37

 * the offset 0, with each other offset defined as the relative position of

38

 * seq 2 base 0 to seq 1 base 0:

39

40

 * Seq 1:           aaaaaaaaa

41

 * seq 2:           bbbbbbbbbbb

42

 * Offset: 0

43

44

 * Seq 1:            aaaaaaaaa

45

 * Seq 2: bbbbbbbbbbb

46

 * Offset: -11

47

48

 * Seq 1:           aaaaaaaaa

49

 * Seq 2:                   bbbbbbbbbbb

50

 * Offset: 8

51

52

 * The meaning of the offset is slightly different in SE and PE mode; in SE

53

 * mode seq2 is the adapter sequence, and the offset therefore unambiguously

54

 * shows the starting position of the adapter, regardless of the size of the

55

 * adapter sequence.

56

57

 * In PE mode, while the alignment is between seq1+PCR2 and PCR1+seq2, the

58

 * offset returned is relative to to seq1 and seq2 only. Thus, if '1'

59

 * represents PCR1 and '2' represents PCR2, the following alignment results

60

 * from PE mode (ie. offset = -9 rather than 3):

61

62

 * Seq 1:    22222222222aaaaaaaaa

63

 * Seq 2:      bbbbbbbbbbb1111111111

64

 * Offset: -9

65

66

 * Note that the offset can never be greater than len(read 1) - 1, but can be

67

 * less than -len(seq 2) + 1, if a positive shift is set in PE mode. In PE

68

 * mode, an offset <= -len(seq 2) indicates that no non-adapter sequence was

69

 * found, while an offset <= 0 indicates the same for SE mode. Offsets less

70

 * than -len(seq 2) for PE or less than 0 for SE indicates that bases have been

71

 * skipped during sequencing, and are discoverable if a shift is set:

72

73

 * Read 1:           ...22222222222aaaaaaaaa

74

 * Read 2:             bbbbbbbbbbb1111111111...

75

 * Offset: -12

76

77

*/

79

80

/**

81

   * Returns true if this is a better alignment than other.

82

83

   * When selecting among multiple alignments, the follow criteria are used:

84

   * 1. The alignment with the highest score is preferred.

85

   * 2. If score is equal, the longest alignment is preferred.

86

   * 3. If score and length is equal, the alignment with fewest Ns is preferred.

87

*/

88

  bool is_better_than(const alignment_info& other) const;

89

90

/**

91

   * Truncates a SE read according to the alignment, such that the second read

92

   * used in the alignment (assumed to represent adapter sequence) is excluded

93

   * from the read passed to this function.

94

*/

95

  void truncate_single_end(fastq& read) const;

96

97

/**

98

   * Truncate a pair of PE reads, such that any adapter sequence inferred from

99

   * the alignment is excluded from both mates.

100

101

   * @return The number of sequences (0 .. 2) which contained adapter sequence.

102

*/

103

  size_t truncate_paired_end(fastq& read1, fastq& read2) const;

104

105

  /** Calculates the insert size given a pair of un-truncated reads. */

106

  size_t insert_size(const fastq& read1, const fastq& read2) const;

107

108

  /** Returns the score used to compare alignments */

110

113

114

115

  //! 0-based ID of best matching adapter or a negative value if not set.

116

  int adapter_id = -1;

117

  //! Zero based id of the adapter which offered the best alignment. Is less

118

  //! than zero if no alignment was found.

119

  int offset = 0;

120

  //! The number of base-pairs included in the alignment. This number

121

  //! includes both bases aligned between the two mates (in PE mode) and the

122

  //! number of bases aligned between mates and adapter sequences.

123

  size_t length = 0;

124

  //! Number of positions in the alignment in which the two sequences were

125

  //! both called (not N) but differed

126

  size_t n_mismatches = 0;

127

  //! Number of positions in the alignment where one or both bases were N.

128

  size_t n_ambiguous = 0;

129

};

130

132

133

public:

134

  explicit sequence_aligner(const adapter_set& adapters,

135

                            simd::instruction_set is);

136

137

/**

138

   * Attempts to align adapters sequences against a SE read.

139

140

   * @param read A read potentially containing adapter sequences

141

   * @param max_shift Allow up to this number of missing bases at the 5' end of

142

   *                  the read, when aligning the adapter.

143

   * @return The best alignment, or a length 0 alignment if not aligned.

144

145

   * The best alignment is selected using alignment_info::is_better_than.

146

*/

147

  alignment_info align_single_end(const fastq& read, int max_shift);

148

149

/**

150

   * Attempts to align PE mates, along with any adapter pairs.

151

152

   * @param read1 A mate 1 read potentially containing adapter sequences

153

   * @param read2 A mate 2 read potentially containing adapter sequences

154

   * @param max_shift Allow up to this number of missing bases at the 5' end of

155

   *                  both mate reads.

156

   * @return The best alignment, or a length 0 alignment if not aligned.

157

158

   * The alignment is carried out following the concatenation of adapter2 and

159

   * read1, and the concatenation of read2 and adapter1, resulting in this

160

   * alignment:

161

162

   *    adapter2-read1

163

   *    read2-adapter1

164

165

   * Note the returned offset is relative read1, not to adapter2 + read1,

166

   * and can be used to directly infer the alignment between read1 and read2.

167

*/

168

  alignment_info align_paired_end(const fastq& read1,

169

                                  const fastq& read2,

170

                                  int max_shift);

171

172

private:

173

/**

174

   * Perform pairwise alignment between two sequences.

175

176

   * @param alignment Current best alignment.

177

   * @param seq1 First sequence to align (mate 1).

178

   * @param seq1_len The (unpadded) length of seq1.

179

   * @param seq2 Second sequence to align (mate 2 or adapter).

180

   * @param seq2_len The (unpadded) length of seq2.

181

   * @param min_offset Search for alignments from this offset.

182

   * @param max_offset Search for alignments until and including this offset.

183

   * @return true if a better alignment was found

184

*/

185

  bool pairwise_align_sequences(alignment_info& alignment,

186

                                const char* seq1,

187

                                size_t seq1_len,

188

                                const char* seq2,

189

                                size_t seq2_len,

190

                                int min_offset,

191

                                int max_offset) const;

192

193

  /** Sets the user-facing adapter ID and prioritizes adapter sequences */

194

  void finalize_alignment(alignment_info& alignment, int max_offset);

195

196

  //! SIMD instruction set to use for sequence comparisons

197

  const simd::compare_subsequences_func m_compare_func;

198

  //! Padding required by chosen SIMD instructions

199

  const size_t m_padding;

200

  //! Internal buffer used to combine adapters and reads

201

  std::string m_buffer{};

202

203

  struct adapter_pair

204

205

    //! The original adapter ID

206

    int adapter_id = 0;

207

    //! The number of best alignments that included this adapter

208

    int hits = 0;

209

    //! The forward adapter sequence

210

    std::string_view adapter1{};

211

    //! The reverse adapter sequence

212

    std::string_view adapter2{};

213

};

214

215

  //! Vector of adapter IDs and number of matching alignments with that adapter

216

  std::vector<adapter_pair> m_adapters{};

217

};

218

219

/**

220

 * Class for merging two sequence fragments into a single sequence, either

221

 * picking the highest quality base and its associated quality score, or

222

 * recalculating the quality score of matching/mismatching bases using the

223

 * original AR methodology.

224

*/

225

class sequence_merger

226

227

public:

228

  sequence_merger();

229

230

/**

231

   * Sets the expected mate separator. This is used to trim mate numbers from

232

   * merged reads.

233

*/

234

  void set_mate_separator(char sep = MATE_SEPARATOR);

235

236

  /** Set the strategy used when merging bases. */

237

  void set_merge_strategy(merge_strategy strategy);

238

  /** Sets the maximum base quality score for recalculated scores. */

239

  void set_max_recalculated_score(char max);

240

241

/**

242

   * Merges two overlapping, trimmed reads into a single sequence. Bases and

243

   * quality scores are assigned based on the merge_strategy chosen.

244

   * The sequences are assumed to have been trimmed using the given alignment.

245

   * This function will produce undefined results if that is not the case!

246

*/

247

  void merge(const alignment_info& alignment, fastq& read1, const fastq& read2);

248

249

  /* Returns number of reads merged */

251

252

  /* Returns number of bases merged */

254

255

  /* Returns number of mismatches where a higher quality base was selected */

256

  size_t mismatches_resolved() const { return m_mismatches_resolved; }

257

258

  /* Returns number of mismatches where there was no higher quality base */

259

  size_t mismatches_unresolved() const { return m_mismatches_unresolved; }

260

261

private:

262

  //! Mate separator used in read names

263

  char m_mate_sep = MATE_SEPARATOR;

264

  //! Strategy used when merging reads

265

  merge_strategy m_merge_strategy;

266

  //! Maximum score when recalculating qualities in non-conservative mode

267

  char m_max_score = PHRED_OFFSET_MAX;

268

269

  //! Total number of reads merged

270

  size_t m_reads_merged = 0;

271

  //! The  number of bases merged

272

  size_t m_bases_merged = 0;

273

  //! The number of mismatches where a higher quality base was selected

274

  size_t m_mismatches_resolved = 0;

275

  //! The number of mismatches where there was no higher quality base

276

  size_t m_mismatches_unresolved = 0;

277

};

278

279

/**

280

 * Truncates reads such that only adapter sequence remains.

281

282

 * @return True if either or both reads contained adapter sequence.

283

284

 * Reads that do not contain any adapter sequence are completely truncated,

285

 * such no bases remain of the original sequence.

286

*/

287

bool

288

extract_adapter_sequences(const alignment_info& alignment,

289

                          fastq& read1,

290

                          fastq& read2);

291

292

} // namespace adapterremoval

MikkelSchubert / adapterremoval / #45

Source File
Press 'n' to go to next uncovered line, 'b' for previous

1	/*************************************************************************\
2	* AdapterRemoval - cleaning next-generation sequencing reads *
3	* Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk *
4	* Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com *
5	* *
6	* This program is free software: you can redistribute it and/or modify *
7	* it under the terms of the GNU General Public License as published by *
8	* the Free Software Foundation, either version 3 of the License, or *
9	* (at your option) any later version. *
10	* *
11	* This program is distributed in the hope that it will be useful, *
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14	* GNU General Public License for more details. *
15	* *
16	* You should have received a copy of the GNU General Public License *
17	* along with this program. If not, see <http://www.gnu.org/licenses/>. *
18	\*************************************************************************/
19	#pragma once
20
21	#include "fastq_enc.hpp" // for MATE_SEPARATOR
22	#include "simd.hpp" // for size_t, compare_subsequences_func, instru...
23	#include <string> // for string
24
25	namespace adapterremoval {
26
27	enum class merge_strategy;
28	class adapter_set;
29	class fastq;
30
31	/**
32	* Summarizes an alignment.
33	*
34	* A single offset value is used to represent the alignment between two
35	* sequences, with values ranging from -inf to seq1.len() - 1. The alignment
36	* of the first base in each sequence against each other is defined as having
37	* the offset 0, with each other offset defined as the relative position of
38	* seq 2 base 0 to seq 1 base 0:
39	*
40	* Seq 1: aaaaaaaaa
41	* seq 2: bbbbbbbbbbb
42	* Offset: 0
43	*
44	* Seq 1: aaaaaaaaa
45	* Seq 2: bbbbbbbbbbb
46	* Offset: -11
47	*
48	* Seq 1: aaaaaaaaa
49	* Seq 2: bbbbbbbbbbb
50	* Offset: 8
51	*
52	* The meaning of the offset is slightly different in SE and PE mode; in SE
53	* mode seq2 is the adapter sequence, and the offset therefore unambiguously
54	* shows the starting position of the adapter, regardless of the size of the
55	* adapter sequence.
56	*
57	* In PE mode, while the alignment is between seq1+PCR2 and PCR1+seq2, the
58	* offset returned is relative to to seq1 and seq2 only. Thus, if '1'
59	* represents PCR1 and '2' represents PCR2, the following alignment results
60	* from PE mode (ie. offset = -9 rather than 3):
61	*
62	* Seq 1: 22222222222aaaaaaaaa
63	* Seq 2: bbbbbbbbbbb1111111111
64	* Offset: -9
65	*
66	* Note that the offset can never be greater than len(read 1) - 1, but can be
67	* less than -len(seq 2) + 1, if a positive shift is set in PE mode. In PE
68	* mode, an offset <= -len(seq 2) indicates that no non-adapter sequence was
69	* found, while an offset <= 0 indicates the same for SE mode. Offsets less
70	* than -len(seq 2) for PE or less than 0 for SE indicates that bases have been
71	* skipped during sequencing, and are discoverable if a shift is set:
72	*
73	* Read 1: ...22222222222aaaaaaaaa
74	* Read 2: bbbbbbbbbbb1111111111...
75	* Offset: -12
76	*
77	*/
78	struct alignment_info	157✔
79	{
80	/**
81	* Returns true if this is a better alignment than other.
82	*
83	* When selecting among multiple alignments, the follow criteria are used:
84	* 1. The alignment with the highest score is preferred.
85	* 2. If score is equal, the longest alignment is preferred.
86	* 3. If score and length is equal, the alignment with fewest Ns is preferred.
87	*/
88	bool is_better_than(const alignment_info& other) const;
89
90	/**
91	* Truncates a SE read according to the alignment, such that the second read
92	* used in the alignment (assumed to represent adapter sequence) is excluded
93	* from the read passed to this function.
94	*/
95	void truncate_single_end(fastq& read) const;
96
97	/**
98	* Truncate a pair of PE reads, such that any adapter sequence inferred from
99	* the alignment is excluded from both mates.
100	*
101	* @return The number of sequences (0 .. 2) which contained adapter sequence.
102	*/
103	size_t truncate_paired_end(fastq& read1, fastq& read2) const;
104
105	/** Calculates the insert size given a pair of un-truncated reads. */
106	size_t insert_size(const fastq& read1, const fastq& read2) const;
107
108	/** Returns the score used to compare alignments */
109	inline int score() const	2,834✔
110	{
111	return static_cast<int>(length) -	2,834✔
112	static_cast<int>(n_ambiguous + 2 * n_mismatches);	2,834✔
113	}
114
115	//! 0-based ID of best matching adapter or a negative value if not set.
116	int adapter_id = -1;
117	//! Zero based id of the adapter which offered the best alignment. Is less
118	//! than zero if no alignment was found.
119	int offset = 0;
120	//! The number of base-pairs included in the alignment. This number
121	//! includes both bases aligned between the two mates (in PE mode) and the
122	//! number of bases aligned between mates and adapter sequences.
123	size_t length = 0;
124	//! Number of positions in the alignment in which the two sequences were
125	//! both called (not N) but differed
126	size_t n_mismatches = 0;
127	//! Number of positions in the alignment where one or both bases were N.
128	size_t n_ambiguous = 0;
129	};
130
131	class sequence_aligner	360✔
132	{
133	public:
134	explicit sequence_aligner(const adapter_set& adapters,
135	simd::instruction_set is);
136
137	/**
138	* Attempts to align adapters sequences against a SE read.
139	*
140	* @param read A read potentially containing adapter sequences
141	* @param max_shift Allow up to this number of missing bases at the 5' end of
142	* the read, when aligning the adapter.
143	* @return The best alignment, or a length 0 alignment if not aligned.
144	*
145	* The best alignment is selected using alignment_info::is_better_than.
146	*/
147	alignment_info align_single_end(const fastq& read, int max_shift);
148
149	/**
150	* Attempts to align PE mates, along with any adapter pairs.
151	*
152	* @param read1 A mate 1 read potentially containing adapter sequences
153	* @param read2 A mate 2 read potentially containing adapter sequences
154	* @param max_shift Allow up to this number of missing bases at the 5' end of
155	* both mate reads.
156	* @return The best alignment, or a length 0 alignment if not aligned.
157	*
158	* The alignment is carried out following the concatenation of adapter2 and
159	* read1, and the concatenation of read2 and adapter1, resulting in this
160	* alignment:
161	*
162	* adapter2-read1
163	* read2-adapter1
164	*
165	* Note the returned offset is relative read1, not to adapter2 + read1,
166	* and can be used to directly infer the alignment between read1 and read2.
167	*/
168	alignment_info align_paired_end(const fastq& read1,
169	const fastq& read2,
170	int max_shift);
171
172	private:
173	/**
174	* Perform pairwise alignment between two sequences.
175	*
176	* @param alignment Current best alignment.
177	* @param seq1 First sequence to align (mate 1).
178	* @param seq1_len The (unpadded) length of seq1.
179	* @param seq2 Second sequence to align (mate 2 or adapter).
180	* @param seq2_len The (unpadded) length of seq2.
181	* @param min_offset Search for alignments from this offset.
182	* @param max_offset Search for alignments until and including this offset.
183	* @return true if a better alignment was found
184	*/
185	bool pairwise_align_sequences(alignment_info& alignment,
186	const char* seq1,
187	size_t seq1_len,
188	const char* seq2,
189	size_t seq2_len,
190	int min_offset,
191	int max_offset) const;
192
193	/** Sets the user-facing adapter ID and prioritizes adapter sequences */
194	void finalize_alignment(alignment_info& alignment, int max_offset);
195
196	//! SIMD instruction set to use for sequence comparisons
197	const simd::compare_subsequences_func m_compare_func;
198	//! Padding required by chosen SIMD instructions
199	const size_t m_padding;
200	//! Internal buffer used to combine adapters and reads
201	std::string m_buffer{};
202
203	struct adapter_pair
204	{
205	//! The original adapter ID
206	int adapter_id = 0;
207	//! The number of best alignments that included this adapter
208	int hits = 0;
209	//! The forward adapter sequence
210	std::string_view adapter1{};
211	//! The reverse adapter sequence
212	std::string_view adapter2{};
213	};
214
215	//! Vector of adapter IDs and number of matching alignments with that adapter
216	std::vector<adapter_pair> m_adapters{};
217	};
218
219	/**
220	* Class for merging two sequence fragments into a single sequence, either
221	* picking the highest quality base and its associated quality score, or
222	* recalculating the quality score of matching/mismatching bases using the
223	* original AR methodology.
224	*/
225	class sequence_merger
226	{
227	public:
228	sequence_merger();
229
230	/**
231	* Sets the expected mate separator. This is used to trim mate numbers from
232	* merged reads.
233	*/
234	void set_mate_separator(char sep = MATE_SEPARATOR);
235
236	/** Set the strategy used when merging bases. */
237	void set_merge_strategy(merge_strategy strategy);
238	/** Sets the maximum base quality score for recalculated scores. */
239	void set_max_recalculated_score(char max);
240
241	/**
242	* Merges two overlapping, trimmed reads into a single sequence. Bases and
243	* quality scores are assigned based on the merge_strategy chosen.
244	* The sequences are assumed to have been trimmed using the given alignment.
245	* This function will produce undefined results if that is not the case!
246	*/
247	void merge(const alignment_info& alignment, fastq& read1, const fastq& read2);
248
249	/* Returns number of reads merged */
250	size_t reads_merged() const { return m_reads_merged; }	×
251
252	/* Returns number of bases merged */
253	size_t bases_merged() const { return m_bases_merged; }	×
254
255	/* Returns number of mismatches where a higher quality base was selected */
256	size_t mismatches_resolved() const { return m_mismatches_resolved; }
257
258	/* Returns number of mismatches where there was no higher quality base */
259	size_t mismatches_unresolved() const { return m_mismatches_unresolved; }
260
261	private:
262	//! Mate separator used in read names
263	char m_mate_sep = MATE_SEPARATOR;
264	//! Strategy used when merging reads
265	merge_strategy m_merge_strategy;
266	//! Maximum score when recalculating qualities in non-conservative mode
267	char m_max_score = PHRED_OFFSET_MAX;
268
269	//! Total number of reads merged
270	size_t m_reads_merged = 0;
271	//! The number of bases merged
272	size_t m_bases_merged = 0;
273	//! The number of mismatches where a higher quality base was selected
274	size_t m_mismatches_resolved = 0;
275	//! The number of mismatches where there was no higher quality base
276	size_t m_mismatches_unresolved = 0;
277	};
278
279	/**
280	* Truncates reads such that only adapter sequence remains.
281	*
282	* @return True if either or both reads contained adapter sequence.
283	*
284	* Reads that do not contain any adapter sequence are completely truncated,
285	* such no bases remain of the original sequence.
286	*/
287	bool
288	extract_adapter_sequences(const alignment_info& alignment,
289	fastq& read1,
290	fastq& read2);
291
292	} // namespace adapterremoval

MikkelSchubert / adapterremoval / #45

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous