MikkelSchubert / adapterremoval / #36

Committed 22 Jul 2024 09:33AM UTC coverage: 87.26% (-12.7%) from 100.0%

Build # #36

Build Type

push

travis-ci

Committed by

MikkelSchubert

Commit Message

remove duplicate tests

Run Details

2185 of 2504 relevant lines covered (87.26%)

16293.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.07

/src/alignment.cpp

/*************************************************************************\
 * AdapterRemoval - cleaning next-generation sequencing reads            *
 *                                                                       *
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
 *                                                                       *
 * This program is free software: you can redistribute it and/or modify  *
 * it under the terms of the GNU General Public License as published by  *
 * the Free Software Foundation, either version 3 of the License, or     *
 * (at your option) any later version.                                   *
 *                                                                       *
 * This program is distributed in the hope that it will be useful,       *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 * GNU General Public License for more details.                          *
 *                                                                       *
 * You should have received a copy of the GNU General Public License     *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
\*************************************************************************/
#include "alignment.hpp"
#include "alignment_tables.hpp" // for DIFFERENT_NTS, IDENTICAL_NTS
#include "commontypes.hpp"      // for merge_strategy
#include "debug.hpp"            // for AR_REQUIRE
#include "fastq.hpp"            // for fastq, fastq_pair_vec
#include "simd.hpp"             // for size_t, get_compare_subsequences_func
#include <algorithm>            // for max, min
#include <array>                // for array
#include <string>               // for string, operator+
#include <utility>              // for swap, pair
#include <vector>               // for vector

namespace adapterremoval {

bool
sequence_aligner::pairwise_align_sequences(alignment_info& alignment,
                                           const char* seq1,
                                           const size_t seq1_len,
                                           const char* seq2,
                                           const size_t seq2_len,
                                           const int min_offset) const
{
  int offset =
    // The alignment must involve at least one base from seq2,
    std::max(std::max(min_offset, -static_cast<int>(seq2_len) + 1),
             // but there's no point aligning pairs too short to matter. This
             // currently only applies to --adapter-list mode.
             alignment.score() - static_cast<int>(seq2_len));
  int end_offset = static_cast<int>(seq1_len) - std::max(1, alignment.score());

  bool alignment_found = false;
  for (; offset <= end_offset; ++offset) {
    size_t initial_seq1_offset;
    size_t initial_seq2_offset;

    if (offset < 0) {
      initial_seq1_offset = 0;
      initial_seq2_offset = -offset;
    } else {
      initial_seq1_offset = offset;
      initial_seq2_offset = 0;
    }

    const auto length =
      std::min(seq1_len - initial_seq1_offset, seq2_len - initial_seq2_offset);

    alignment_info current;
    current.offset = offset;
    current.length = length;

    AR_REQUIRE(static_cast<int>(length) >= alignment.score());
    if (m_compare_func(current.n_mismatches,
                       current.n_ambiguous,
                       seq1 + initial_seq1_offset,
                       seq2 + initial_seq2_offset,
                       length,
                       length - alignment.score()) &&
        current.is_better_than(alignment)) {
      alignment = current;
      alignment_found = true;

      // Alignments involving fewer than `score` bases are not interesting
      end_offset = static_cast<int>(seq1_len) - alignment.score();
    }
  }

  return alignment_found;
}

struct phred_scores
{
  static phred_scores update(char qual_1, char qual_2)
  {
    AR_REQUIRE(qual_1 >= qual_2);

    const auto phred_1 = static_cast<size_t>(qual_1 - PHRED_OFFSET_MIN);
    const auto phred_2 = static_cast<size_t>(qual_2 - PHRED_OFFSET_MIN);
    const size_t index = (phred_1 * (PHRED_SCORE_MAX + 1)) + phred_2;

    return { IDENTICAL_NTS.at(index), DIFFERENT_NTS.at(index) };
  }

  //! Phred score to assign if the two nucleotides are identical
  char identical_nts;
  //! Phred score to assign if the two nucleotides differ
  char different_nts;
};

///////////////////////////////////////////////////////////////////////////////
// Public functions

bool
alignment_info::is_better_than(const alignment_info& other) const
{
  if (score() > other.score()) {
    return true;
  } else if (score() == other.score()) {
    if (length > other.length) {
      return true;
    } else if (length == other.length) {
      return n_ambiguous < other.n_ambiguous;
    }
  }

  return false;
}

void
alignment_info::truncate_single_end(fastq& read) const
{
  // Given a shift, the alignment of the adapter may start one or more
  // bases before the start of the sequence, leading to a negative offset
  const size_t len = std::max<int>(0, offset);

  return read.truncate(0, len);
}

size_t
alignment_info::truncate_paired_end(fastq& read1, fastq& read2) const
{
  size_t had_adapter = 0;
  const size_t isize = insert_size(read1, read2);

  if (offset >= 0) {
    // Read1 can potentially extend past read2, but by definition read2
    // cannot extend past read1 when the offset is not negative, so there
    // is no need to edit read2.
    had_adapter += isize < read1.length();
    read1.truncate(0, isize);
  } else {
    had_adapter += isize < read1.length();
    had_adapter += isize < read2.length();

    read1.truncate(0, isize);
    read2.truncate(read2.length() - isize);
  }

  return had_adapter;
}

size_t
alignment_info::insert_size(const fastq& read1, const fastq& read2) const
{
  AR_REQUIRE(offset <= static_cast<int>(read1.length()));

  return std::max<int>(0, static_cast<int>(read2.length()) + offset);
}

////////////////////////////////////////////////////////////////////////////////
// Implementations for `sequence_aligner`

sequence_aligner::sequence_aligner(const fastq_pair_vec& adapters,
                                   simd::instruction_set is)
  : m_adapters(adapters)
  , m_compare_func(simd::get_compare_subsequences_func(is))
  , m_padding(simd::padding(is))
  , m_max_adapter_len_1()
  , m_max_adapter_len_2()
{
  for (const auto& it : m_adapters) {
    m_max_adapter_len_1 = std::max(m_max_adapter_len_1, it.first.length());
    m_max_adapter_len_2 = std::max(m_max_adapter_len_2, it.second.length());
  }
}

alignment_info
sequence_aligner::align_single_end(const fastq& read, int max_shift) const
{
  int adapter_id = 0;
  alignment_info alignment;

  std::string buffer;
  buffer.reserve(m_max_adapter_len_1 + read.length() + 2 * m_padding);

  for (const auto& adapter_pair : m_adapters) {
    const auto& adapter = adapter_pair.first.sequence();

    buffer.clear();
    buffer += read.sequence();
    buffer.resize(buffer.size() + m_padding, 'N');
    buffer += adapter;
    buffer.resize(buffer.size() + m_padding, 'N');

    const char* read_data = buffer.data();
    const char* adapter_data = buffer.data() + read.length() + m_padding;

    if (pairwise_align_sequences(alignment,
                                 read_data,
                                 read.length(),
                                 adapter_data,
                                 adapter.length(),
                                 -max_shift)) {
      alignment.adapter_id = adapter_id;
    }

    ++adapter_id;
  }

  return alignment;
}

alignment_info
sequence_aligner::align_paired_end(const fastq& read1,
                                   const fastq& read2,
                                   int max_shift) const
{
  int adapter_id = 0;
  alignment_info alignment;

  std::string buffer;
  buffer.reserve(m_max_adapter_len_2 + read1.length() + m_padding +
                 m_max_adapter_len_1 + read2.length() + m_padding);

  for (const auto& adapter_pair : m_adapters) {
    const fastq& adapter1 = adapter_pair.first;
    const fastq& adapter2 = adapter_pair.second;

    buffer.clear();
    buffer += adapter2.sequence();
    buffer += read1.sequence();
    buffer.append(m_padding, 'N');
    buffer += read2.sequence();
    buffer += adapter1.sequence();
    buffer.append(m_padding, 'N');

    const char* sequence1 = buffer.data();
    const size_t sequence1_len = adapter2.length() + read1.length();
    const char* sequence2 = buffer.data() + sequence1_len + m_padding;
    const size_t sequence2_len = adapter1.length() + read2.length();

    // Only consider alignments where at least one nucleotide from each read
    // is aligned against the other, included shifted alignments to account
    // for missing bases at the 5' ends of the reads.
    const int min_offset = adapter2.length() - read2.length() - max_shift;

    if (pairwise_align_sequences(alignment,
                                 sequence1,
                                 sequence1_len,
                                 sequence2,
                                 sequence2_len,
                                 min_offset)) {
      alignment.adapter_id = adapter_id;
      // Convert the alignment into an alignment between read 1 & 2 only
      alignment.offset -= adapter2.length();
    }

    adapter_id++;
  }

  return alignment;
}

void
strip_mate_info(std::string& header, const char mate_sep)
{
  size_t pos = header.find_first_of(' ');
  if (pos == std::string::npos) {
    pos = header.length();
  }

  if (pos >= 2 && header.at(pos - 2) == mate_sep) {
    const char digit = header.at(pos - 1);

    if (digit == '1' || digit == '2') {
      header.erase(pos - 2, 2);
    }
  }
}

sequence_merger::sequence_merger()
  : m_merge_strategy(merge_strategy::conservative)
{
}

void
sequence_merger::set_mate_separator(char sep)
{
  m_mate_sep = sep;
}

void
sequence_merger::set_merge_strategy(merge_strategy strategy)
{
  m_merge_strategy = strategy;
}

void
sequence_merger::set_max_recalculated_score(char max)
{
  m_max_score = max + '!';
}

void
sequence_merger::set_rng(std::mt19937* rng)
{
  m_rng = rng;
}

void
sequence_merger::merge(const alignment_info& alignment,
                       fastq& read1,
                       const fastq& read2)
{
  AR_REQUIRE(m_merge_strategy != merge_strategy::none);
  AR_REQUIRE((m_merge_strategy == merge_strategy::original) == !!m_rng);

  // Gap between the two reads is not allowed
  AR_REQUIRE(alignment.offset <= static_cast<int>(read1.length()));

  // Offset to the first base overlapping read 2
  const auto read_1_offset = static_cast<size_t>(std::max(0, alignment.offset));
  // Offset to the last base overlapping read 1
  const size_t read_2_offset =
    static_cast<int>(read1.length()) - std::max(0, alignment.offset);

  AR_REQUIRE(read1.length() - read_1_offset == read_2_offset);

  // Produce draft by merging r1 and the parts of r2 that extend past r1
  read1.m_sequence.append(read2.sequence(), read_2_offset);
  read1.m_qualities.append(read2.qualities(), read_2_offset);
  AR_REQUIRE(read1.m_sequence.length() == read1.m_qualities.length());

  // Pick the best bases for the overlapping part of the reads
  for (size_t i = 0; i < read_2_offset; ++i) {
    char& nt_1 = read1.m_sequence.at(i + read_1_offset);
    char& qual_1 = read1.m_qualities.at(i + read_1_offset);
    const char nt_2 = read2.sequence().at(i);
    const char qual_2 = read2.qualities().at(i);

    if (m_merge_strategy == merge_strategy::conservative) {
      conservative_merge(nt_1, qual_1, nt_2, qual_2);
    } else {
      original_merge(nt_1, qual_1, nt_2, qual_2);
    }
  }

  m_reads_merged += 2;
  m_bases_merged += read_2_offset;

  // Remove mate number from read, if present
  if (m_mate_sep) {
    strip_mate_info(read1.m_header, m_mate_sep);
  }
}

void
sequence_merger::original_merge(char& nt_1,
                                char& qual_1,
                                char nt_2,
                                char qual_2)
{
  if (nt_1 == 'N' || nt_2 == 'N') {
    // If one of the bases are N, then we suppose that we just have (at
    // most) a single read at that site and choose that.
    if (nt_1 == 'N' && nt_2 == 'N') {
      qual_1 = PHRED_OFFSET_MIN;
    } else if (nt_1 == 'N') {
      nt_1 = nt_2;
      qual_1 = qual_2;
    }
  } else if (nt_1 != nt_2 && qual_1 == qual_2) {
    if (m_rng) {
      nt_1 = ((*m_rng)() & 1) ? nt_1 : nt_2;

      const phred_scores& new_scores = phred_scores::update(qual_1, qual_2);
      qual_1 = new_scores.different_nts;
    } else {
      nt_1 = 'N';
      qual_1 = PHRED_OFFSET_MIN;
    }

    m_mismatches_unresolved++;
  } else {
    // Ensure that nt_1 / qual_1 always contains the preferred nt / score
    // This is an assumption of the g_updated_phred_scores cache.
    if (qual_1 < qual_2) {
      std::swap(nt_1, nt_2);
      std::swap(qual_1, qual_2);
    }

    const phred_scores& new_scores = phred_scores::update(qual_1, qual_2);

    if (nt_1 == nt_2) {
      qual_1 = new_scores.identical_nts;
    } else {
      qual_1 = new_scores.different_nts;
      m_mismatches_resolved++;
    }

    qual_1 = std::min<char>(m_max_score, qual_1);
  }
}

void
sequence_merger::conservative_merge(char& nt_1,
                                    char& qual_1,
                                    const char nt_2,
                                    const char qual_2)
{
  if (nt_2 == 'N' || nt_1 == 'N') {
    if (nt_1 == 'N' && nt_2 == 'N') {
      qual_1 = PHRED_OFFSET_MIN;
    } else if (nt_1 == 'N') {
      nt_1 = nt_2;
      qual_1 = qual_2;
    }
  } else if (nt_1 == nt_2) {
    qual_1 = std::max(qual_1, qual_2);
  } else {
    if (qual_1 < qual_2) {
      nt_1 = nt_2;
      qual_1 = qual_2 - qual_1 + PHRED_OFFSET_MIN;
      m_mismatches_resolved++;
    } else if (qual_1 > qual_2) {
      qual_1 = qual_1 - qual_2 + PHRED_OFFSET_MIN;
      m_mismatches_resolved++;
    } else {
      // No way to reasonably pick a base
      nt_1 = 'N';
      qual_1 = PHRED_OFFSET_MIN;
      m_mismatches_unresolved++;
    }
  }
}

bool
extract_adapter_sequences(const alignment_info& alignment,
                          fastq& read1,
                          fastq& read2)
{
  AR_REQUIRE(alignment.offset <= static_cast<int>(read1.length()));
  const int template_length =
    std::max(0, static_cast<int>(read2.length()) + alignment.offset);

  read1.truncate(std::min<size_t>(read1.length(), template_length));
  read2.truncate(
    0, std::max<int>(0, static_cast<int>(read2.length()) - template_length));

  return read1.sequence().length() || read2.sequence().length();
}

} // namespace adapterremoval

1	/*************************************************************************\
2	* AdapterRemoval - cleaning next-generation sequencing reads *
3	* *
4	* Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk *
5	* Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com *
6	* *
7	* This program is free software: you can redistribute it and/or modify *
8	* it under the terms of the GNU General Public License as published by *
9	* the Free Software Foundation, either version 3 of the License, or *
10	* (at your option) any later version. *
11	* *
12	* This program is distributed in the hope that it will be useful, *
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15	* GNU General Public License for more details. *
16	* *
17	* You should have received a copy of the GNU General Public License *
18	* along with this program. If not, see <http://www.gnu.org/licenses/>. *
19	\*************************************************************************/
20	#include "alignment.hpp"
21	#include "alignment_tables.hpp" // for DIFFERENT_NTS, IDENTICAL_NTS
22	#include "commontypes.hpp" // for merge_strategy
23	#include "debug.hpp" // for AR_REQUIRE
24	#include "fastq.hpp" // for fastq, fastq_pair_vec
25	#include "simd.hpp" // for size_t, get_compare_subsequences_func
26	#include <algorithm> // for max, min
27	#include <array> // for array
28	#include <string> // for string, operator+
29	#include <utility> // for swap, pair
30	#include <vector> // for vector
31
32	namespace adapterremoval {
33
34	bool
35	sequence_aligner::pairwise_align_sequences(alignment_info& alignment,	123✔
36	const char* seq1,
37	const size_t seq1_len,
38	const char* seq2,
39	const size_t seq2_len,
40	const int min_offset) const
41	{
42	int offset =	123✔
43	// The alignment must involve at least one base from seq2,
44	std::max(std::max(min_offset, -static_cast<int>(seq2_len) + 1),	246✔
45	// but there's no point aligning pairs too short to matter. This
46	// currently only applies to --adapter-list mode.
47	alignment.score() - static_cast<int>(seq2_len));	369✔
48	int end_offset = static_cast<int>(seq1_len) - std::max(1, alignment.score());	369✔
49
50	bool alignment_found = false;	123✔
51	for (; offset <= end_offset; ++offset) {	984✔
52	size_t initial_seq1_offset;	861✔
53	size_t initial_seq2_offset;	861✔
54
55	if (offset < 0) {	861✔
56	initial_seq1_offset = 0;	96✔
57	initial_seq2_offset = -offset;	96✔
58	} else {
59	initial_seq1_offset = offset;	765✔
60	initial_seq2_offset = 0;	765✔
61	}
62
63	const auto length =	861✔
64	std::min(seq1_len - initial_seq1_offset, seq2_len - initial_seq2_offset);	1,722✔
65
66	alignment_info current;	861✔
67	current.offset = offset;	861✔
68	current.length = length;	861✔
69
70	AR_REQUIRE(static_cast<int>(length) >= alignment.score());	1,722✔
71	if (m_compare_func(current.n_mismatches,	1,722✔
72	current.n_ambiguous,
73	seq1 + initial_seq1_offset,
74	seq2 + initial_seq2_offset,
75	length,
76	length - alignment.score()) &&	2,583✔
77	current.is_better_than(alignment)) {	1,040✔
78	alignment = current;	126✔
79	alignment_found = true;	126✔
80
81	// Alignments involving fewer than `score` bases are not interesting
82	end_offset = static_cast<int>(seq1_len) - alignment.score();	252✔
83	}
84	}
85
86	return alignment_found;	123✔
87	}
88
89	struct phred_scores
90	{
91	static phred_scores update(char qual_1, char qual_2)	38✔
92	{
93	AR_REQUIRE(qual_1 >= qual_2);	38✔
94
95	const auto phred_1 = static_cast<size_t>(qual_1 - PHRED_OFFSET_MIN);	38✔
96	const auto phred_2 = static_cast<size_t>(qual_2 - PHRED_OFFSET_MIN);	38✔
97	const size_t index = (phred_1 * (PHRED_SCORE_MAX + 1)) + phred_2;	38✔
98
99	return { IDENTICAL_NTS.at(index), DIFFERENT_NTS.at(index) };	114✔
100	}
101
102	//! Phred score to assign if the two nucleotides are identical
103	char identical_nts;
104	//! Phred score to assign if the two nucleotides differ
105	char different_nts;
106	};
107
108	///////////////////////////////////////////////////////////////////////////////
109	// Public functions
110
111	bool
112	alignment_info::is_better_than(const alignment_info& other) const	179✔
113	{
114	if (score() > other.score()) {	537✔
115	return true;
116	} else if (score() == other.score()) {	213✔
117	if (length > other.length) {	69✔
118	return true;
119	} else if (length == other.length) {	51✔
120	return n_ambiguous < other.n_ambiguous;	45✔
121	}
122	}
123
124	return false;
125	}
126
127	void
128	alignment_info::truncate_single_end(fastq& read) const	48✔
129	{
130	// Given a shift, the alignment of the adapter may start one or more
131	// bases before the start of the sequence, leading to a negative offset
132	const size_t len = std::max<int>(0, offset);	96✔
133
134	return read.truncate(0, len);	48✔
135	}
136
137	size_t
138	alignment_info::truncate_paired_end(fastq& read1, fastq& read2) const	49✔
139	{
140	size_t had_adapter = 0;	49✔
141	const size_t isize = insert_size(read1, read2);	49✔
142
143	if (offset >= 0) {	48✔
144	// Read1 can potentially extend past read2, but by definition read2
145	// cannot extend past read1 when the offset is not negative, so there
146	// is no need to edit read2.
147	had_adapter += isize < read1.length();	36✔
148	read1.truncate(0, isize);	36✔
149	} else {
150	had_adapter += isize < read1.length();	12✔
151	had_adapter += isize < read2.length();	12✔
152
153	read1.truncate(0, isize);	12✔
154	read2.truncate(read2.length() - isize);	24✔
155	}
156
157	return had_adapter;	48✔
158	}
159
160	size_t
161	alignment_info::insert_size(const fastq& read1, const fastq& read2) const	49✔
162	{
163	AR_REQUIRE(offset <= static_cast<int>(read1.length()));	102✔
164
165	return std::max<int>(0, static_cast<int>(read2.length()) + offset);	144✔
166	}
167
168	////////////////////////////////////////////////////////////////////////////////
169	// Implementations for `sequence_aligner`
170
171	sequence_aligner::sequence_aligner(const fastq_pair_vec& adapters,	114✔
172	simd::instruction_set is)	114✔
173	: m_adapters(adapters)	114✔
174	, m_compare_func(simd::get_compare_subsequences_func(is))	114✔
175	, m_padding(simd::padding(is))	114✔
176	, m_max_adapter_len_1()	114✔
177	, m_max_adapter_len_2()	114✔
178	{
179	for (const auto& it : m_adapters) {	948✔
180	m_max_adapter_len_1 = std::max(m_max_adapter_len_1, it.first.length());	369✔
181	m_max_adapter_len_2 = std::max(m_max_adapter_len_2, it.second.length());	369✔
182	}
183	}	114✔
184
185	alignment_info
186	sequence_aligner::align_single_end(const fastq& read, int max_shift) const	78✔
187	{
188	int adapter_id = 0;	78✔
189	alignment_info alignment;	78✔
190
191	std::string buffer;	78✔
192	buffer.reserve(m_max_adapter_len_1 + read.length() + 2 * m_padding);	156✔
193
194	for (const auto& adapter_pair : m_adapters) {	573✔
195	const auto& adapter = adapter_pair.first.sequence();	174✔
196
197	buffer.clear();	87✔
198	buffer += read.sequence();	174✔
199	buffer.resize(buffer.size() + m_padding, 'N');	174✔
200	buffer += adapter;	87✔
201	buffer.resize(buffer.size() + m_padding, 'N');	174✔
202
203	const char* read_data = buffer.data();	87✔
204	const char* adapter_data = buffer.data() + read.length() + m_padding;	261✔
205
206	if (pairwise_align_sequences(alignment,	174✔
207	read_data,
208	read.length(),	87✔
209	adapter_data,
210	adapter.length(),	87✔
211	-max_shift)) {
212	alignment.adapter_id = adapter_id;	66✔
213	}
214
215	++adapter_id;	87✔
216	}
217
218	return alignment;	156✔
219	}	78✔
220
221	alignment_info
222	sequence_aligner::align_paired_end(const fastq& read1,	36✔
223	const fastq& read2,
224	int max_shift) const
225	{
226	int adapter_id = 0;	36✔
227	alignment_info alignment;	36✔
228
229	std::string buffer;	36✔
230	buffer.reserve(m_max_adapter_len_2 + read1.length() + m_padding +	108✔
231	m_max_adapter_len_1 + read2.length() + m_padding);	72✔
232
233	for (const auto& adapter_pair : m_adapters) {	252✔
234	const fastq& adapter1 = adapter_pair.first;	36✔
235	const fastq& adapter2 = adapter_pair.second;	36✔
236
237	buffer.clear();	36✔
238	buffer += adapter2.sequence();	72✔
239	buffer += read1.sequence();	72✔
240	buffer.append(m_padding, 'N');	36✔
241	buffer += read2.sequence();	72✔
242	buffer += adapter1.sequence();	72✔
243	buffer.append(m_padding, 'N');	36✔
244
245	const char* sequence1 = buffer.data();	36✔
246	const size_t sequence1_len = adapter2.length() + read1.length();	72✔
247	const char* sequence2 = buffer.data() + sequence1_len + m_padding;	72✔
248	const size_t sequence2_len = adapter1.length() + read2.length();	72✔
249
250	// Only consider alignments where at least one nucleotide from each read
251	// is aligned against the other, included shifted alignments to account
252	// for missing bases at the 5' ends of the reads.
253	const int min_offset = adapter2.length() - read2.length() - max_shift;	72✔
254
255	if (pairwise_align_sequences(alignment,	36✔
256	sequence1,
257	sequence1_len,
258	sequence2,
259	sequence2_len,
260	min_offset)) {
261	alignment.adapter_id = adapter_id;	33✔
262	// Convert the alignment into an alignment between read 1 & 2 only
263	alignment.offset -= adapter2.length();	66✔
264	}
265
266	adapter_id++;	36✔
267	}
268
269	return alignment;	72✔
270	}	36✔
271
272	void
273	strip_mate_info(std::string& header, const char mate_sep)	33✔
274	{
275	size_t pos = header.find_first_of(' ');	33✔
276	if (pos == std::string::npos) {	33✔
277	pos = header.length();	62✔
278	}
279
280	if (pos >= 2 && header.at(pos - 2) == mate_sep) {	66✔
281	const char digit = header.at(pos - 1);	12✔
282
283	if (digit == '1' \|\| digit == '2') {	6✔
284	header.erase(pos - 2, 2);	6✔
285	}
286	}
287	}	33✔
288
289	sequence_merger::sequence_merger()	35✔
290	: m_merge_strategy(merge_strategy::conservative)	35✔
291	{
292	}	35✔
293
294	void
295	sequence_merger::set_mate_separator(char sep)	2✔
296	{
297	m_mate_sep = sep;	2✔
298	}	2✔
299
300	void
301	sequence_merger::set_merge_strategy(merge_strategy strategy)	19✔
302	{
303	m_merge_strategy = strategy;	19✔
304	}	19✔
305
306	void
307	sequence_merger::set_max_recalculated_score(char max)	×
308	{
309	m_max_score = max + '!';	×
310	}
311
312	void
313	sequence_merger::set_rng(std::mt19937* rng)	2✔
314	{
315	m_rng = rng;	2✔
316	}	2✔
317
318	void
319	sequence_merger::merge(const alignment_info& alignment,	35✔
320	fastq& read1,
321	const fastq& read2)
322	{
323	AR_REQUIRE(m_merge_strategy != merge_strategy::none);	35✔
324	AR_REQUIRE((m_merge_strategy == merge_strategy::original) == !!m_rng);	35✔
325
326	// Gap between the two reads is not allowed
327	AR_REQUIRE(alignment.offset <= static_cast<int>(read1.length()));	78✔
328
329	// Offset to the first base overlapping read 2
330	const auto read_1_offset = static_cast<size_t>(std::max(0, alignment.offset));	66✔
331	// Offset to the last base overlapping read 1
332	const size_t read_2_offset =	33✔
333	static_cast<int>(read1.length()) - std::max(0, alignment.offset);	99✔
334
335	AR_REQUIRE(read1.length() - read_1_offset == read_2_offset);	66✔
336
337	// Produce draft by merging r1 and the parts of r2 that extend past r1
338	read1.m_sequence.append(read2.sequence(), read_2_offset);	66✔
339	read1.m_qualities.append(read2.qualities(), read_2_offset);	66✔
340	AR_REQUIRE(read1.m_sequence.length() == read1.m_qualities.length());	99✔
341
342	// Pick the best bases for the overlapping part of the reads
343	for (size_t i = 0; i < read_2_offset; ++i) {	171✔
344	char& nt_1 = read1.m_sequence.at(i + read_1_offset);	276✔
345	char& qual_1 = read1.m_qualities.at(i + read_1_offset);	276✔
346	const char nt_2 = read2.sequence().at(i);	414✔
347	const char qual_2 = read2.qualities().at(i);	414✔
348
349	if (m_merge_strategy == merge_strategy::conservative) {	138✔
350	conservative_merge(nt_1, qual_1, nt_2, qual_2);	66✔
351	} else {
352	original_merge(nt_1, qual_1, nt_2, qual_2);	72✔
353	}
354	}
355
356	m_reads_merged += 2;	33✔
357	m_bases_merged += read_2_offset;	33✔
358
359	// Remove mate number from read, if present
360	if (m_mate_sep) {	33✔
361	strip_mate_info(read1.m_header, m_mate_sep);	33✔
362	}
363	}	33✔
364
365	void
366	sequence_merger::original_merge(char& nt_1,	72✔
367	char& qual_1,
368	char nt_2,
369	char qual_2)
370	{
371	if (nt_1 == 'N' \|\| nt_2 == 'N') {	72✔
372	// If one of the bases are N, then we suppose that we just have (at
373	// most) a single read at that site and choose that.
374	if (nt_1 == 'N' && nt_2 == 'N') {	33✔
375	qual_1 = PHRED_OFFSET_MIN;	2✔
376	} else if (nt_1 == 'N') {	31✔
377	nt_1 = nt_2;	4✔
378	qual_1 = qual_2;	4✔
379	}
380	} else if (nt_1 != nt_2 && qual_1 == qual_2) {	39✔
381	if (m_rng) {	3✔
382	nt_1 = ((*m_rng)() & 1) ? nt_1 : nt_2;	2✔
383
384	const phred_scores& new_scores = phred_scores::update(qual_1, qual_2);	2✔
385	qual_1 = new_scores.different_nts;	2✔
386	} else {
387	nt_1 = 'N';	1✔
388	qual_1 = PHRED_OFFSET_MIN;	1✔
389	}
390
391	m_mismatches_unresolved++;	3✔
392	} else {	3✔
393	// Ensure that nt_1 / qual_1 always contains the preferred nt / score
394	// This is an assumption of the g_updated_phred_scores cache.
395	if (qual_1 < qual_2) {	36✔
396	std::swap(nt_1, nt_2);	8✔
397	std::swap(qual_1, qual_2);	8✔
398	}
399
400	const phred_scores& new_scores = phred_scores::update(qual_1, qual_2);	36✔
401
402	if (nt_1 == nt_2) {	36✔
403	qual_1 = new_scores.identical_nts;	32✔
404	} else {
405	qual_1 = new_scores.different_nts;	4✔
406	m_mismatches_resolved++;	4✔
407	}
408
409	qual_1 = std::min<char>(m_max_score, qual_1);	72✔
410	}
411	}	72✔
412
413	void
414	sequence_merger::conservative_merge(char& nt_1,	66✔
415	char& qual_1,
416	const char nt_2,
417	const char qual_2)
418	{
419	if (nt_2 == 'N' \|\| nt_1 == 'N') {	66✔
420	if (nt_1 == 'N' && nt_2 == 'N') {	33✔
421	qual_1 = PHRED_OFFSET_MIN;	2✔
422	} else if (nt_1 == 'N') {	31✔
423	nt_1 = nt_2;	4✔
424	qual_1 = qual_2;	4✔
425	}
426	} else if (nt_1 == nt_2) {	33✔
427	qual_1 = std::max(qual_1, qual_2);	56✔
428	} else {
429	if (qual_1 < qual_2) {	5✔
430	nt_1 = nt_2;	2✔
431	qual_1 = qual_2 - qual_1 + PHRED_OFFSET_MIN;	2✔
432	m_mismatches_resolved++;	2✔
433	} else if (qual_1 > qual_2) {	3✔
434	qual_1 = qual_1 - qual_2 + PHRED_OFFSET_MIN;	2✔
435	m_mismatches_resolved++;	2✔
436	} else {
437	// No way to reasonably pick a base
438	nt_1 = 'N';	1✔
439	qual_1 = PHRED_OFFSET_MIN;	1✔
440	m_mismatches_unresolved++;	1✔
441	}
442	}
443	}	66✔
444
445	bool
446	extract_adapter_sequences(const alignment_info& alignment,	12✔
447	fastq& read1,
448	fastq& read2)
449	{
450	AR_REQUIRE(alignment.offset <= static_cast<int>(read1.length()));	24✔
451	const int template_length =	12✔
452	std::max(0, static_cast<int>(read2.length()) + alignment.offset);	36✔
453
454	read1.truncate(std::min<size_t>(read1.length(), template_length));	36✔
455	read2.truncate(	12✔
456	0, std::max<int>(0, static_cast<int>(read2.length()) - template_length));	36✔
457
458	return read1.sequence().length() \|\| read2.sequence().length();	57✔
459	}
460
461	} // namespace adapterremoval

MikkelSchubert / adapterremoval / #36

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous