• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #45

20 Sep 2024 06:49PM UTC coverage: 26.244% (-49.2%) from 75.443%
#45

push

travis-ci

web-flow
attempt to fix coveralls run

2458 of 9366 relevant lines covered (26.24%)

4362.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/adapter_id.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2015 by Mikkel Schubert - mikkelsch@gmail.com           *
5
 *                                                                       *
6
 * This program is free software: you can redistribute it and/or modify  *
7
 * it under the terms of the GNU General Public License as published by  *
8
 * the Free Software Foundation, either version 3 of the License, or     *
9
 * (at your option) any later version.                                   *
10
 *                                                                       *
11
 * This program is distributed in the hope that it will be useful,       *
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
14
 * GNU General Public License for more details.                          *
15
 *                                                                       *
16
 * You should have received a copy of the GNU General Public License     *
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
18
\*************************************************************************/
19
#include "adapter_id.hpp" // declarations
20
#include "utilities.hpp"  // for merge
21
#include <algorithm>      // for max, copy, min, reverse
22
#include <queue>          // for priority_queue
23
#include <sstream>        // for basic_ostringstream
24

25
namespace adapterremoval {
26

27
///////////////////////////////////////////////////////////////////////////////
28
// Utility functions
29

30
namespace {
31

32
using adapter_kmer = std::pair<size_t, uint32_t>;
33
using adapter_kmer_vec = std::vector<adapter_kmer>;
34

35
/**
36
 * Hashing function for string consisting of the chars "ACGT" (uppercase only).
37
 * Will return a unique number in the range 0 to 4^N - 1 for a given nucleotide
38
 * sequence. Passing characters other than "ACGT" (uppercase only) will result
39
 * in hash collisions.
40
 */
41
inline size_t
42
kmer_to_size_t(const std::string& kmer)
×
43
{
44
  size_t index = 0;
×
45
  for (const char c : kmer) {
×
46
    index = (index << 2) | ACGT::to_index(c);
×
47
  }
48

49
  return index;
×
50
}
51

52
/** Translates a hash generated using kmer_to_size_t into a NT sequence. */
53
inline std::string
54
size_t_to_kmer(size_t kmer)
×
55
{
56
  std::string kmer_s(consensus_adapter_stats::kmer_length, 'N');
×
57
  for (size_t i = 1; i <= consensus_adapter_stats::kmer_length; ++i) {
×
58
    kmer_s.at(consensus_adapter_stats::kmer_length - i) =
×
59
      ACGT::to_value(kmer & 0x3);
×
60
    kmer = kmer >> 2;
×
61
  }
62

63
  return kmer_s;
×
64
}
×
65

66
/** Functor for sorting kmers by frequency. */
67
struct cmp_nt_count
68
{
69
  bool operator()(const adapter_kmer& a, const adapter_kmer& b) const
×
70
  {
71
    return (a.second > b.second);
×
72
  }
73
};
74

75
using kmer_queue =
76
  std::priority_queue<adapter_kmer, adapter_kmer_vec, cmp_nt_count>;
77

78
/**
79
 * Takes an indexed_counts object, and returns a fastq sequence containing the
80
 * majority nucleotide at each position.The quality score of each base is
81
 * calculated as the proportion of the bases which match the majority nucleotide
82
 * (p = m / (N + 1)). If no majority nucleotide can be found 'N' is used
83
 * instead.
84
 */
85
fastq
86
build_consensus_sequence(const indexed_counts<ACGTN>& consensus)
×
87
{
88
  std::ostringstream sequence;
×
89
  std::ostringstream qualities;
×
90

91
  for (size_t i = 0; i < consensus.size(); ++i) {
×
92
    // Always assume one non-consensus observation; this is more reasonable
93
    // than allowing an error-rate of 0, especially for few observations.
94
    size_t total_count = 1;
×
95

96
    char best_nuc = 'N';
×
97
    size_t best_count = 0;
×
98
    for (const char nuc : ACGT::values) {
×
99
      const size_t cur_count = consensus.get(nuc, i);
×
100
      total_count += cur_count;
×
101

102
      if (cur_count > best_count) {
×
103
        best_nuc = nuc;
104
        best_count = cur_count;
105
      } else if (cur_count == best_count) {
×
106
        best_nuc = 'N';
×
107
      }
108
    }
109

110
    sequence << best_nuc;
×
111

112
    if (best_nuc == 'N') {
×
113
      qualities << static_cast<char>(PHRED_OFFSET_MIN);
×
114
    } else {
115
      const double pvalue = 1.0 - best_count / static_cast<double>(total_count);
×
116
      qualities << fastq::p_to_phred_33(pvalue);
×
117
    }
118
  }
119

120
  return { "consensus", sequence.str(), qualities.str() };
×
121
}
122

123
} // namespace
124

125
///////////////////////////////////////////////////////////////////////////////
126
// consensus_adapter
127

128
consensus_adapter::consensus_adapter(const indexed_counts<ACGTN>& consensus,
×
129
                                     const kmer_map& kmers,
130
                                     const size_t n_kmers)
×
131
  : m_adapter(build_consensus_sequence(consensus))
×
132
{
133
  kmer_queue queue;
×
134
  for (size_t i = 0; i < kmers.size(); ++i) {
×
135
    adapter_kmer value(i, kmers.at(i));
×
136
    m_total_kmers += value.second;
×
137

138
    if (queue.size() >= n_kmers) {
×
139
      // The top value will be the currently lowest value in the queue
140
      if (queue.top().second < value.second) {
×
141
        queue.pop();
×
142
        queue.push(value);
×
143
      }
144
    } else if (value.second) {
×
145
      queue.push(value);
×
146
    }
147
  }
148

149
  while (!queue.empty()) {
×
150
    const auto pair = queue.top();
×
151

152
    m_top_kmers.emplace_back(size_t_to_kmer(pair.first), pair.second);
×
153
    queue.pop();
×
154
  }
155

156
  std::reverse(m_top_kmers.begin(), m_top_kmers.end());
×
157
}
158

159
std::string
160
consensus_adapter::compare_with(const std::string& other) const
×
161
{
162
  const auto& adapter = m_adapter.sequence();
×
163
  const auto size = std::min(adapter.size(), other.size());
×
164

165
  std::ostringstream identity;
×
166
  for (size_t i = 0; i < size; ++i) {
×
167
    if (adapter.at(i) == 'N' || other.at(i) == 'N') {
×
168
      identity << '*';
×
169
    } else {
170
      identity << (adapter.at(i) == other.at(i) ? '|' : ' ');
×
171
    }
172
  }
173

174
  return identity.str();
×
175
}
176

177
///////////////////////////////////////////////////////////////////////////////
178
// consensus_adapter_stats
179

180
consensus_adapter_stats::consensus_adapter_stats(size_t max_length)
×
181
  : m_max_length(max_length)
×
182
  , m_kmers(kmer_count, 0)
×
183
{
184
}
185

186
consensus_adapter_stats&
187
consensus_adapter_stats::operator+=(const consensus_adapter_stats& other)
×
188
{
189
  m_consensus += other.m_consensus;
×
190
  merge(m_kmers, other.m_kmers);
×
191

192
  return *this;
×
193
}
194

195
void
196
consensus_adapter_stats::process(const std::string& sequence)
×
197
{
198
  const auto length = std::min(m_max_length, sequence.length());
×
199

200
  m_consensus.resize_up_to(length);
×
201
  for (size_t i = 0; i < length; ++i) {
×
202
    m_consensus.inc(sequence.at(i), i);
×
203
  }
204

205
  if (sequence.length() >= consensus_adapter_stats::kmer_length) {
×
206
    const std::string kmer =
×
207
      sequence.substr(0, consensus_adapter_stats::kmer_length);
×
208
    if (kmer.find('N') == std::string::npos) {
×
209
      m_kmers.at(kmer_to_size_t(kmer)) += 1;
×
210
    }
211
  }
212
}
213

214
consensus_adapter
215
consensus_adapter_stats::summarize(size_t n_kmers) const
×
216
{
217
  return { m_consensus, m_kmers, n_kmers };
×
218
}
219

220
adapter_id_statistics::adapter_id_statistics(size_t max_length)
×
221
  : adapter1(max_length)
×
222
  , adapter2(max_length)
×
223
  , aligned_pairs(0)
×
224
  , pairs_with_adapters(0)
×
225
{
226
}
227

228
/** Merge overall trimming_statistics, consensus, and k-mer counts. */
229
adapter_id_statistics&
230
adapter_id_statistics::operator+=(const adapter_id_statistics& other)
×
231
{
232
  adapter1 += other.adapter1;
×
233
  adapter2 += other.adapter2;
×
234
  aligned_pairs += other.aligned_pairs;
×
235
  pairs_with_adapters += other.pairs_with_adapters;
×
236

237
  return *this;
×
238
}
239

240
} // namespace adapterremoval
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc