• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #106

28 Apr 2025 11:42AM UTC coverage: 66.997% (+0.07%) from 66.927%
#106

push

travis-ci

web-flow
gather phred<->p code and fix reported mott value (#132)

42 of 47 new or added lines in 6 files covered. (89.36%)

8 existing lines in 1 file now uncovered.

9732 of 14526 relevant lines covered (67.0%)

3044.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/adapter_id.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2015 Mikkel Schubert <mikkelsch@gmail.com>
3
#include "adapter_id.hpp" // declarations
4
#include "fastq_enc.hpp"  // for fastq_encoding
5
#include "utilities.hpp"  // for merge
6
#include <algorithm>      // for max, copy, min, reverse
7
#include <queue>          // for priority_queue
8
#include <sstream>        // for basic_ostringstream
9

10
namespace adapterremoval {
11

12
///////////////////////////////////////////////////////////////////////////////
13
// Utility functions
14

15
namespace {
16

17
using adapter_kmer = std::pair<size_t, uint32_t>;
18
using adapter_kmer_vec = std::vector<adapter_kmer>;
19

20
/**
21
 * Hashing function for string consisting of the chars "ACGT" (uppercase only).
22
 * Will return a unique number in the range 0 to 4^N - 1 for a given nucleotide
23
 * sequence. Passing characters other than "ACGT" (uppercase only) will result
24
 * in hash collisions.
25
 */
26
inline size_t
27
kmer_to_size_t(const std::string& kmer)
×
28
{
29
  size_t index = 0;
×
30
  for (const char c : kmer) {
×
31
    index = (index << 2) | ACGT::to_index(c);
×
32
  }
33

34
  return index;
×
35
}
36

37
/** Translates a hash generated using kmer_to_size_t into a NT sequence. */
38
inline std::string
39
size_t_to_kmer(size_t kmer)
×
40
{
41
  std::string kmer_s(consensus_adapter_stats::kmer_length, 'N');
×
42
  for (size_t i = 1; i <= consensus_adapter_stats::kmer_length; ++i) {
×
43
    kmer_s.at(consensus_adapter_stats::kmer_length - i) =
×
44
      ACGT::to_value(kmer & 0x3);
×
45
    kmer = kmer >> 2;
×
46
  }
47

48
  return kmer_s;
×
49
}
×
50

51
/** Functor for sorting kmers by frequency. */
52
struct cmp_nt_count
53
{
54
  bool operator()(const adapter_kmer& a, const adapter_kmer& b) const
×
55
  {
56
    return (a.second > b.second);
×
57
  }
58
};
59

60
using kmer_queue =
61
  std::priority_queue<adapter_kmer, adapter_kmer_vec, cmp_nt_count>;
62

63
/**
64
 * Takes an indexed_counts object, and returns a fastq sequence containing the
65
 * majority nucleotide at each position.The quality score of each base is
66
 * calculated as the proportion of the bases which match the majority nucleotide
67
 * (p = m / (N + 1)). If no majority nucleotide can be found 'N' is used
68
 * instead.
69
 */
70
fastq
71
build_consensus_sequence(const indexed_counts<ACGTN>& consensus)
×
72
{
73
  std::ostringstream sequence;
×
74
  std::ostringstream qualities;
×
75

76
  for (size_t i = 0; i < consensus.size(); ++i) {
×
77
    // Always assume one non-consensus observation; this is more reasonable
78
    // than allowing an error-rate of 0, especially for few observations.
79
    size_t total_count = 1;
×
80

81
    char best_nuc = 'N';
×
82
    size_t best_count = 0;
×
83
    for (const char nuc : ACGT::values) {
×
84
      const size_t cur_count = consensus.get(nuc, i);
×
85
      total_count += cur_count;
×
86

87
      if (cur_count > best_count) {
×
88
        best_nuc = nuc;
89
        best_count = cur_count;
90
      } else if (cur_count == best_count) {
×
91
        best_nuc = 'N';
×
92
      }
93
    }
94

95
    sequence << best_nuc;
×
96

97
    if (best_nuc == 'N') {
×
98
      qualities << static_cast<char>(PHRED_OFFSET_MIN);
×
99
    } else {
100
      const double pvalue = 1.0 - best_count / static_cast<double>(total_count);
×
NEW
101
      qualities << fastq_encoding::p_to_phred_33(pvalue);
×
102
    }
103
  }
104

105
  return { "consensus", sequence.str(), qualities.str() };
×
106
}
107

108
} // namespace
109

110
///////////////////////////////////////////////////////////////////////////////
111
// consensus_adapter
112

113
consensus_adapter::consensus_adapter(const indexed_counts<ACGTN>& consensus,
×
114
                                     const kmer_map& kmers,
115
                                     const size_t n_kmers)
×
116
  : m_adapter(build_consensus_sequence(consensus))
×
117
{
118
  kmer_queue queue;
×
119
  for (size_t i = 0; i < kmers.size(); ++i) {
×
120
    adapter_kmer value(i, kmers.at(i));
×
121
    m_total_kmers += value.second;
×
122

123
    if (queue.size() >= n_kmers) {
×
124
      // The top value will be the currently lowest value in the queue
125
      if (queue.top().second < value.second) {
×
126
        queue.pop();
×
127
        queue.push(value);
×
128
      }
129
    } else if (value.second) {
×
130
      queue.push(value);
×
131
    }
132
  }
133

134
  while (!queue.empty()) {
×
135
    const auto pair = queue.top();
×
136

137
    m_top_kmers.emplace_back(size_t_to_kmer(pair.first), pair.second);
×
138
    queue.pop();
×
139
  }
140

141
  std::reverse(m_top_kmers.begin(), m_top_kmers.end());
×
142
}
143

144
std::string
145
consensus_adapter::compare_with(const std::string& other) const
×
146
{
147
  const auto& adapter = m_adapter.sequence();
×
148
  const auto size = std::min(adapter.size(), other.size());
×
149

150
  std::ostringstream identity;
×
151
  for (size_t i = 0; i < size; ++i) {
×
152
    if (adapter.at(i) == 'N' || other.at(i) == 'N') {
×
153
      identity << '*';
×
154
    } else {
155
      identity << (adapter.at(i) == other.at(i) ? '|' : ' ');
×
156
    }
157
  }
158

159
  return identity.str();
×
160
}
161

162
///////////////////////////////////////////////////////////////////////////////
163
// consensus_adapter_stats
164

165
consensus_adapter_stats::consensus_adapter_stats(size_t max_length)
×
166
  : m_max_length(max_length)
×
167
  , m_kmers(kmer_count, 0)
×
168
{
169
}
170

171
consensus_adapter_stats&
172
consensus_adapter_stats::operator+=(const consensus_adapter_stats& other)
×
173
{
174
  m_consensus += other.m_consensus;
×
175
  merge(m_kmers, other.m_kmers);
×
176

177
  return *this;
×
178
}
179

180
void
181
consensus_adapter_stats::process(const std::string& sequence)
×
182
{
183
  const auto length = std::min(m_max_length, sequence.length());
×
184

185
  m_consensus.resize_up_to(length);
×
186
  for (size_t i = 0; i < length; ++i) {
×
187
    m_consensus.inc(sequence.at(i), i);
×
188
  }
189

190
  if (sequence.length() >= consensus_adapter_stats::kmer_length) {
×
191
    const std::string kmer =
×
192
      sequence.substr(0, consensus_adapter_stats::kmer_length);
×
193
    if (kmer.find('N') == std::string::npos) {
×
194
      m_kmers.at(kmer_to_size_t(kmer)) += 1;
×
195
    }
196
  }
197
}
198

199
consensus_adapter
200
consensus_adapter_stats::summarize(size_t n_kmers) const
×
201
{
202
  return { m_consensus, m_kmers, n_kmers };
×
203
}
204

205
adapter_id_statistics::adapter_id_statistics(size_t max_length)
×
206
  : adapter1(max_length)
×
207
  , adapter2(max_length)
×
208
  , aligned_pairs(0)
×
209
  , pairs_with_adapters(0)
×
210
{
211
}
212

213
/** Merge overall trimming_statistics, consensus, and k-mer counts. */
214
adapter_id_statistics&
215
adapter_id_statistics::operator+=(const adapter_id_statistics& other)
×
216
{
217
  adapter1 += other.adapter1;
×
218
  adapter2 += other.adapter2;
×
219
  aligned_pairs += other.aligned_pairs;
×
220
  pairs_with_adapters += other.pairs_with_adapters;
×
221

222
  return *this;
×
223
}
224

225
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc