• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #36

22 Jul 2024 09:33AM UTC coverage: 87.26% (-12.7%) from 100.0%
#36

push

travis-ci

MikkelSchubert
remove duplicate tests

2185 of 2504 relevant lines covered (87.26%)

16293.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.56
/src/barcode_table.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "barcode_table.hpp"
21
#include "debug.hpp"  // for AR_REQUIRE
22
#include "errors.hpp" // for parsing_error
23
#include <algorithm>  // for min, max, sort
24
#include <utility>    // for pair
25

26
namespace adapterremoval {
27

28
using barcode_pair = std::pair<std::string, size_t>;
29
using barcode_vec = std::vector<barcode_pair>;
30

31
struct next_subsequence
32
{
33
  explicit next_subsequence(const char* seq_,
35✔
34
                            const size_t max_local_mismatches_)
35
    : seq(seq_)
70✔
36
    , max_local_mismatches(max_local_mismatches_)
35✔
37
  {
38
  }
39

40
  const char* seq;
41
  const size_t max_local_mismatches;
42
};
43

44
///////////////////////////////////////////////////////////////////////////////
45

46
demultiplexer_node::demultiplexer_node()
×
47
  : children()
1,916✔
48
  , value(barcode_table::no_match)
479✔
49
{
50
  children.fill(barcode_table::no_match);
479✔
51
}
52

53
barcode_table::candidate::candidate(int barcode_, size_t mismatches_)
620✔
54
  : barcode(barcode_)
×
55
  , mismatches(mismatches_)
620✔
56
{
57
}
58

59
///////////////////////////////////////////////////////////////////////////////
60

61
/**
62
 * Returns a lexicographically sorted list of merged barcodes, each paired with
63
 * the 0-based index of corresponding barcode in the source vector.
64
 */
65
barcode_vec
66
sort_barcodes(const fastq_pair_vec& barcodes)
39✔
67
{
68
  AR_REQUIRE(!barcodes.empty());
78✔
69
  barcode_vec sorted_barcodes;
39✔
70

71
  const size_t max_key_1_len = barcodes.front().first.length();
78✔
72
  const size_t max_key_2_len = barcodes.front().second.length();
78✔
73
  for (auto it = barcodes.begin(); it != barcodes.end(); ++it) {
448✔
74
    if (it->first.length() != max_key_1_len) {
158✔
75
      throw parsing_error("mate 1 barcodes do not have the same length");
10✔
76
    } else if (it->second.length() != max_key_2_len) {
148✔
77
      throw parsing_error("mate 2 barcodes do not have the same length");
3✔
78
    }
79

80
    std::string barcode;
73✔
81
    barcode.reserve(max_key_1_len + max_key_2_len);
73✔
82
    barcode.append(it->first.sequence());
146✔
83
    barcode.append(it->second.sequence());
146✔
84

85
    sorted_barcodes.emplace_back(barcode, it - barcodes.begin());
219✔
86
  }
73✔
87

88
  std::sort(sorted_barcodes.begin(), sorted_barcodes.end());
99✔
89

90
  return sorted_barcodes;
33✔
91
}
6✔
92

93
/** Adds a nucleotide sequence with a given ID to a quad-tree. */
94
void
95
add_sequence_to_tree(demux_node_vec& tree,
67✔
96
                     const std::string& sequence,
97
                     const size_t barcode_id)
98
{
99
  size_t node_idx = 0;
67✔
100
  bool added_last_node = false;
67✔
101
  for (auto nuc : sequence) {
1,816✔
102
    auto& node = tree.at(node_idx);
1,032✔
103
    // Indicate when PE barcodes can be unambiguously identified from SE
104
    // reads
105
    node.value = (node.value == barcode_table::no_match)
516✔
106
                   ? barcode_id
107
                   : barcode_table::ambiguous;
108

109
    const auto nuc_idx = ACGT::to_index(nuc);
516✔
110
    auto child = node.children[nuc_idx];
1,032✔
111

112
    added_last_node = (child == barcode_table::no_match);
516✔
113
    if (added_last_node) {
516✔
114
      // New nodes are added to the end of the list; as barcodes are
115
      // added in lexicographic order, this helps ensure that a set of
116
      // similar barcodes will be placed in mostly contiguous runs
117
      // of the vector representation.
118
      child = node.children[nuc_idx] = tree.size();
1,338✔
119
      tree.push_back(demultiplexer_node());
1,338✔
120
    }
121

122
    node_idx = child;
516✔
123
  }
124

125
  if (!added_last_node) {
67✔
126
    throw parsing_error(std::string("duplicate barcode (pair): ") + sequence);
10✔
127
  }
128

129
  tree.at(node_idx).value = barcode_id;
130✔
130
}
65✔
131

132
/**
133
 * Builds a sparse quad tree using the first sequence in a set of unique
134
 * barcodes pairs; duplicate pairs will negatively impact the identification of
135
 * these, since all hits will be considered ambiguous.
136
 */
137
demux_node_vec
138
build_demux_tree(const fastq_pair_vec& barcodes)
39✔
139
{
140
  // Step 1: Construct list of merged, sorted barcodes barcodes;
141
  //         this allows construction of the sparse tree in one pass.
142
  const barcode_vec sorted_barcodes = sort_barcodes(barcodes);
39✔
143

144
  // Step 2: Create empty tree containing just the root node; creating
145
  //         the root here simplifies the 'add_sequence_to_tree' function.
146
  demux_node_vec tree;
33✔
147
  tree.push_back(demultiplexer_node());
99✔
148

149
  // Step 3: Add each barcode to the tree, in sorted order
150
  for (const auto& pair : sorted_barcodes) {
394✔
151
    add_sequence_to_tree(tree, pair.first, pair.second);
67✔
152
  }
153

154
  return tree;
31✔
155
}
33✔
156

157
///////////////////////////////////////////////////////////////////////////////
158

159
const int barcode_table::no_match;
160
const int barcode_table::ambiguous;
161

162
barcode_table::barcode_table(const fastq_pair_vec& barcodes,
40✔
163
                             size_t max_mm,
164
                             size_t max_mm_r1,
165
                             size_t max_mm_r2)
88✔
166
{
167
  m_max_mismatches = std::min<size_t>(max_mm, max_mm_r1 + max_mm_r2);
80✔
168
  m_max_mismatches_r1 = std::min<size_t>(m_max_mismatches, max_mm_r1);
80✔
169
  m_max_mismatches_r2 = std::min<size_t>(m_max_mismatches, max_mm_r2);
80✔
170

171
  if (!barcodes.empty()) {
80✔
172
    m_nodes = build_demux_tree(barcodes);
101✔
173
    m_barcode_1_len = barcodes.front().first.length();
93✔
174
    m_barcode_2_len = barcodes.front().second.length();
93✔
175
  }
176
}
40✔
177

178
int
179
barcode_table::identify(const fastq& read_r1) const
65✔
180
{
181
  if (read_r1.length() < m_barcode_1_len) {
130✔
182
    return barcode_table::no_match;
183
  }
184

185
  const std::string barcode = read_r1.sequence().substr(0, m_barcode_1_len);
126✔
186
  auto match = lookup(barcode.c_str(), 0, 0, nullptr);
126✔
187
  if (match.barcode == no_match && m_max_mismatches) {
63✔
188
    match = lookup_with_mm(
32✔
189
      barcode.c_str(), 0, m_max_mismatches, m_max_mismatches_r1, nullptr);
16✔
190
  }
191

192
  return match.barcode;
63✔
193
}
65✔
194

195
int
196
barcode_table::identify(const fastq& read_r1, const fastq& read_r2) const
96✔
197
{
198
  if (read_r1.length() < m_barcode_1_len ||
192✔
199
      read_r2.length() < m_barcode_2_len) {
184✔
200
    return no_match;
6✔
201
  }
202

203
  const auto barcode_1 = read_r1.sequence().substr(0, m_barcode_1_len);
180✔
204
  const auto barcode_2 = read_r2.sequence().substr(0, m_barcode_2_len);
180✔
205
  const auto combined_barcode = barcode_1 + barcode_2;
90✔
206

207
  auto match = lookup(combined_barcode.c_str(), 0, 0, nullptr);
180✔
208
  if (match.barcode == no_match && m_max_mismatches) {
90✔
209
    const next_subsequence next(barcode_2.c_str(), m_max_mismatches_r2);
70✔
210

211
    if (m_max_mismatches_r1) {
35✔
212
      match = lookup_with_mm(
54✔
213
        barcode_1.c_str(), 0, m_max_mismatches, m_max_mismatches_r1, &next);
27✔
214
    } else {
215
      match = lookup(barcode_1.c_str(), 0, m_max_mismatches, &next);
16✔
216
    }
217
  }
218

219
  return match.barcode;
90✔
220
}
276✔
221

222
barcode_table::candidate
223
barcode_table::lookup(const char* seq,
231✔
224
                      int parent,
225
                      const size_t max_global_mismatches,
226
                      const next_subsequence* next) const
227

228
{
229
  for (; *seq && parent != no_match; ++seq) {
1,149✔
230
    if (*seq == 'N') {
950✔
231
      return candidate(no_match);
64✔
232
    }
233

234
    const auto encoded_nuc = ACGT::to_index(*seq);
918✔
235
    const auto& node = m_nodes.at(parent);
1,836✔
236

237
    parent = node.children.at(encoded_nuc);
1,836✔
238
  }
239

240
  if (parent == no_match) {
199✔
241
    return candidate(no_match);
154✔
242
  } else if (next) {
122✔
243
    const auto max_local_mismatches =
17✔
244
      std::min(max_global_mismatches, next->max_local_mismatches);
34✔
245

246
    if (max_local_mismatches) {
17✔
247
      return lookup_with_mm(next->seq,
4✔
248
                            parent,
249
                            max_global_mismatches,
250
                            max_local_mismatches,
251
                            nullptr);
4✔
252
    } else {
253
      return lookup(next->seq, parent, max_global_mismatches, nullptr);
13✔
254
    }
255
  } else {
256
    const auto& node = m_nodes.at(parent);
210✔
257
    return candidate(node.value, m_max_mismatches - max_global_mismatches);
210✔
258
  }
259
}
260

261
barcode_table::candidate
262
barcode_table::lookup_with_mm(const char* seq,
208✔
263
                              int parent,
264
                              const size_t max_global_mismatches,
265
                              const size_t max_local_mismatches,
266
                              const next_subsequence* next) const
267

268
{
269
  const demultiplexer_node& node = m_nodes.at(parent);
416✔
270
  const auto nucleotide = *seq;
208✔
271

272
  if (nucleotide) {
208✔
273
    candidate best_candidate;
196✔
274

275
    for (size_t encoded_i = 0; encoded_i < 4; ++encoded_i) {
980✔
276
      const auto child = node.children.at(encoded_i);
1,568✔
277

278
      if (child != -1) {
784✔
279
        candidate current_candidate;
208✔
280

281
        if (nucleotide == ACGT::to_value(encoded_i)) {
208✔
282
          current_candidate = lookup_with_mm(
144✔
283
            seq + 1, child, max_global_mismatches, max_local_mismatches, next);
284
        } else if (max_local_mismatches == 1) {
64✔
285
          current_candidate =
55✔
286
            lookup(seq + 1, child, max_global_mismatches - 1, next);
110✔
287
        } else if (max_local_mismatches) {
9✔
288
          current_candidate = lookup_with_mm(seq + 1,
18✔
289
                                             child,
290
                                             max_global_mismatches - 1,
9✔
291
                                             max_local_mismatches - 1,
292
                                             next);
293
        }
294

295
        if (current_candidate.mismatches < best_candidate.mismatches) {
208✔
296
          best_candidate = current_candidate;
144✔
297
        } else if (current_candidate.mismatches == best_candidate.mismatches &&
64✔
298
                   current_candidate.barcode != no_match) {
59✔
299
          best_candidate.barcode = ambiguous;
6✔
300
        }
301
      }
302
    }
303

304
    return best_candidate;
196✔
305
  } else if (next) {
12✔
306
    const size_t next_max_local_mismatches =
10✔
307
      std::min(max_global_mismatches, next->max_local_mismatches);
20✔
308

309
    if (next_max_local_mismatches) {
10✔
310
      return lookup_with_mm(next->seq,
8✔
311
                            parent,
312
                            max_global_mismatches,
313
                            next_max_local_mismatches,
314
                            nullptr);
8✔
315
    } else {
316
      return lookup(next->seq, parent, max_global_mismatches, nullptr);
2✔
317
    }
318
  } else {
319
    return candidate(node.value, m_max_mismatches - max_global_mismatches);
4✔
320
  }
321
}
322

323
} // namespace adapterremoval
1,078✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc