• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #43

08 Sep 2024 08:47AM UTC coverage: 75.266% (-4.5%) from 79.763%
#43

push

travis-ci

MikkelSchubert
minor improvements to assert signatures and silence lints

1 of 1 new or added line in 1 file covered. (100.0%)

75 existing lines in 3 files now uncovered.

2404 of 3194 relevant lines covered (75.27%)

12788.78 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.62
/src/barcode_table.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "barcode_table.hpp"
21
#include "debug.hpp"         // for AR_REQUIRE
22
#include "errors.hpp"        // for parsing_error
23
#include "fastq.hpp"         // for fastq
24
#include "sequence_sets.hpp" // for sample_set
25
#include <algorithm>         // for min, max, sort
26
#include <utility>           // for pair
27

28
namespace adapterremoval {
29

30
using barcode_pair = std::pair<std::string, size_t>;
31
using barcode_vec = std::vector<barcode_pair>;
32

33
struct next_subsequence
34
{
35
  explicit next_subsequence(const char* seq_,
35✔
36
                            const size_t max_local_mismatches_)
37
    : seq(seq_)
70✔
38
    , max_local_mismatches(max_local_mismatches_)
35✔
39
  {
40
  }
41

42
  const char* seq;
43
  const size_t max_local_mismatches;
44
};
45

46
///////////////////////////////////////////////////////////////////////////////
47

48
demultiplexer_node::demultiplexer_node()
479✔
49
  : children()
1,916✔
50
  , value(barcode_table::no_match)
479✔
51
{
52
  children.fill(barcode_table::no_match);
958✔
53
}
54

55
barcode_table::candidate::candidate(int barcode_, size_t mismatches_)
620✔
UNCOV
56
  : barcode(barcode_)
×
57
  , mismatches(mismatches_)
620✔
58
{
59
}
60

61
///////////////////////////////////////////////////////////////////////////////
62

63
namespace {
64

65
/**
66
 * Returns a lexicographically sorted list of merged barcodes, each paired with
67
 * the 0-based index of corresponding barcode in the source vector.
68
 */
69
barcode_vec
70
sort_barcodes(const sequence_pair_vec& barcodes)
39✔
71
{
72
  AR_REQUIRE(!barcodes.empty());
78✔
73
  barcode_vec sorted_barcodes;
39✔
74

75
  const size_t key_1_len = barcodes.front().first.length();
78✔
76
  const size_t key_2_len = barcodes.front().second.length();
78✔
77
  for (auto it = barcodes.begin(); it != barcodes.end(); ++it) {
448✔
78
    if (it->first.length() != key_1_len) {
158✔
79
      throw parsing_error("mate 1 barcodes do not have the same length");
20✔
80
    } else if (it->second.length() != key_2_len) {
148✔
81
      throw parsing_error("mate 2 barcodes do not have the same length");
3✔
82
    }
83

84
    std::string barcode;
73✔
85
    barcode.reserve(key_1_len + key_2_len);
73✔
86
    barcode.append(it->first);
73✔
87
    barcode.append(it->second);
73✔
88

89
    sorted_barcodes.emplace_back(barcode, it - barcodes.begin());
219✔
90
  }
73✔
91

92
  std::sort(sorted_barcodes.begin(), sorted_barcodes.end());
99✔
93

94
  return sorted_barcodes;
33✔
95
}
6✔
96

97
/** Adds a nucleotide sequence with a given ID to a quad-tree. */
98
void
99
add_sequence_to_tree(demux_node_vec& tree,
67✔
100
                     const std::string& sequence,
101
                     const size_t barcode_id)
102
{
103
  size_t node_idx = 0;
67✔
104
  bool added_last_node = false;
67✔
105
  for (auto nuc : sequence) {
1,816✔
106
    auto& node = tree.at(node_idx);
1,032✔
107
    // Indicate when PE barcodes can be unambiguously identified from SE
108
    // reads
109
    node.value = (node.value == barcode_table::no_match)
516✔
110
                   ? barcode_id
111
                   : barcode_table::ambiguous;
112

113
    const auto nuc_idx = ACGT::to_index(nuc);
516✔
114
    auto child = node.children[nuc_idx];
1,032✔
115

116
    added_last_node = (child == barcode_table::no_match);
516✔
117
    if (added_last_node) {
516✔
118
      // New nodes are added to the end of the list; as barcodes are
119
      // added in lexicographic order, this helps ensure that a set of
120
      // similar barcodes will be placed in mostly contiguous runs
121
      // of the vector representation.
122
      child = node.children[nuc_idx] = tree.size();
1,338✔
123
      tree.emplace_back();
446✔
124
    }
125

126
    node_idx = child;
516✔
127
  }
128

129
  if (!added_last_node) {
67✔
130
    throw parsing_error(std::string("duplicate barcode (pair): ") + sequence);
10✔
131
  }
132

133
  tree.at(node_idx).value = barcode_id;
130✔
134
}
65✔
135

136
/**
137
 * Builds a sparse quad tree using the first sequence in a set of unique
138
 * barcodes pairs; duplicate pairs will negatively impact the identification of
139
 * these, since all hits will be considered ambiguous.
140
 */
141
demux_node_vec
142
build_demux_tree(const sequence_pair_vec& barcodes)
39✔
143
{
144
  // Step 1: Construct list of merged, sorted barcodes barcodes;
145
  //         this allows construction of the sparse tree in one pass.
146
  const barcode_vec sorted_barcodes = sort_barcodes(barcodes);
39✔
147

148
  // Step 2: Create empty tree containing just the root node; creating
149
  //         the root here simplifies the 'add_sequence_to_tree' function.
150
  demux_node_vec tree;
33✔
151
  tree.emplace_back();
33✔
152

153
  // Step 3: Add each barcode to the tree, in sorted order
154
  for (const auto& pair : sorted_barcodes) {
394✔
155
    add_sequence_to_tree(tree, pair.first, pair.second);
67✔
156
  }
157

158
  return tree;
31✔
159
}
33✔
160

161
/** Converts sample set to list of barcodes in input order */
162
sequence_pair_vec
UNCOV
163
build_barcode_table(const sample_set& samples)
×
164
{
UNCOV
165
  sequence_pair_vec barcodes;
×
UNCOV
166
  for (const auto& sample : samples) {
×
UNCOV
167
    for (const auto& it : sample) {
×
UNCOV
168
      barcodes.emplace_back(it.barcode_1, it.barcode_2);
×
169
    }
170
  }
171

UNCOV
172
  return barcodes;
×
UNCOV
173
}
×
174

175
} // namespace
176

177
///////////////////////////////////////////////////////////////////////////////
178

UNCOV
179
barcode_table::barcode_table(const sample_set& samples,
×
180
                             size_t max_mm,
181
                             size_t max_mm_r1,
UNCOV
182
                             size_t max_mm_r2)
×
UNCOV
183
  : barcode_table(build_barcode_table(samples), max_mm, max_mm_r1, max_mm_r2)
×
184
{
185
}
186

187
barcode_table::barcode_table(const sequence_pair_vec& barcodes,
40✔
188
                             size_t max_mm,
189
                             size_t max_mm_r1,
190
                             size_t max_mm_r2)
88✔
191
{
192
  m_max_mismatches = std::min<size_t>(max_mm, max_mm_r1 + max_mm_r2);
80✔
193
  m_max_mismatches_r1 = std::min<size_t>(m_max_mismatches, max_mm_r1);
80✔
194
  m_max_mismatches_r2 = std::min<size_t>(m_max_mismatches, max_mm_r2);
80✔
195

196
  if (!barcodes.empty()) {
80✔
197
    m_nodes = build_demux_tree(barcodes);
101✔
198
    m_barcode_1_len = barcodes.front().first.length();
93✔
199
    m_barcode_2_len = barcodes.front().second.length();
93✔
200
  }
201
}
40✔
202

203
int
204
barcode_table::identify(const fastq& read_r1) const
65✔
205
{
206
  if (read_r1.length() < m_barcode_1_len) {
130✔
207
    return barcode_table::no_match;
208
  }
209

210
  const std::string barcode = read_r1.sequence().substr(0, m_barcode_1_len);
126✔
211
  auto match = lookup(barcode.c_str(), 0, 0, nullptr);
126✔
212
  if (match.barcode == no_match && m_max_mismatches) {
63✔
213
    match = lookup_with_mm(
32✔
214
      barcode.c_str(), 0, m_max_mismatches, m_max_mismatches_r1, nullptr);
16✔
215
  }
216

217
  return match.barcode;
63✔
218
}
65✔
219

220
int
221
barcode_table::identify(const fastq& read_r1, const fastq& read_r2) const
96✔
222
{
223
  if (read_r1.length() < m_barcode_1_len ||
192✔
224
      read_r2.length() < m_barcode_2_len) {
184✔
225
    return no_match;
6✔
226
  }
227

228
  const auto barcode_1 = read_r1.sequence().substr(0, m_barcode_1_len);
180✔
229
  const auto barcode_2 = read_r2.sequence().substr(0, m_barcode_2_len);
180✔
230
  const auto combined_barcode = barcode_1 + barcode_2;
90✔
231

232
  auto match = lookup(combined_barcode.c_str(), 0, 0, nullptr);
180✔
233
  if (match.barcode == no_match && m_max_mismatches) {
90✔
234
    const next_subsequence next(barcode_2.c_str(), m_max_mismatches_r2);
70✔
235

236
    if (m_max_mismatches_r1) {
35✔
237
      match = lookup_with_mm(
54✔
238
        barcode_1.c_str(), 0, m_max_mismatches, m_max_mismatches_r1, &next);
27✔
239
    } else {
240
      match = lookup(barcode_1.c_str(), 0, m_max_mismatches, &next);
16✔
241
    }
242
  }
243

244
  return match.barcode;
90✔
245
}
276✔
246

247
barcode_table::candidate
248
barcode_table::lookup(const char* seq,
231✔
249
                      int parent,
250
                      const size_t max_global_mismatches,
251
                      const next_subsequence* next) const
252

253
{
254
  for (; *seq && parent != no_match; ++seq) {
1,149✔
255
    if (*seq == 'N') {
950✔
256
      return candidate(no_match);
64✔
257
    }
258

259
    const auto encoded_nuc = ACGT::to_index(*seq);
918✔
260
    const auto& node = m_nodes.at(parent);
1,836✔
261

262
    parent = node.children.at(encoded_nuc);
1,836✔
263
  }
264

265
  if (parent == no_match) {
199✔
266
    return candidate(no_match);
154✔
267
  } else if (next) {
122✔
268
    const auto max_local_mismatches =
17✔
269
      std::min(max_global_mismatches, next->max_local_mismatches);
34✔
270

271
    if (max_local_mismatches) {
17✔
272
      return lookup_with_mm(next->seq,
4✔
273
                            parent,
274
                            max_global_mismatches,
275
                            max_local_mismatches,
276
                            nullptr);
4✔
277
    } else {
278
      return lookup(next->seq, parent, max_global_mismatches, nullptr);
13✔
279
    }
280
  } else {
281
    const auto& node = m_nodes.at(parent);
210✔
282
    return candidate(node.value, m_max_mismatches - max_global_mismatches);
210✔
283
  }
284
}
285

286
barcode_table::candidate
287
barcode_table::lookup_with_mm(const char* seq,
208✔
288
                              int parent,
289
                              const size_t max_global_mismatches,
290
                              const size_t max_local_mismatches,
291
                              const next_subsequence* next) const
292

293
{
294
  const demultiplexer_node& node = m_nodes.at(parent);
416✔
295
  const auto nucleotide = *seq;
208✔
296

297
  if (nucleotide) {
208✔
298
    candidate best_candidate;
196✔
299

300
    for (size_t encoded_i = 0; encoded_i < 4; ++encoded_i) {
980✔
301
      const auto child = node.children.at(encoded_i);
1,568✔
302

303
      if (child != -1) {
784✔
304
        candidate current_candidate;
208✔
305

306
        if (nucleotide == ACGT::to_value(encoded_i)) {
208✔
307
          current_candidate = lookup_with_mm(
144✔
308
            seq + 1, child, max_global_mismatches, max_local_mismatches, next);
309
        } else if (max_local_mismatches == 1) {
64✔
310
          current_candidate =
55✔
311
            lookup(seq + 1, child, max_global_mismatches - 1, next);
110✔
312
        } else if (max_local_mismatches) {
9✔
313
          current_candidate = lookup_with_mm(seq + 1,
18✔
314
                                             child,
315
                                             max_global_mismatches - 1,
9✔
316
                                             max_local_mismatches - 1,
317
                                             next);
318
        }
319

320
        if (current_candidate.mismatches < best_candidate.mismatches) {
208✔
321
          best_candidate = current_candidate;
144✔
322
        } else if (current_candidate.mismatches == best_candidate.mismatches &&
64✔
323
                   current_candidate.barcode != no_match) {
59✔
324
          best_candidate.barcode = ambiguous;
6✔
325
        }
326
      }
327
    }
328

329
    return best_candidate;
196✔
330
  } else if (next) {
12✔
331
    const size_t next_max_local_mismatches =
10✔
332
      std::min(max_global_mismatches, next->max_local_mismatches);
20✔
333

334
    if (next_max_local_mismatches) {
10✔
335
      return lookup_with_mm(next->seq,
8✔
336
                            parent,
337
                            max_global_mismatches,
338
                            next_max_local_mismatches,
339
                            nullptr);
8✔
340
    } else {
341
      return lookup(next->seq, parent, max_global_mismatches, nullptr);
2✔
342
    }
343
  } else {
344
    return candidate(node.value, m_max_mismatches - max_global_mismatches);
4✔
345
  }
346
}
347

348
} // namespace adapterremoval
599✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc