• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #46

27 Nov 2024 03:10PM UTC coverage: 27.245% (+1.0%) from 26.244%
#46

push

travis-ci

MikkelSchubert
fix convenience executable make target

2609 of 9576 relevant lines covered (27.25%)

4268.73 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.2
/src/barcode_table.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "barcode_table.hpp" // declarations
21
#include "debug.hpp"         // for AR_REQUIRE
22
#include "errors.hpp"        // for parsing_error
23
#include "fastq.hpp"         // for fastq
24
#include "sequence_sets.hpp" // for sample_set
25
#include <algorithm>         // for min, max, sort
26
#include <utility>           // for pair
27

28
namespace adapterremoval {
29

30
struct barcode_match
31
{
32
  barcode_match() = default;
109✔
33

34
  explicit barcode_match(const barcode_key& key_, uint32_t mismatches_ = -1)
107✔
35
    : key(key_)
107✔
36
    , mismatches(mismatches_)
107✔
37
  {
38
  }
39

40
  barcode_key key{};
41
  uint32_t mismatches = -1;
42
};
43

44
struct next_subsequence
45
{
46
  explicit next_subsequence(const char* seq_,
47
                            const size_t max_local_mismatches_)
48
    : seq(seq_)
49
    , max_local_mismatches(max_local_mismatches_)
50
  {
51
  }
52

53
  const char* seq;
54
  const size_t max_local_mismatches;
55
};
56

57
///////////////////////////////////////////////////////////////////////////////
58

59
barcode_node::barcode_node()
465✔
60
  : children()
1,860✔
61
  , key()
465✔
62
{
63
  children.fill(barcode_key::unidentified);
465✔
64
}
465✔
65

66
///////////////////////////////////////////////////////////////////////////////
67

68
namespace {
69

70
/** Adds a nucleotide sequence with a given ID to a quad-tree. */
71
void
72
add_sequence_to_tree(std::vector<barcode_node>& tree,
63✔
73
                     const std::string& sequence,
74
                     const barcode_key key)
75
{
76
  size_t node_idx = 0;
63✔
77
  bool added_last_node = false;
63✔
78
  for (auto nuc : sequence) {
1,728✔
79
    auto& node = tree.at(node_idx);
492✔
80
    // Indicate when PE barcodes can be unambiguously identified from SE reads
81
    if (node.key.sample == barcode_key::unidentified) {
492✔
82
      node.key.sample = key.sample;
402✔
83
      node.key.barcode = key.barcode;
402✔
84
    } else if (node.key.sample == key.sample) {
90✔
85
      node.key.barcode = barcode_key::ambiguous;
×
86
    } else {
87
      node.key.sample = barcode_key::ambiguous;
90✔
88
      node.key.barcode = barcode_key::ambiguous;
90✔
89
    }
90

91
    const auto nuc_idx = ACGT::to_index(nuc);
492✔
92
    auto child = node.children[nuc_idx];
492✔
93

94
    added_last_node = (child == barcode_key::unidentified);
492✔
95
    if (added_last_node) {
492✔
96
      // New nodes are added to the end of the list; as barcodes are added in
97
      // lexicographic order, this helps ensure that a set of similar barcodes
98
      // will be placed in mostly contiguous runs of the vector representation.
99
      child = node.children[nuc_idx] = tree.size();
868✔
100
      tree.emplace_back();
434✔
101
    }
102

103
    node_idx = child;
492✔
104
  }
105

106
  if (!added_last_node) {
63✔
107
    throw parsing_error(std::string("duplicate barcode(s): ") + sequence);
×
108
  }
109

110
  tree.at(node_idx).key = key;
63✔
111
}
63✔
112

113
barcode_vec
114
build_barcode_vec(const sample_set& samples)
31✔
115
{
116
  int32_t sample_i = 0;
31✔
117
  barcode_vec barcodes;
31✔
118
  for (const auto& sample : samples) {
313✔
119
    AR_REQUIRE(!sample.name().empty());
189✔
120

121
    int32_t barcode_i = 0;
63✔
122
    for (const auto& sequences : sample) {
504✔
123
      barcodes.emplace_back(
63✔
124
        sequence_pair{ sequences.barcode_1, sequences.barcode_2 },
126✔
125
        barcode_key{ sample_i, barcode_i++ });
126✔
126
    }
127

128
    sample_i++;
63✔
129
  }
130

131
  std::sort(barcodes.begin(), barcodes.end());
93✔
132

133
  return barcodes;
31✔
134
}
×
135

136
} // namespace
137

138
///////////////////////////////////////////////////////////////////////////////
139

140
barcode_table::barcode_table(const sample_set& samples,
31✔
141
                             size_t max_mm,
142
                             size_t max_mm_r1,
143
                             size_t max_mm_r2)
62✔
144
{
145
  AR_REQUIRE(samples.size());
62✔
146
  m_max_mismatches = std::min<size_t>(max_mm, max_mm_r1 + max_mm_r2);
62✔
147
  m_max_mismatches_r1 = std::min<size_t>(m_max_mismatches, max_mm_r1);
62✔
148
  m_max_mismatches_r2 = std::min<size_t>(m_max_mismatches, max_mm_r2);
62✔
149

150
  // Flatten and lexigraphically sort barcodes to simplify tree building
151
  auto barcodes = build_barcode_vec(samples);
31✔
152

153
  // Create empty tree containing just the root node; creating the root here
154
  // simplifies the 'add_sequence_to_tree' function.
155
  m_nodes.emplace_back();
31✔
156

157
  const auto& front = barcodes.front().first;
31✔
158
  m_barcode_1_len = front.first.length();
62✔
159
  m_barcode_2_len = front.second.length();
62✔
160

161
  // Step 3: Add each barcode to the tree, in sorted order
162
  for (const auto& [sequences, key] : barcodes) {
502✔
163
    AR_REQUIRE(m_barcode_1_len && sequences.first.length() == m_barcode_1_len &&
189✔
164
               sequences.second.length() == m_barcode_2_len);
165

166
    std::string barcode;
63✔
167
    barcode.reserve(m_barcode_1_len + m_barcode_2_len);
63✔
168
    barcode.append(sequences.first);
63✔
169
    barcode.append(sequences.second);
63✔
170

171
    add_sequence_to_tree(m_nodes, barcode, key);
63✔
172
  }
63✔
173
}
62✔
174

175
barcode_key
176
barcode_table::identify(const fastq& read_r1) const
65✔
177
{
178
  if (read_r1.length() < m_barcode_1_len) {
130✔
179
    return barcode_key{};
2✔
180
  }
181

182
  const std::string barcode = read_r1.sequence().substr(0, m_barcode_1_len);
126✔
183
  auto match = lookup(barcode.c_str(), 0, 0, nullptr);
126✔
184
  if (match.key.sample == barcode_key::unidentified && m_max_mismatches_r1) {
63✔
185
    match = lookup_with_mm(
32✔
186
      barcode.c_str(), 0, m_max_mismatches_r1, m_max_mismatches_r1, nullptr);
16✔
187
  }
188

189
  return match.key;
63✔
190
}
65✔
191

192
barcode_key
193
barcode_table::identify(const fastq& read_r1, const fastq& read_r2) const
96✔
194
{
195
  if (read_r1.length() < m_barcode_1_len ||
192✔
196
      read_r2.length() < m_barcode_2_len) {
184✔
197
    return barcode_key{};
6✔
198
  }
199

200
  const auto barcode_1 = read_r1.sequence().substr(0, m_barcode_1_len);
180✔
201
  const auto barcode_2 = read_r2.sequence().substr(0, m_barcode_2_len);
180✔
202

203
  auto match = lookup(barcode_1.c_str(), 0, 0, barcode_2.c_str());
270✔
204
  if (match.key.sample == barcode_key::unidentified && m_max_mismatches) {
90✔
205
    if (m_max_mismatches_r1) {
35✔
206
      match = lookup_with_mm(barcode_1.c_str(),
54✔
207
                             0,
208
                             m_max_mismatches,
27✔
209
                             m_max_mismatches_r1,
27✔
210
                             barcode_2.c_str());
211
    } else {
212
      match = lookup(barcode_1.c_str(), 0, m_max_mismatches, barcode_2.c_str());
24✔
213
    }
214
  }
215

216
  return match.key;
90✔
217
}
186✔
218

219
barcode_match
220
barcode_table::lookup(const char* seq,
285✔
221
                      int32_t parent,
222
                      const size_t max_global_mismatches,
223
                      const char* next) const
224
{
225
  for (; *seq && parent != barcode_key::unidentified; ++seq) {
1,203✔
226
    if (*seq == 'N') {
950✔
227
      return {};
64✔
228
    }
229

230
    const auto encoded_nuc = ACGT::to_index(*seq);
918✔
231
    const auto& node = m_nodes.at(parent);
918✔
232

233
    parent = node.children.at(encoded_nuc);
1,836✔
234
  }
235

236
  if (parent == barcode_key::unidentified) {
253✔
237
    return {};
154✔
238
  } else if (next) {
176✔
239
    const auto max_local_mismatches =
71✔
240
      std::min(max_global_mismatches, m_max_mismatches_r2);
142✔
241

242
    if (max_local_mismatches) {
71✔
243
      return lookup_with_mm(
4✔
244
        next, parent, max_global_mismatches, max_local_mismatches, nullptr);
4✔
245
    } else {
246
      return lookup(next, parent, max_global_mismatches, nullptr);
67✔
247
    }
248
  } else {
249
    const auto& node = m_nodes.at(parent);
105✔
250
    return barcode_match(node.key, m_max_mismatches - max_global_mismatches);
210✔
251
  }
252
}
253

254
barcode_match
255
barcode_table::lookup_with_mm(const char* seq,
208✔
256
                              int32_t parent,
257
                              const size_t max_global_mismatches,
258
                              size_t max_local_mismatches,
259
                              const char* next) const
260

261
{
262
  const barcode_node& node = m_nodes.at(parent);
208✔
263
  const auto nucleotide = *seq;
208✔
264

265
  if (nucleotide) {
208✔
266
    barcode_match best_candidate;
196✔
267

268
    for (size_t encoded_i = 0; encoded_i < 4; ++encoded_i) {
980✔
269
      const auto child = node.children.at(encoded_i);
1,568✔
270

271
      if (child != barcode_key::unidentified) {
784✔
272
        barcode_match current_candidate;
208✔
273

274
        // Nucleotide may be 'N', so test has to be done by converting the index
275
        if (nucleotide == ACGT::to_value(encoded_i)) {
208✔
276
          current_candidate = lookup_with_mm(
144✔
277
            seq + 1, child, max_global_mismatches, max_local_mismatches, next);
278
        } else if (max_local_mismatches == 1) {
64✔
279
          current_candidate =
55✔
280
            lookup(seq + 1, child, max_global_mismatches - 1, next);
110✔
281
        } else if (max_local_mismatches) {
9✔
282
          current_candidate = lookup_with_mm(seq + 1,
18✔
283
                                             child,
284
                                             max_global_mismatches - 1,
9✔
285
                                             max_local_mismatches - 1,
286
                                             next);
287
        }
288

289
        if (current_candidate.mismatches < best_candidate.mismatches) {
208✔
290
          best_candidate = current_candidate;
144✔
291
        } else if (current_candidate.mismatches == best_candidate.mismatches &&
64✔
292
                   current_candidate.key.sample != barcode_key::unidentified) {
59✔
293
          if (current_candidate.key.sample == best_candidate.key.sample) {
6✔
294
            best_candidate.key.barcode = barcode_key::ambiguous;
×
295
          } else {
296
            best_candidate.key.sample = barcode_key::ambiguous;
6✔
297
            best_candidate.key.barcode = barcode_key::ambiguous;
6✔
298
          }
299
        }
300
      }
301
    }
302

303
    return best_candidate;
196✔
304
  } else if (next) {
12✔
305
    max_local_mismatches = std::min(max_global_mismatches, m_max_mismatches_r2);
20✔
306

307
    if (max_local_mismatches) {
10✔
308
      return lookup_with_mm(
8✔
309
        next, parent, max_global_mismatches, max_local_mismatches, nullptr);
8✔
310
    } else {
311
      return lookup(next, parent, max_global_mismatches, nullptr);
2✔
312
    }
313
  } else {
314
    return barcode_match(node.key, m_max_mismatches - max_global_mismatches);
4✔
315
  }
316
}
317

318
} // namespace adapterremoval
577✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc