MikkelSchubert / adapterremoval / #89

13 Apr 2025 09:57AM UTC coverage: 27.054% (-0.4%) from 27.414%

Build #89 · push · travis-ci · web-flow
report individual barcode counts when multiple barcodes are used (#116)

* fix sample percentages in per-sample HTML table
* count per-barcode hits rather than per sample
* report individual barcode counts in JSON report
* report individual barcode counts in HTML report
* stack barcodes in demultiplexing barplot and identify in tooltip
* add button to show/hide multiple barcodes

0 of 74 new or added lines in 4 files covered. (0.0%)

626 existing lines in 4 files now uncovered.

2723 of 10065 relevant lines covered (27.05%)

4005.83 hits per line

Source File: /src/statistics.cpp (0.0% of relevant lines covered)
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2011 Stinus Lindgreen <stinus@binf.ku.dk>
// SPDX-FileCopyrightText: 2014 Mikkel Schubert <mikkelsch@gmail.com>
// SPDX-FileCopyrightText: 2010-17 Simon Andrews
#include "statistics.hpp"
#include "adapter_id.hpp" // for adapter_id_statistics
#include "debug.hpp"      // for AR_REQUIRE
#include "fastq.hpp"      // for ACGTN, fastq, ACGT
#include "fastq_enc.hpp"  // for PHRED_OFFSET_MIN, PHRED_SCORE_MAX
#include "simd.hpp"       // for size_t
#include "utilities.hpp"  // for prng_seed
#include <algorithm>      // for max, min
#include <cstdlib>        // for size_t
#include <memory>         // for make_shared, __shared_ptr_access, __shared_...
#include <string>         // for basic_string, string
#include <string_view>    // for string_view

namespace adapterremoval {

////////////////////////////////////////////////////////////////////////////////

const size_t DUPLICATION_LEVELS = 16;
const string_vec DUPLICATION_LABELS = { "1",    "2",   "3",   "4",
                                        "5",    "6",   "7",   "8",
                                        "9",    ">10", ">50", ">100",
                                        ">500", ">1k", ">5k", ">10k" };

size_t
duplication_level(size_t count)
{
  if (count > 10000) {
    return 15;
  } else if (count > 5000) {
    return 14;
  } else if (count > 1000) {
    return 13;
  } else if (count > 500) {
    return 12;
  } else if (count > 100) {
    return 11;
  } else if (count > 50) {
    return 10;
  } else if (count > 10) {
    return 9;
  } else {
    return count - 1;
  }
}
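
// Illustrative mapping (added for clarity; not part of the original file):
// the branches above place raw duplication counts into the DUPLICATION_LABELS
// bins, e.g.
//   duplication_level(1)     == 0   // bin "1"
//   duplication_level(9)     == 8   // bin "9"
//   duplication_level(10)    == 9   // bin ">10"
//   duplication_level(500)   == 11  // bin ">100"
//   duplication_level(20000) == 15  // bin ">10k"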

duplication_statistics::summary::summary()
  : labels(DUPLICATION_LABELS)
  , total_sequences(DUPLICATION_LEVELS)
  , unique_sequences(DUPLICATION_LEVELS)
  , unique_frac()
{
}

duplication_statistics::duplication_statistics(size_t max_unique_sequences)
  : m_max_unique_sequences(max_unique_sequences)
  , m_sequence_counts()
  , m_sequences_counted()
  , m_sequences_counted_at_max()
{
  m_sequence_counts.reserve(max_unique_sequences);
}

void
duplication_statistics::process(const fastq& read)
{
  if (m_max_unique_sequences) {
    if (read.length() > 75) {
      insert(read.sequence().substr(0, 50));
    } else {
      insert(read.sequence());
    }
  }
}

duplication_statistics::summary
duplication_statistics::summarize() const
{
  // Histogram of sequence duplication counts
  robin_hood::unordered_flat_map<size_t, size_t> histogram;
  for (const auto& it : m_sequence_counts) {
    histogram[it.second]++;
  }

  duplication_statistics::summary result;

  result.total_sequences.resize_up_to(DUPLICATION_LEVELS);
  result.unique_sequences.resize_up_to(DUPLICATION_LEVELS);

  for (const auto& it : histogram) {
    const auto level = it.first;
    const auto count = correct_count(level, it.second);

    const auto slot = duplication_level(level);
    result.total_sequences.inc(slot, count * level);
    result.unique_sequences.inc(slot, count);
  }

  const auto unique_count = result.unique_sequences.sum();
  const auto total_count = result.total_sequences.sum();

  if (total_count) {
    result.unique_frac = unique_count / static_cast<double>(total_count);
    result.total_sequences = result.total_sequences / total_count;
    result.unique_sequences = result.unique_sequences / unique_count;
  } else {
    result.unique_frac = 1.0;
  }

  return result;
}

double
duplication_statistics::correct_count(size_t bin, size_t count) const
{
  // Adapted from
  //   https://github.com/s-andrews/FastQC/blob/v0.11.9/uk/ac/babraham/FastQC/Modules/DuplicationLevel.java#L158

  // See if we can bail out early
  if (m_sequences_counted_at_max == m_sequences_counted) {
    return count;
  }

  // If there aren't enough sequences left to hide another sequence with this
  // count then we can also skip the calculation
  if (m_sequences_counted - count < m_sequences_counted_at_max) {
    return count;
  }

  // If not then we need to see what the likelihood is that we had another
  // sequence with this number of observations which we would have missed.

  // We'll start by working out the probability of NOT seeing a sequence with
  // this duplication level within the first countAtLimit sequences of
  // count.  This is easier than calculating the probability of
  // seeing it.
  double p_not_seeing_at_limit = 1.0;

  // To save doing long calculations which are never going to produce anything
  // meaningful we'll set a limit to our p-value calculation.  This is the
  // probability below which we won't increase our count by 0.01 of an
  // observation.  Once we're below this we stop caring about the corrected
  // value since it's going to be so close to the observed value that we can
  // just return that instead.
  const double limit_of_caring = 1.0 - (count / (count + 0.01));

  for (size_t i = 0; i < m_sequences_counted_at_max; i++) {
    p_not_seeing_at_limit *= ((m_sequences_counted - i) - bin) /
                             static_cast<double>(m_sequences_counted - i);

    if (p_not_seeing_at_limit < limit_of_caring) {
      p_not_seeing_at_limit = 0;
      break;
    }
  }

  // Scale by the chance of seeing
  return count / (1 - p_not_seeing_at_limit);
}
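
// Worked example (added for illustration; the numbers are hypothetical, not
// taken from the original file): with m_sequences_counted == 100 reads in
// total, of which m_sequences_counted_at_max == 50 were counted while the map
// still had room, correcting count == 10 distinct sequences observed at
// duplication level bin == 2 gives
//   p_not_seeing_at_limit = prod_{i=0..49} (100 - i - 2) / (100 - i)
//                         = (50 * 49) / (100 * 99) ≈ 0.2475
//   corrected count       = 10 / (1 - 0.2475) ≈ 13.3
// i.e. roughly a third more twice-seen sequences than were directly observed.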

void
duplication_statistics::insert(const std::string& key)
{
  m_sequences_counted++;

  const auto unique_seqs = m_sequence_counts.size();
  if (unique_seqs >= m_max_unique_sequences) {
    auto it = m_sequence_counts.find(key);
    if (it != m_sequence_counts.end()) {
      it->second++;
    }
  } else {
    m_sequence_counts[key]++;
    m_sequences_counted_at_max++;
  }
}

////////////////////////////////////////////////////////////////////////////////

namespace {

void
smoothed_gc_count(rates& distribution, size_t count, size_t length)
{
  // counts are smoothed across adjacent (percentage) bins
  const auto lower = std::max<double>(0, 100 * (count - 0.5) / length);
  const auto upper = std::min<double>(100, 100 * (count + 0.5) / length);

  const size_t lower_i = lower;
  const size_t upper_i = upper;

  const auto lower_f = lower - lower_i;
  const auto upper_f = upper - upper_i;

  if (lower_i == upper_i) {
    // Count falls in a single bin
    distribution.inc(lower_i, (upper - lower));
  } else {
    // Increment first bin/partially overlapped adjacent bins
    distribution.inc(lower_i, (1 - lower_f));
    distribution.inc(upper_i, upper_f);

    // Ranges are half open, except for the final bin
    const auto final_i = count == length ? 101 : upper_i;
    for (size_t i = lower + 1; i < final_i; ++i) {
      distribution.inc(i);
    }
  }
}

} // namespace
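
// Worked example (added for illustration; the numbers are hypothetical, not
// taken from the original file): a sampled read with count == 3 G/C bases out
// of length == 4 called bases has lower == 62.5 and upper == 87.5, so its
// weight is smeared over the 62-87% GC bins: 0.5 to bin 62, 1.0 to each of
// bins 63-86, and 0.5 to bin 87, for a total weight of 100 / length == 25.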

fastq_statistics::fastq_statistics(double sample_rate)
  : fastq_statistics(sample_rate, prng_seed())
{
}

fastq_statistics::fastq_statistics(double sample_rate, uint32_t seed)
  : m_sample_rate(sample_rate)
  , m_rng(seed)
  , m_quality_dist(PHRED_SCORE_MAX + 1)
  , m_gc_content_dist(101)
{
}

void
fastq_statistics::process(const fastq& read, size_t num_input_reads)
{
  m_number_of_input_reads += num_input_reads;
  m_number_of_output_reads++;

  const auto sequence_len = read.length();
  m_length_dist.resize_up_to(sequence_len + 1);
  m_length_dist.inc(sequence_len);

  if (std::generate_canonical<float, 32>(m_rng) <= m_sample_rate) {
    m_nucleotide_pos.resize_up_to(sequence_len);
    m_quality_pos.resize_up_to(sequence_len);

    m_number_of_sampled_reads++;

    const std::string_view sequence = read.sequence();
    const std::string_view qualities = read.qualities();

    indexed_count<ACGTN> nucls;
    for (size_t i = 0; i < sequence_len; ++i) {
      const auto nuc = sequence[i];

      nucls.inc(nuc);
      m_nucleotide_pos.inc_unsafe(nuc, i);

      const auto quality = qualities[i] - PHRED_OFFSET_MIN;
      m_quality_pos.inc_unsafe(nuc, i, quality);
      m_quality_dist.inc(quality);
    }

    auto n_at = nucls.get('A') + nucls.get('T');
    auto n_gc = nucls.get('G') + nucls.get('C');
    if (n_at || n_gc) {
      // uncalled bases are not considered for the length of the reads
      smoothed_gc_count(m_gc_content_dist, n_gc, n_gc + n_at);
    }
  }
}

fastq_statistics&
fastq_statistics::operator+=(const fastq_statistics& other)
{
  m_number_of_input_reads += other.m_number_of_input_reads;
  m_number_of_output_reads += other.m_number_of_output_reads;
  m_number_of_sampled_reads += other.m_number_of_sampled_reads;
  m_length_dist += other.m_length_dist;
  m_quality_dist += other.m_quality_dist;
  m_gc_content_dist += other.m_gc_content_dist;
  m_nucleotide_pos += other.m_nucleotide_pos;
  m_quality_pos += other.m_quality_pos;

  return *this;
}

trimming_statistics::trimming_statistics(double sample_rate)
  : singleton(std::make_shared<fastq_statistics>(sample_rate))
  , merged(std::make_shared<fastq_statistics>(sample_rate))
  , discarded(std::make_shared<fastq_statistics>(sample_rate))
{
  // Synchronize sampling of mate 1 and mate 2 reads
  const auto seed = prng_seed();
  read_1 = std::make_shared<fastq_statistics>(sample_rate, seed);
  read_2 = std::make_shared<fastq_statistics>(sample_rate, seed);
}

trimming_statistics&
trimming_statistics::operator+=(const trimming_statistics& other)
{
  *read_1 += *other.read_1;
  *read_2 += *other.read_2;
  *singleton += *other.singleton;
  *merged += *other.merged;
  *discarded += *other.discarded;

  insert_sizes += other.insert_sizes;
  adapter_trimmed_reads += other.adapter_trimmed_reads;
  adapter_trimmed_bases += other.adapter_trimmed_bases;
  reads_merged += other.reads_merged;
  terminal_pre_trimmed += other.terminal_pre_trimmed;
  terminal_post_trimmed += other.terminal_post_trimmed;
  poly_x_pre_trimmed_reads += other.poly_x_pre_trimmed_reads;
  poly_x_pre_trimmed_bases += other.poly_x_pre_trimmed_bases;
  poly_x_post_trimmed_reads += other.poly_x_post_trimmed_reads;
  poly_x_post_trimmed_bases += other.poly_x_post_trimmed_bases;
  low_quality_trimmed += other.low_quality_trimmed;
  total_trimmed += other.total_trimmed;
  filtered_min_length += other.filtered_min_length;
  filtered_max_length += other.filtered_max_length;
  filtered_ambiguous += other.filtered_ambiguous;
  filtered_mean_quality += other.filtered_mean_quality;
  filtered_low_complexity += other.filtered_low_complexity;

  return *this;
}

demux_statistics::demux_statistics(double sample_rate)
  : unidentified_stats_1(std::make_shared<fastq_statistics>(sample_rate))
  , unidentified_stats_2(std::make_shared<fastq_statistics>(sample_rate))
{
}

size_t
demux_statistics::total() const
{
  size_t total = unidentified + ambiguous;
  for (const auto& count : samples) {
    total += count.sum();
  }

  return total;
}

statistics::statistics(double sample_rate)
  : input_1(std::make_shared<fastq_statistics>(sample_rate))
  , input_2(std::make_shared<fastq_statistics>(sample_rate))
  , demultiplexing(std::make_shared<demux_statistics>(sample_rate))
{
}

statistics_builder::statistics_builder()
  : m_sample_count(0)
  , m_sample_rate(1.0)
  , m_max_unique(0)
  , m_adapter_id(0)
{
}

statistics_builder&
statistics_builder::sample_rate(double rate)
{
  m_sample_rate = rate;

  return *this;
}

statistics_builder&
statistics_builder::demultiplexing(size_t samples)
{
  m_sample_count = samples;

  return *this;
}

statistics_builder&
statistics_builder::estimate_duplication(size_t max_unique)
{
  m_max_unique = max_unique;

  return *this;
}

/** Enable adapter identification */
statistics_builder&
statistics_builder::adapter_identification(size_t max_length)
{
  m_adapter_id = max_length;

  return *this;
}

statistics
statistics_builder::initialize() const
{
  statistics stats(m_sample_rate);

  stats.duplication_1 = std::make_shared<duplication_statistics>(m_max_unique);
  stats.duplication_2 = std::make_shared<duplication_statistics>(m_max_unique);

  if (m_adapter_id) {
    stats.adapter_id = std::make_shared<adapter_id_statistics>(m_adapter_id);
  }

  return stats;
}

} // namespace adapterremoval
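
The statistics_builder above collects the configuration and initialize() assembles the statistics object; a minimal usage sketch, with hypothetical argument values and no claims about how the callers in this codebase actually configure it, might look like:

  // Configure and create the statistics bundle; each setter returns the
  // builder, so the calls can be chained before initialize() is invoked.
  statistics_builder builder;
  builder.sample_rate(0.1)           // sample roughly 10% of reads
    .demultiplexing(4)               // expect four demultiplexed samples
    .estimate_duplication(100000)    // track up to 100k unique sequences
    .adapter_identification(42);     // enable adapter identification
  const statistics stats = builder.initialize();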