• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #117

25 May 2025 03:01PM UTC coverage: 66.932% (-0.07%) from 67.006%
#117

push

travis-ci

web-flow
iwyu and reduce build-time inter-dependencies (#144)

26 of 145 new or added lines in 20 files covered. (17.93%)

89 existing lines in 5 files now uncovered.

9738 of 14549 relevant lines covered (66.93%)

3041.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/statistics.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2011 Stinus Lindgreen <stinus@binf.ku.dk>
3
// SPDX-FileCopyrightText: 2014 Mikkel Schubert <mikkelsch@gmail.com>
4
// SPDX-FileCopyrightText: 2010-17 Simon Andrews
5
#include "statistics.hpp" // declarations
6
#include "adapter_id.hpp" // for adapter_id_statistics
7
#include "fastq.hpp"      // for ACGTN, fastq, ACGT
8
#include "fastq_enc.hpp"  // for PHRED_OFFSET_MIN, PHRED_SCORE_MAX
9
#include "robin_hood.hpp" // for unordered_flat_map
10
#include "utilities.hpp"  // for prng_seed
11
#include <algorithm>      // for max, min
12
#include <array>          // for array
13
#include <cstdlib>        // for size_t
14
#include <functional>     // for equal_to
15
#include <memory>         // for make_shared, __shared_ptr_access, __shared_...
16
#include <string>         // for basic_string, string
17
#include <string_view>    // for string_view
18

19
namespace adapterremoval {
20

21
////////////////////////////////////////////////////////////////////////////////
22

23
const size_t DUPLICATION_LEVELS = 16;
24
const std::array<std::string_view, DUPLICATION_LEVELS> DUPLICATION_LABELS = {
25
  "1", "2",   "3",   "4",    "5",    "6",   "7",   "8",
26
  "9", ">10", ">50", ">100", ">500", ">1k", ">5k", ">10k"
27
};
28

29
namespace {
30

31
size_t
32
duplication_level(size_t count)
×
33
{
34
  if (count > 10000) {
×
35
    return 15;
36
  } else if (count > 5000) {
×
37
    return 14;
38
  } else if (count > 1000) {
×
39
    return 13;
40
  } else if (count > 500) {
×
41
    return 12;
42
  } else if (count > 100) {
×
43
    return 11;
44
  } else if (count > 50) {
×
45
    return 10;
46
  } else if (count > 10) {
×
47
    return 9;
48
  } else {
49
    return count - 1;
×
50
  }
51
}
52

53
} // namespace
54

NEW
55
class string_counts
×
56
{
57
  using map = robin_hood::unordered_flat_map<std::string, size_t>;
58

59
public:
NEW
60
  explicit string_counts(size_t max_unique_sequences)
×
NEW
61
    : m_max_unique_sequences(max_unique_sequences)
×
62
  {
NEW
63
    m_counts.reserve(max_unique_sequences);
×
64
  }
65

NEW
66
  void add(const std::string& key)
×
67
  {
NEW
68
    m_counted++;
×
NEW
69
    m_counted_at_max++;
×
70

NEW
71
    const auto unique_seqs = m_counts.size();
×
NEW
72
    if (unique_seqs >= m_max_unique_sequences) {
×
NEW
73
      auto it = m_counts.find(key);
×
NEW
74
      if (it != m_counts.end()) {
×
NEW
75
        it->second++;
×
76
      }
77
    } else {
NEW
78
      m_counts[key]++;
×
79
    }
80
  }
81

NEW
82
  [[nodiscard]] auto counted() const { return m_counted; }
×
83

NEW
84
  [[nodiscard]] auto counted_at_max() const { return m_counted_at_max; }
×
85

NEW
86
  [[nodiscard]] auto begin() const { return m_counts.begin(); }
×
87

NEW
88
  [[nodiscard]] auto end() const { return m_counts.end(); }
×
89

90
private:
91
  /** Maximum size of m_sequence_counts. */
92
  size_t m_max_unique_sequences{};
93
  /** Total number of sequences counted */
94
  size_t m_counted{};
95
  /** Number of sequences counted after max unique sequences recorded */
96
  size_t m_counted_at_max{};
97
  /** Map of truncated sequences to sequence counts. */
98
  map m_counts{};
99
};
100

101
duplication_statistics::summary::summary()
×
NEW
102
  : labels(DUPLICATION_LABELS.begin(), DUPLICATION_LABELS.end())
×
103
  , total_sequences(DUPLICATION_LEVELS)
×
104
  , unique_sequences(DUPLICATION_LEVELS)
×
105
  , unique_frac()
×
106
{
107
}
108

109
duplication_statistics::duplication_statistics(size_t max_unique_sequences)
×
NEW
110
  : m_sequence_counts()
×
111
{
NEW
112
  if (max_unique_sequences) {
×
NEW
113
    m_sequence_counts = std::make_unique<string_counts>(max_unique_sequences);
×
114
  }
115
}
116

NEW
117
duplication_statistics::~duplication_statistics()
×
118
{
NEW
119
  m_sequence_counts.reset();
×
120
}
121

122
void
123
duplication_statistics::process(const fastq& read)
×
124
{
NEW
125
  if (m_sequence_counts) {
×
126
    if (read.length() > 75) {
×
NEW
127
      m_sequence_counts->add(read.sequence().substr(0, 50));
×
128
    } else {
NEW
129
      m_sequence_counts->add(read.sequence());
×
130
    }
131
  }
132
}
133

134
duplication_statistics::summary
135
duplication_statistics::summarize() const
×
136
{
137
  // Histogram of sequence duplication counts
138
  robin_hood::unordered_flat_map<size_t, size_t> histogram;
×
NEW
139
  for (const auto& it : *m_sequence_counts) {
×
140
    histogram[it.second]++;
×
141
  }
142

143
  duplication_statistics::summary result;
×
144

145
  result.total_sequences.resize_up_to(DUPLICATION_LEVELS);
×
146
  result.unique_sequences.resize_up_to(DUPLICATION_LEVELS);
×
147

148
  for (const auto& it : histogram) {
×
149
    const auto level = it.first;
×
150
    const auto count = correct_count(level, it.second);
×
151

152
    const auto slot = duplication_level(level);
×
153
    result.total_sequences.inc(slot, count * level);
×
154
    result.unique_sequences.inc(slot, count);
×
155
  }
156

157
  const auto unique_count = result.unique_sequences.sum();
×
158
  const auto total_count = result.total_sequences.sum();
×
159

160
  if (total_count) {
×
161
    result.unique_frac = unique_count / static_cast<double>(total_count);
×
162
    result.total_sequences = result.total_sequences / total_count;
×
163
    result.unique_sequences = result.unique_sequences / unique_count;
×
164
  } else {
165
    result.unique_frac = 1.0;
×
166
  }
167

168
  return result;
×
169
}
170

171
double
172
duplication_statistics::correct_count(size_t bin, size_t count) const
×
173
{
174
  // Adapted from
175
  //   https://github.com/s-andrews/FastQC/blob/v0.11.9/uk/ac/babraham/FastQC/Modules/DuplicationLevel.java#L158
176

NEW
177
  const auto sequences_counted = m_sequence_counts->counted();
×
NEW
178
  const auto sequences_counted_at_max = m_sequence_counts->counted_at_max();
×
179

180
  // See if we can bail out early
NEW
181
  if (sequences_counted_at_max == sequences_counted) {
×
182
    return count;
×
183
  }
184

185
  // If there aren't enough sequences left to hide another sequence with this
186
  // count then we can also skip the calculation
NEW
187
  if (sequences_counted - count < sequences_counted_at_max) {
×
188
    return count;
×
189
  }
190

191
  // If not then we need to see what the likelihood is that we had another
192
  // sequence with this number of observations which we would have missed.
193

194
  // We'll start by working out the probability of NOT seeing a sequence with
195
  // this duplication level within the first countAtLimit sequences of
196
  // count.  This is easier than calculating the probability of
197
  // seeing it.
198
  double p_not_seeing_at_limit = 1.0;
×
199

200
  // To save doing long calculations which are never going to produce anything
201
  // meaningful we'll set a limit to our p-value calculation.  This is the
202
  // probability below which we won't increase our count by 0.01 of an
203
  // observation.  Once we're below this we stop caring about the corrected
204
  // value since it's going to be so close to the observed value that we can
205
  // just return that instead.
206
  const double limit_of_caring = 1.0 - (count / (count + 0.01));
×
207

NEW
208
  for (size_t i = 0; i < sequences_counted_at_max; i++) {
×
NEW
209
    p_not_seeing_at_limit *= ((sequences_counted - i) - bin) /
×
210
                             static_cast<double>(sequences_counted - i);
211

212
    if (p_not_seeing_at_limit < limit_of_caring) {
×
213
      p_not_seeing_at_limit = 0;
214
      break;
215
    }
216
  }
217

218
  // Scale by the chance of seeing
219
  return count / (1 - p_not_seeing_at_limit);
×
220
}
221

222
////////////////////////////////////////////////////////////////////////////////
223

224
namespace {
225

226
void
227
smoothed_gc_count(rates& distribution, size_t count, size_t length)
×
228
{
229
  // counts are smoothed across adjacent (percentage) bins
230
  const auto lower = std::max<double>(0, 100 * (count - 0.5) / length);
×
231
  const auto upper = std::min<double>(100, 100 * (count + 0.5) / length);
×
232

233
  const size_t lower_i = lower;
×
234
  const size_t upper_i = upper;
×
235

236
  const auto lower_f = lower - lower_i;
×
237
  const auto upper_f = upper - upper_i;
×
238

239
  if (lower_i == upper_i) {
×
240
    // Count falls in a single bin
241
    distribution.inc(lower_i, (upper - lower));
×
242
  } else {
243
    // Increment first bin/partially overlapped adjacent bins
244
    distribution.inc(lower_i, (1 - lower_f));
×
245
    distribution.inc(upper_i, upper_f);
×
246

247
    // Ranges are half open, except for the final bin
248
    const auto final_i = count == length ? 101 : upper_i;
×
249
    for (size_t i = lower + 1; i < final_i; ++i) {
×
250
      distribution.inc(i);
×
251
    }
252
  }
253
}
254

255
} // namespace
256

257
fastq_statistics::fastq_statistics(double sample_rate)
×
258
  : fastq_statistics(sample_rate, prng_seed())
×
259
{
260
}
261

262
fastq_statistics::fastq_statistics(double sample_rate, uint32_t seed)
×
263
  : m_sample_rate(sample_rate)
×
264
  , m_rng(seed)
×
265
  , m_quality_dist(PHRED_SCORE_MAX + 1)
×
266
  , m_gc_content_dist(101)
×
267
{
268
}
269

270
void
271
fastq_statistics::process(const fastq& read, size_t num_input_reads)
×
272
{
273
  m_number_of_input_reads += num_input_reads;
×
274
  m_number_of_output_reads++;
×
275

276
  const auto sequence_len = read.length();
×
277
  m_length_dist.resize_up_to(sequence_len + 1);
×
278
  m_length_dist.inc(sequence_len);
×
279

280
  if (std::generate_canonical<float, 32>(m_rng) <= m_sample_rate) {
×
281
    m_nucleotide_pos.resize_up_to(sequence_len);
×
282
    m_quality_pos.resize_up_to(sequence_len);
×
283

284
    m_number_of_sampled_reads++;
×
285

286
    const std::string_view sequence = read.sequence();
×
287
    const std::string_view qualities = read.qualities();
×
288

289
    indexed_count<ACGTN> nucls;
×
290
    for (size_t i = 0; i < sequence_len; ++i) {
×
291
      const auto nuc = sequence[i];
×
292

293
      nucls.inc(nuc);
×
294
      m_nucleotide_pos.inc_unsafe(nuc, i);
×
295

296
      const auto quality = qualities[i] - PHRED_OFFSET_MIN;
×
297
      m_quality_pos.inc_unsafe(nuc, i, quality);
×
298
      m_quality_dist.inc(quality);
×
299
    }
300

301
    auto n_at = nucls.get('A') + nucls.get('T');
×
302
    auto n_gc = nucls.get('G') + nucls.get('C');
×
303
    if (n_at || n_gc) {
×
304
      // uncalled bases are not considered for the length of the reads
305
      smoothed_gc_count(m_gc_content_dist, n_gc, n_gc + n_at);
×
306
    }
307
  }
308
}
309

310
fastq_statistics&
311
fastq_statistics::operator+=(const fastq_statistics& other)
×
312
{
313
  m_number_of_input_reads += other.m_number_of_input_reads;
×
314
  m_number_of_output_reads += other.m_number_of_output_reads;
×
315
  m_number_of_sampled_reads += other.m_number_of_sampled_reads;
×
316
  m_length_dist += other.m_length_dist;
×
317
  m_quality_dist += other.m_quality_dist;
×
318
  m_gc_content_dist += other.m_gc_content_dist;
×
319
  m_nucleotide_pos += other.m_nucleotide_pos;
×
320
  m_quality_pos += other.m_quality_pos;
×
321

322
  return *this;
×
323
}
324

325
trimming_statistics::trimming_statistics(double sample_rate)
×
326
  : singleton(std::make_shared<fastq_statistics>(sample_rate))
×
327
  , merged(std::make_shared<fastq_statistics>(sample_rate))
×
328
  , discarded(std::make_shared<fastq_statistics>(sample_rate))
×
329
{
330
  // Synchronize sampling of mate 1 and mate 2 reads
331
  const auto seed = prng_seed();
×
332
  read_1 = std::make_shared<fastq_statistics>(sample_rate, seed);
×
333
  read_2 = std::make_shared<fastq_statistics>(sample_rate, seed);
×
334
}
335

336
trimming_statistics&
337
trimming_statistics::operator+=(const trimming_statistics& other)
×
338
{
339
  *read_1 += *other.read_1;
×
340
  *read_2 += *other.read_2;
×
341
  *singleton += *other.singleton;
×
342
  *merged += *other.merged;
×
343
  *discarded += *other.discarded;
×
344

345
  insert_sizes += other.insert_sizes;
×
346
  adapter_trimmed_reads += other.adapter_trimmed_reads;
×
347
  adapter_trimmed_bases += other.adapter_trimmed_bases;
×
348
  reads_merged += other.reads_merged;
×
349
  terminal_pre_trimmed += other.terminal_pre_trimmed;
×
350
  terminal_post_trimmed += other.terminal_post_trimmed;
×
351
  poly_x_pre_trimmed_reads += other.poly_x_pre_trimmed_reads;
×
352
  poly_x_pre_trimmed_bases += other.poly_x_pre_trimmed_bases;
×
353
  poly_x_post_trimmed_reads += other.poly_x_post_trimmed_reads;
×
354
  poly_x_post_trimmed_bases += other.poly_x_post_trimmed_bases;
×
355
  low_quality_trimmed += other.low_quality_trimmed;
×
356
  total_trimmed += other.total_trimmed;
×
357
  filtered_min_length += other.filtered_min_length;
×
358
  filtered_max_length += other.filtered_max_length;
×
359
  filtered_ambiguous += other.filtered_ambiguous;
×
360
  filtered_mean_quality += other.filtered_mean_quality;
×
361
  filtered_low_complexity += other.filtered_low_complexity;
×
362

363
  return *this;
×
364
}
365

366
demux_statistics::demux_statistics(double sample_rate)
×
367
  : unidentified_stats_1(std::make_shared<fastq_statistics>(sample_rate))
×
368
  , unidentified_stats_2(std::make_shared<fastq_statistics>(sample_rate))
×
369
{
370
}
371

372
size_t
373
demux_statistics::total() const
×
374
{
375
  size_t total = unidentified + ambiguous;
×
376
  for (const auto& count : samples) {
×
377
    total += count.sum();
×
378
  }
379

380
  return total;
×
381
}
382

383
statistics::statistics(double sample_rate)
×
384
  : input_1(std::make_shared<fastq_statistics>(sample_rate))
×
385
  , input_2(std::make_shared<fastq_statistics>(sample_rate))
×
386
  , demultiplexing(std::make_shared<demux_statistics>(sample_rate))
×
387
{
388
}
389

390
statistics_builder::statistics_builder()
×
391
  : m_sample_count(0)
×
392
  , m_sample_rate(1.0)
×
393
  , m_max_unique(0)
×
394
  , m_adapter_id(0)
×
395
{
396
}
397

398
statistics_builder&
399
statistics_builder::sample_rate(double rate)
×
400
{
401
  m_sample_rate = rate;
×
402

403
  return *this;
×
404
}
405

406
statistics_builder&
407
statistics_builder::demultiplexing(size_t samples)
×
408
{
409
  m_sample_count = samples;
×
410

411
  return *this;
×
412
}
413

414
statistics_builder&
415
statistics_builder::estimate_duplication(size_t max_unique)
×
416
{
417
  m_max_unique = max_unique;
×
418

419
  return *this;
×
420
}
421

422
/** Enable adapter identification */
423
statistics_builder&
424
statistics_builder::adapter_identification(size_t max_length)
×
425
{
426
  m_adapter_id = max_length;
×
427

428
  return *this;
×
429
}
430

431
statistics
432
statistics_builder::initialize() const
×
433
{
434
  statistics stats(m_sample_rate);
×
435

436
  stats.duplication_1 = std::make_shared<duplication_statistics>(m_max_unique);
×
437
  stats.duplication_2 = std::make_shared<duplication_statistics>(m_max_unique);
×
438

439
  if (m_adapter_id) {
×
440
    stats.adapter_id = std::make_shared<adapter_id_statistics>(m_adapter_id);
×
441
  }
442

443
  return stats;
×
444
}
×
445

446
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc