• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #45

20 Sep 2024 06:49PM UTC coverage: 26.244% (-49.2%) from 75.443%
#45

push

travis-ci

web-flow
attempt to fix coveralls run

2458 of 9366 relevant lines covered (26.24%)

4362.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/statistics.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 * Copyright (C) 2010-17 by Simon Andrews                                *
7
 *                                                                       *
8
 * This program is free software: you can redistribute it and/or modify  *
9
 * it under the terms of the GNU General Public License as published by  *
10
 * the Free Software Foundation, either version 3 of the License, or     *
11
 * (at your option) any later version.                                   *
12
 *                                                                       *
13
 * This program is distributed in the hope that it will be useful,       *
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
16
 * GNU General Public License for more details.                          *
17
 *                                                                       *
18
 * You should have received a copy of the GNU General Public License     *
19
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
20
\*************************************************************************/
21
#include "statistics.hpp"
22
#include "adapter_id.hpp" // for adapter_id_statistics
23
#include "debug.hpp"      // for AR_REQUIRE
24
#include "fastq.hpp"      // for ACGTN, fastq, ACGT
25
#include "fastq_enc.hpp"  // for PHRED_OFFSET_MIN, PHRED_SCORE_MAX
26
#include "simd.hpp"       // for size_t
27
#include "utilities.hpp"  // for prng_seed
28
#include <algorithm>      // for max, min
29
#include <cstdlib>        // for size_t
30
#include <memory>         // for make_shared, __shared_ptr_access, __shared_...
31
#include <string>         // for basic_string, string
32
#include <string_view>    // for string_view
33

34
namespace adapterremoval {
35

36
////////////////////////////////////////////////////////////////////////////////
37

38
const size_t DUPLICATION_LEVELS = 16;
39
const string_vec DUPLICATION_LABELS = { "1",    "2",   "3",   "4",
40
                                        "5",    "6",   "7",   "8",
41
                                        "9",    ">10", ">50", ">100",
42
                                        ">500", ">1k", ">5k", ">10k" };
43

44
size_t
45
duplication_level(size_t count)
×
46
{
47
  if (count > 10000) {
×
48
    return 15;
49
  } else if (count > 5000) {
×
50
    return 14;
51
  } else if (count > 1000) {
×
52
    return 13;
53
  } else if (count > 500) {
×
54
    return 12;
55
  } else if (count > 100) {
×
56
    return 11;
57
  } else if (count > 50) {
×
58
    return 10;
59
  } else if (count > 10) {
×
60
    return 9;
61
  } else {
62
    return count - 1;
×
63
  }
64
}
65

66
duplication_statistics::summary::summary()
×
67
  : labels(DUPLICATION_LABELS)
×
68
  , total_sequences(DUPLICATION_LEVELS)
×
69
  , unique_sequences(DUPLICATION_LEVELS)
×
70
  , unique_frac()
×
71
{
72
}
73

74
duplication_statistics::duplication_statistics(size_t max_unique_sequences)
×
75
  : m_max_unique_sequences(max_unique_sequences)
×
76
  , m_sequence_counts()
×
77
  , m_sequences_counted()
×
78
  , m_sequences_counted_at_max()
×
79
{
80
  m_sequence_counts.reserve(max_unique_sequences);
×
81
}
82

83
void
84
duplication_statistics::process(const fastq& read)
×
85
{
86
  if (m_max_unique_sequences) {
×
87
    if (read.length() > 75) {
×
88
      insert(read.sequence().substr(0, 50));
×
89
    } else {
90
      insert(read.sequence());
×
91
    }
92
  }
93
}
94

95
duplication_statistics::summary
96
duplication_statistics::summarize() const
×
97
{
98
  // Histogram of sequence duplication counts
99
  robin_hood::unordered_flat_map<size_t, size_t> histogram;
×
100
  for (const auto& it : m_sequence_counts) {
×
101
    histogram[it.second]++;
×
102
  }
103

104
  duplication_statistics::summary result;
×
105

106
  result.total_sequences.resize_up_to(DUPLICATION_LEVELS);
×
107
  result.unique_sequences.resize_up_to(DUPLICATION_LEVELS);
×
108

109
  for (const auto& it : histogram) {
×
110
    const auto level = it.first;
×
111
    const auto count = correct_count(level, it.second);
×
112

113
    const auto slot = duplication_level(level);
×
114
    result.total_sequences.inc(slot, count * level);
×
115
    result.unique_sequences.inc(slot, count);
×
116
  }
117

118
  const auto unique_count = result.unique_sequences.sum();
×
119
  const auto total_count = result.total_sequences.sum();
×
120

121
  if (total_count) {
×
122
    result.unique_frac = unique_count / static_cast<double>(total_count);
×
123
    result.total_sequences = result.total_sequences / total_count;
×
124
    result.unique_sequences = result.unique_sequences / unique_count;
×
125
  } else {
126
    result.unique_frac = 1.0;
×
127
  }
128

129
  return result;
×
130
}
131

132
double
133
duplication_statistics::correct_count(size_t bin, size_t count) const
×
134
{
135
  // Adapted from
136
  //   https://github.com/s-andrews/FastQC/blob/v0.11.9/uk/ac/babraham/FastQC/Modules/DuplicationLevel.java#L158
137

138
  // See if we can bail out early
139
  if (m_sequences_counted_at_max == m_sequences_counted) {
×
140
    return count;
×
141
  }
142

143
  // If there aren't enough sequences left to hide another sequence with this
144
  // count then we can also skip the calculation
145
  if (m_sequences_counted - count < m_sequences_counted_at_max) {
×
146
    return count;
×
147
  }
148

149
  // If not then we need to see what the likelihood is that we had another
150
  // sequence with this number of observations which we would have missed.
151

152
  // We'll start by working out the probability of NOT seeing a sequence with
153
  // this duplication level within the first countAtLimit sequences of
154
  // count.  This is easier than calculating the probability of
155
  // seeing it.
156
  double p_not_seeing_at_limit = 1.0;
×
157

158
  // To save doing long calculations which are never going to produce anything
159
  // meaningful we'll set a limit to our p-value calculation.  This is the
160
  // probability below which we won't increase our count by 0.01 of an
161
  // observation.  Once we're below this we stop caring about the corrected
162
  // value since it's going to be so close to the observed value that we can
163
  // just return that instead.
164
  const double limit_of_caring = 1.0 - (count / (count + 0.01));
×
165

166
  for (size_t i = 0; i < m_sequences_counted_at_max; i++) {
×
167
    p_not_seeing_at_limit *= ((m_sequences_counted - i) - bin) /
×
168
                             static_cast<double>(m_sequences_counted - i);
169

170
    if (p_not_seeing_at_limit < limit_of_caring) {
×
171
      p_not_seeing_at_limit = 0;
172
      break;
173
    }
174
  }
175

176
  // Scale by the chance of seeing
177
  return count / (1 - p_not_seeing_at_limit);
×
178
}
179

180
void
181
duplication_statistics::insert(const std::string& key)
×
182
{
183
  m_sequences_counted++;
×
184

185
  const auto unique_seqs = m_sequence_counts.size();
×
186
  if (unique_seqs >= m_max_unique_sequences) {
×
187
    auto it = m_sequence_counts.find(key);
×
188
    if (it != m_sequence_counts.end()) {
×
189
      it->second++;
×
190
    }
191
  } else {
192
    m_sequence_counts[key]++;
×
193
    m_sequences_counted_at_max++;
×
194
  }
195
}
196

197
////////////////////////////////////////////////////////////////////////////////
198

199
namespace {
200

201
void
202
smoothed_gc_count(rates& distribution, size_t count, size_t length)
×
203
{
204
  // counts are smoothed across adjacent (percentage) bins
205
  const auto lower = std::max<double>(0, 100 * (count - 0.5) / length);
×
206
  const auto upper = std::min<double>(100, 100 * (count + 0.5) / length);
×
207

208
  const size_t lower_i = lower;
×
209
  const size_t upper_i = upper;
×
210

211
  const auto lower_f = lower - lower_i;
×
212
  const auto upper_f = upper - upper_i;
×
213

214
  if (lower_i == upper_i) {
×
215
    // Count falls in a single bin
216
    distribution.inc(lower_i, (upper - lower));
×
217
  } else {
218
    // Increment first bin/partially overlapped adjacent bins
219
    distribution.inc(lower_i, (1 - lower_f));
×
220
    distribution.inc(upper_i, upper_f);
×
221

222
    // Ranges are half open, except for the final bin
223
    const auto final_i = count == length ? 101 : upper_i;
×
224
    for (size_t i = lower + 1; i < final_i; ++i) {
×
225
      distribution.inc(i);
×
226
    }
227
  }
228
}
229

230
} // namespace
231

232
fastq_statistics::fastq_statistics(double sample_rate)
×
233
  : fastq_statistics(sample_rate, prng_seed())
×
234
{
235
}
236

237
fastq_statistics::fastq_statistics(double sample_rate, uint32_t seed)
×
238
  : m_sample_rate(sample_rate)
×
239
  , m_rng(seed)
×
240
  , m_quality_dist(PHRED_SCORE_MAX + 1)
×
241
  , m_gc_content_dist(101)
×
242
{
243
}
244

245
void
246
fastq_statistics::process(const fastq& read, size_t num_input_reads)
×
247
{
248
  m_number_of_input_reads += num_input_reads;
×
249
  m_number_of_output_reads++;
×
250

251
  const auto sequence_len = read.length();
×
252
  m_length_dist.resize_up_to(sequence_len + 1);
×
253
  m_length_dist.inc(sequence_len);
×
254

255
  if (std::generate_canonical<float, 32>(m_rng) <= m_sample_rate) {
×
256
    m_nucleotide_pos.resize_up_to(sequence_len);
×
257
    m_quality_pos.resize_up_to(sequence_len);
×
258

259
    m_number_of_sampled_reads++;
×
260

261
    const std::string_view sequence = read.sequence();
×
262
    const std::string_view qualities = read.qualities();
×
263

264
    indexed_count<ACGTN> nucls;
×
265
    for (size_t i = 0; i < sequence_len; ++i) {
×
266
      const auto nuc = sequence[i];
×
267

268
      nucls.inc(nuc);
×
269
      m_nucleotide_pos.inc_unsafe(nuc, i);
×
270

271
      const auto quality = qualities[i] - PHRED_OFFSET_MIN;
×
272
      m_quality_pos.inc_unsafe(nuc, i, quality);
×
273
      m_quality_dist.inc(quality);
×
274
    }
275

276
    auto n_at = nucls.get('A') + nucls.get('T');
×
277
    auto n_gc = nucls.get('G') + nucls.get('C');
×
278
    if (n_at || n_gc) {
×
279
      // uncalled bases are not considered for the length of the reads
280
      smoothed_gc_count(m_gc_content_dist, n_gc, n_gc + n_at);
×
281
    }
282
  }
283

284
  if (m_duplication) {
×
285
    m_duplication->process(read);
×
286
  }
287
}
288

289
void
290
fastq_statistics::init_duplication_stats(size_t max_unique_sequences)
×
291
{
292
  AR_REQUIRE(!m_duplication);
×
293
  m_duplication =
×
294
    std::make_shared<duplication_statistics>(max_unique_sequences);
×
295
}
296

297
fastq_statistics&
298
fastq_statistics::operator+=(const fastq_statistics& other)
×
299
{
300
  m_number_of_input_reads += other.m_number_of_input_reads;
×
301
  m_number_of_output_reads += other.m_number_of_output_reads;
×
302
  m_number_of_sampled_reads += other.m_number_of_sampled_reads;
×
303
  m_length_dist += other.m_length_dist;
×
304
  m_quality_dist += other.m_quality_dist;
×
305
  m_gc_content_dist += other.m_gc_content_dist;
×
306
  m_nucleotide_pos += other.m_nucleotide_pos;
×
307
  m_quality_pos += other.m_quality_pos;
×
308

309
  return *this;
×
310
}
311

312
trimming_statistics::trimming_statistics(double sample_rate)
×
313
  : singleton(std::make_shared<fastq_statistics>(sample_rate))
×
314
  , merged(std::make_shared<fastq_statistics>(sample_rate))
×
315
  , discarded(std::make_shared<fastq_statistics>(sample_rate))
×
316
{
317
  // Synchronize sampling of mate 1 and mate 2 reads
318
  const auto seed = prng_seed();
×
319
  read_1 = std::make_shared<fastq_statistics>(sample_rate, seed);
×
320
  read_2 = std::make_shared<fastq_statistics>(sample_rate, seed);
×
321
}
322

323
trimming_statistics&
324
trimming_statistics::operator+=(const trimming_statistics& other)
×
325
{
326
  *read_1 += *other.read_1;
×
327
  *read_2 += *other.read_2;
×
328
  *singleton += *other.singleton;
×
329
  *merged += *other.merged;
×
330
  *discarded += *other.discarded;
×
331

332
  insert_sizes += other.insert_sizes;
×
333
  adapter_trimmed_reads += other.adapter_trimmed_reads;
×
334
  adapter_trimmed_bases += other.adapter_trimmed_bases;
×
335
  reads_merged += other.reads_merged;
×
336
  terminal_pre_trimmed += other.terminal_pre_trimmed;
×
337
  terminal_post_trimmed += other.terminal_post_trimmed;
×
338
  poly_x_pre_trimmed_reads += other.poly_x_pre_trimmed_reads;
×
339
  poly_x_pre_trimmed_bases += other.poly_x_pre_trimmed_bases;
×
340
  poly_x_post_trimmed_reads += other.poly_x_post_trimmed_reads;
×
341
  poly_x_post_trimmed_bases += other.poly_x_post_trimmed_bases;
×
342
  low_quality_trimmed += other.low_quality_trimmed;
×
343
  total_trimmed += other.total_trimmed;
×
344
  filtered_min_length += other.filtered_min_length;
×
345
  filtered_max_length += other.filtered_max_length;
×
346
  filtered_ambiguous += other.filtered_ambiguous;
×
347
  filtered_mean_quality += other.filtered_mean_quality;
×
348
  filtered_low_complexity += other.filtered_low_complexity;
×
349

350
  return *this;
×
351
}
352

353
demux_statistics::demux_statistics(double sample_rate)
×
354
  : unidentified_stats_1(std::make_shared<fastq_statistics>(sample_rate))
×
355
  , unidentified_stats_2(std::make_shared<fastq_statistics>(sample_rate))
×
356
{
357
}
358

359
size_t
360
demux_statistics::total() const
×
361
{
362
  size_t total = unidentified + ambiguous;
×
363
  for (const auto count : samples) {
×
364
    total += count;
×
365
  }
366

367
  return total;
×
368
}
369

370
statistics::statistics(double sample_rate)
×
371
  : input_1(std::make_shared<fastq_statistics>(sample_rate))
×
372
  , input_2(std::make_shared<fastq_statistics>(sample_rate))
×
373
  , demultiplexing(std::make_shared<demux_statistics>(sample_rate))
×
374
{
375
}
376

377
statistics_builder::statistics_builder()
×
378
  : m_sample_count(0)
×
379
  , m_sample_rate(1.0)
×
380
  , m_max_unique(0)
×
381
  , m_adapter_id(0)
×
382
{
383
}
384

385
statistics_builder&
386
statistics_builder::sample_rate(double rate)
×
387
{
388
  m_sample_rate = rate;
×
389

390
  return *this;
×
391
}
392

393
statistics_builder&
394
statistics_builder::demultiplexing(size_t samples)
×
395
{
396
  m_sample_count = samples;
×
397

398
  return *this;
×
399
}
400

401
statistics_builder&
402
statistics_builder::estimate_duplication(size_t max_unique)
×
403
{
404
  m_max_unique = max_unique;
×
405

406
  return *this;
×
407
}
408

409
/** Enable adapter identification */
410
statistics_builder&
411
statistics_builder::adapter_identification(size_t max_length)
×
412
{
413
  m_adapter_id = max_length;
×
414

415
  return *this;
×
416
}
417

418
statistics
419
statistics_builder::initialize() const
×
420
{
421
  statistics stats(m_sample_rate);
×
422
  if (m_sample_count) {
×
423
    stats.demultiplexing->samples.resize(m_sample_count);
×
424
  }
425

426
  stats.input_1->init_duplication_stats(m_max_unique);
×
427
  stats.input_2->init_duplication_stats(m_max_unique);
×
428

429
  if (m_adapter_id) {
×
430
    stats.adapter_id = std::make_shared<adapter_id_statistics>(m_adapter_id);
×
431
  }
432

433
  return stats;
×
434
}
×
435

436
} // namespace adapterremoval
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc