#117

Committed 25 May 2025 03:01PM UTC coverage: 66.932% (-0.07%) from 67.006%

Build # #117

Build Type

push

travis-ci

Committed by

web-flow

Commit Message

iwyu and reduce build-time inter-dependencies (#144)

Run Details

26 of 145 new or added lines in 20 files covered. (17.93%)

89 existing lines in 5 files now uncovered.

9738 of 14549 relevant lines covered (66.93%)

3041.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/src/statistics.cpp

// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2011 Stinus Lindgreen <stinus@binf.ku.dk>
// SPDX-FileCopyrightText: 2014 Mikkel Schubert <mikkelsch@gmail.com>
// SPDX-FileCopyrightText: 2010-17 Simon Andrews
#include "statistics.hpp" // declarations
#include "adapter_id.hpp" // for adapter_id_statistics
#include "fastq.hpp"      // for ACGTN, fastq, ACGT
#include "fastq_enc.hpp"  // for PHRED_OFFSET_MIN, PHRED_SCORE_MAX
#include "robin_hood.hpp" // for unordered_flat_map
#include "utilities.hpp"  // for prng_seed
#include <algorithm>      // for max, min
#include <array>          // for array
#include <cstdlib>        // for size_t
#include <functional>     // for equal_to
#include <memory>         // for make_shared, __shared_ptr_access, __shared_...
#include <string>         // for basic_string, string
#include <string_view>    // for string_view

namespace adapterremoval {

////////////////////////////////////////////////////////////////////////////////

const size_t DUPLICATION_LEVELS = 16;
const std::array<std::string_view, DUPLICATION_LEVELS> DUPLICATION_LABELS = {
  "1", "2",   "3",   "4",    "5",    "6",   "7",   "8",
  "9", ">10", ">50", ">100", ">500", ">1k", ">5k", ">10k"
};

namespace {

size_t
duplication_level(size_t count)
{
  if (count > 10000) {
    return 15;
  } else if (count > 5000) {
    return 14;
  } else if (count > 1000) {
    return 13;
  } else if (count > 500) {
    return 12;
  } else if (count > 100) {
    return 11;
  } else if (count > 50) {
    return 10;
  } else if (count > 10) {
    return 9;
  } else {
    return count - 1;
  }
}

} // namespace

class string_counts
{
  using map = robin_hood::unordered_flat_map<std::string, size_t>;

public:
  explicit string_counts(size_t max_unique_sequences)
    : m_max_unique_sequences(max_unique_sequences)
  {
    m_counts.reserve(max_unique_sequences);
  }

  void add(const std::string& key)
  {
    m_counted++;
    m_counted_at_max++;

    const auto unique_seqs = m_counts.size();
    if (unique_seqs >= m_max_unique_sequences) {
      auto it = m_counts.find(key);
      if (it != m_counts.end()) {
        it->second++;
      }
    } else {
      m_counts[key]++;
    }
  }

  [[nodiscard]] auto counted() const { return m_counted; }

  [[nodiscard]] auto counted_at_max() const { return m_counted_at_max; }

  [[nodiscard]] auto begin() const { return m_counts.begin(); }

  [[nodiscard]] auto end() const { return m_counts.end(); }

private:
  /** Maximum size of m_sequence_counts. */
  size_t m_max_unique_sequences{};
  /** Total number of sequences counted */
  size_t m_counted{};
  /** Number of sequences counted after max unique sequences recorded */
  size_t m_counted_at_max{};
  /** Map of truncated sequences to sequence counts. */
  map m_counts{};
};

duplication_statistics::summary::summary()
  : labels(DUPLICATION_LABELS.begin(), DUPLICATION_LABELS.end())
  , total_sequences(DUPLICATION_LEVELS)
  , unique_sequences(DUPLICATION_LEVELS)
  , unique_frac()
{
}

duplication_statistics::duplication_statistics(size_t max_unique_sequences)
  : m_sequence_counts()
{
  if (max_unique_sequences) {
    m_sequence_counts = std::make_unique<string_counts>(max_unique_sequences);
  }
}

duplication_statistics::~duplication_statistics()
{
  m_sequence_counts.reset();
}

void
duplication_statistics::process(const fastq& read)
{
  if (m_sequence_counts) {
    if (read.length() > 75) {
      m_sequence_counts->add(read.sequence().substr(0, 50));
    } else {
      m_sequence_counts->add(read.sequence());
    }
  }
}

duplication_statistics::summary
duplication_statistics::summarize() const
{
  // Histogram of sequence duplication counts
  robin_hood::unordered_flat_map<size_t, size_t> histogram;
  for (const auto& it : *m_sequence_counts) {
    histogram[it.second]++;
  }

  duplication_statistics::summary result;

  result.total_sequences.resize_up_to(DUPLICATION_LEVELS);
  result.unique_sequences.resize_up_to(DUPLICATION_LEVELS);

  for (const auto& it : histogram) {
    const auto level = it.first;
    const auto count = correct_count(level, it.second);

    const auto slot = duplication_level(level);
    result.total_sequences.inc(slot, count * level);
    result.unique_sequences.inc(slot, count);
  }

  const auto unique_count = result.unique_sequences.sum();
  const auto total_count = result.total_sequences.sum();

  if (total_count) {
    result.unique_frac = unique_count / static_cast<double>(total_count);
    result.total_sequences = result.total_sequences / total_count;
    result.unique_sequences = result.unique_sequences / unique_count;
  } else {
    result.unique_frac = 1.0;
  }

  return result;
}

double
duplication_statistics::correct_count(size_t bin, size_t count) const
{
  // Adapted from
  //   https://github.com/s-andrews/FastQC/blob/v0.11.9/uk/ac/babraham/FastQC/Modules/DuplicationLevel.java#L158

  const auto sequences_counted = m_sequence_counts->counted();
  const auto sequences_counted_at_max = m_sequence_counts->counted_at_max();

  // See if we can bail out early
  if (sequences_counted_at_max == sequences_counted) {
    return count;
  }

  // If there aren't enough sequences left to hide another sequence with this
  // count then we can also skip the calculation
  if (sequences_counted - count < sequences_counted_at_max) {
    return count;
  }

  // If not then we need to see what the likelihood is that we had another
  // sequence with this number of observations which we would have missed.

  // We'll start by working out the probability of NOT seeing a sequence with
  // this duplication level within the first countAtLimit sequences of
  // count.  This is easier than calculating the probability of
  // seeing it.
  double p_not_seeing_at_limit = 1.0;

  // To save doing long calculations which are never going to produce anything
  // meaningful we'll set a limit to our p-value calculation.  This is the
  // probability below which we won't increase our count by 0.01 of an
  // observation.  Once we're below this we stop caring about the corrected
  // value since it's going to be so close to the observed value that we can
  // just return that instead.
  const double limit_of_caring = 1.0 - (count / (count + 0.01));

  for (size_t i = 0; i < sequences_counted_at_max; i++) {
    p_not_seeing_at_limit *= ((sequences_counted - i) - bin) /
                             static_cast<double>(sequences_counted - i);

    if (p_not_seeing_at_limit < limit_of_caring) {
      p_not_seeing_at_limit = 0;
      break;
    }
  }

  // Scale by the chance of seeing
  return count / (1 - p_not_seeing_at_limit);
}

////////////////////////////////////////////////////////////////////////////////

namespace {

void
smoothed_gc_count(rates& distribution, size_t count, size_t length)
{
  // counts are smoothed across adjacent (percentage) bins
  const auto lower = std::max<double>(0, 100 * (count - 0.5) / length);
  const auto upper = std::min<double>(100, 100 * (count + 0.5) / length);

  const size_t lower_i = lower;
  const size_t upper_i = upper;

  const auto lower_f = lower - lower_i;
  const auto upper_f = upper - upper_i;

  if (lower_i == upper_i) {
    // Count falls in a single bin
    distribution.inc(lower_i, (upper - lower));
  } else {
    // Increment first bin/partially overlapped adjacent bins
    distribution.inc(lower_i, (1 - lower_f));
    distribution.inc(upper_i, upper_f);

    // Ranges are half open, except for the final bin
    const auto final_i = count == length ? 101 : upper_i;
    for (size_t i = lower + 1; i < final_i; ++i) {
      distribution.inc(i);
    }
  }
}

} // namespace

fastq_statistics::fastq_statistics(double sample_rate)
  : fastq_statistics(sample_rate, prng_seed())
{
}

fastq_statistics::fastq_statistics(double sample_rate, uint32_t seed)
  : m_sample_rate(sample_rate)
  , m_rng(seed)
  , m_quality_dist(PHRED_SCORE_MAX + 1)
  , m_gc_content_dist(101)
{
}

void
fastq_statistics::process(const fastq& read, size_t num_input_reads)
{
  m_number_of_input_reads += num_input_reads;
  m_number_of_output_reads++;

  const auto sequence_len = read.length();
  m_length_dist.resize_up_to(sequence_len + 1);
  m_length_dist.inc(sequence_len);

  if (std::generate_canonical<float, 32>(m_rng) <= m_sample_rate) {
    m_nucleotide_pos.resize_up_to(sequence_len);
    m_quality_pos.resize_up_to(sequence_len);

    m_number_of_sampled_reads++;

    const std::string_view sequence = read.sequence();
    const std::string_view qualities = read.qualities();

    indexed_count<ACGTN> nucls;
    for (size_t i = 0; i < sequence_len; ++i) {
      const auto nuc = sequence[i];

      nucls.inc(nuc);
      m_nucleotide_pos.inc_unsafe(nuc, i);

      const auto quality = qualities[i] - PHRED_OFFSET_MIN;
      m_quality_pos.inc_unsafe(nuc, i, quality);
      m_quality_dist.inc(quality);
    }

    auto n_at = nucls.get('A') + nucls.get('T');
    auto n_gc = nucls.get('G') + nucls.get('C');
    if (n_at || n_gc) {
      // uncalled bases are not considered for the length of the reads
      smoothed_gc_count(m_gc_content_dist, n_gc, n_gc + n_at);
    }
  }
}

fastq_statistics&
fastq_statistics::operator+=(const fastq_statistics& other)
{
  m_number_of_input_reads += other.m_number_of_input_reads;
  m_number_of_output_reads += other.m_number_of_output_reads;
  m_number_of_sampled_reads += other.m_number_of_sampled_reads;
  m_length_dist += other.m_length_dist;
  m_quality_dist += other.m_quality_dist;
  m_gc_content_dist += other.m_gc_content_dist;
  m_nucleotide_pos += other.m_nucleotide_pos;
  m_quality_pos += other.m_quality_pos;

  return *this;
}

trimming_statistics::trimming_statistics(double sample_rate)
  : singleton(std::make_shared<fastq_statistics>(sample_rate))
  , merged(std::make_shared<fastq_statistics>(sample_rate))
  , discarded(std::make_shared<fastq_statistics>(sample_rate))
{
  // Synchronize sampling of mate 1 and mate 2 reads
  const auto seed = prng_seed();
  read_1 = std::make_shared<fastq_statistics>(sample_rate, seed);
  read_2 = std::make_shared<fastq_statistics>(sample_rate, seed);
}

trimming_statistics&
trimming_statistics::operator+=(const trimming_statistics& other)
{
  *read_1 += *other.read_1;
  *read_2 += *other.read_2;
  *singleton += *other.singleton;
  *merged += *other.merged;
  *discarded += *other.discarded;

  insert_sizes += other.insert_sizes;
  adapter_trimmed_reads += other.adapter_trimmed_reads;
  adapter_trimmed_bases += other.adapter_trimmed_bases;
  reads_merged += other.reads_merged;
  terminal_pre_trimmed += other.terminal_pre_trimmed;
  terminal_post_trimmed += other.terminal_post_trimmed;
  poly_x_pre_trimmed_reads += other.poly_x_pre_trimmed_reads;
  poly_x_pre_trimmed_bases += other.poly_x_pre_trimmed_bases;
  poly_x_post_trimmed_reads += other.poly_x_post_trimmed_reads;
  poly_x_post_trimmed_bases += other.poly_x_post_trimmed_bases;
  low_quality_trimmed += other.low_quality_trimmed;
  total_trimmed += other.total_trimmed;
  filtered_min_length += other.filtered_min_length;
  filtered_max_length += other.filtered_max_length;
  filtered_ambiguous += other.filtered_ambiguous;
  filtered_mean_quality += other.filtered_mean_quality;
  filtered_low_complexity += other.filtered_low_complexity;

  return *this;
}

demux_statistics::demux_statistics(double sample_rate)
  : unidentified_stats_1(std::make_shared<fastq_statistics>(sample_rate))
  , unidentified_stats_2(std::make_shared<fastq_statistics>(sample_rate))
{
}

size_t
demux_statistics::total() const
{
  size_t total = unidentified + ambiguous;
  for (const auto& count : samples) {
    total += count.sum();
  }

  return total;
}

statistics::statistics(double sample_rate)
  : input_1(std::make_shared<fastq_statistics>(sample_rate))
  , input_2(std::make_shared<fastq_statistics>(sample_rate))
  , demultiplexing(std::make_shared<demux_statistics>(sample_rate))
{
}

statistics_builder::statistics_builder()
  : m_sample_count(0)
  , m_sample_rate(1.0)
  , m_max_unique(0)
  , m_adapter_id(0)
{
}

statistics_builder&
statistics_builder::sample_rate(double rate)
{
  m_sample_rate = rate;

  return *this;
}

statistics_builder&
statistics_builder::demultiplexing(size_t samples)
{
  m_sample_count = samples;

  return *this;
}

statistics_builder&
statistics_builder::estimate_duplication(size_t max_unique)
{
  m_max_unique = max_unique;

  return *this;
}

/** Enable adapter identification */
statistics_builder&
statistics_builder::adapter_identification(size_t max_length)
{
  m_adapter_id = max_length;

  return *this;
}

statistics
statistics_builder::initialize() const
{
  statistics stats(m_sample_rate);

  stats.duplication_1 = std::make_shared<duplication_statistics>(m_max_unique);
  stats.duplication_2 = std::make_shared<duplication_statistics>(m_max_unique);

  if (m_adapter_id) {
    stats.adapter_id = std::make_shared<adapter_id_statistics>(m_adapter_id);
  }

  return stats;
}

} // namespace adapterremoval

1	// SPDX-License-Identifier: GPL-3.0-or-later
2	// SPDX-FileCopyrightText: 2011 Stinus Lindgreen <stinus@binf.ku.dk>
3	// SPDX-FileCopyrightText: 2014 Mikkel Schubert <mikkelsch@gmail.com>
4	// SPDX-FileCopyrightText: 2010-17 Simon Andrews
5	#include "statistics.hpp" // declarations
6	#include "adapter_id.hpp" // for adapter_id_statistics
7	#include "fastq.hpp" // for ACGTN, fastq, ACGT
8	#include "fastq_enc.hpp" // for PHRED_OFFSET_MIN, PHRED_SCORE_MAX
9	#include "robin_hood.hpp" // for unordered_flat_map
10	#include "utilities.hpp" // for prng_seed
11	#include <algorithm> // for max, min
12	#include <array> // for array
13	#include <cstdlib> // for size_t
14	#include <functional> // for equal_to
15	#include <memory> // for make_shared, __shared_ptr_access, __shared_...
16	#include <string> // for basic_string, string
17	#include <string_view> // for string_view
18
19	namespace adapterremoval {
20
21	////////////////////////////////////////////////////////////////////////////////
22
23	const size_t DUPLICATION_LEVELS = 16;
24	const std::array<std::string_view, DUPLICATION_LEVELS> DUPLICATION_LABELS = {
25	"1", "2", "3", "4", "5", "6", "7", "8",
26	"9", ">10", ">50", ">100", ">500", ">1k", ">5k", ">10k"
27	};
28
29	namespace {
30
31	size_t
32	duplication_level(size_t count)	×
33	{
34	if (count > 10000) {	×
35	return 15;
36	} else if (count > 5000) {	×
37	return 14;
38	} else if (count > 1000) {	×
39	return 13;
40	} else if (count > 500) {	×
41	return 12;
42	} else if (count > 100) {	×
43	return 11;
44	} else if (count > 50) {	×
45	return 10;
46	} else if (count > 10) {	×
47	return 9;
48	} else {
49	return count - 1;	×
50	}
51	}
52
53	} // namespace
54
NEW 55	class string_counts	×
56	{
57	using map = robin_hood::unordered_flat_map<std::string, size_t>;
58
59	public:
NEW 60	explicit string_counts(size_t max_unique_sequences)	×
NEW 61	: m_max_unique_sequences(max_unique_sequences)	×
62	{
NEW 63	m_counts.reserve(max_unique_sequences);	×
64	}
65
NEW 66	void add(const std::string& key)	×
67	{
NEW 68	m_counted++;	×
NEW 69	m_counted_at_max++;	×
70
NEW 71	const auto unique_seqs = m_counts.size();	×
NEW 72	if (unique_seqs >= m_max_unique_sequences) {	×
NEW 73	auto it = m_counts.find(key);	×
NEW 74	if (it != m_counts.end()) {	×
NEW 75	it->second++;	×
76	}
77	} else {
NEW 78	m_counts[key]++;	×
79	}
80	}
81
NEW 82	[[nodiscard]] auto counted() const { return m_counted; }	×
83
NEW 84	[[nodiscard]] auto counted_at_max() const { return m_counted_at_max; }	×
85
NEW 86	[[nodiscard]] auto begin() const { return m_counts.begin(); }	×
87
NEW 88	[[nodiscard]] auto end() const { return m_counts.end(); }	×
89
90	private:
91	/** Maximum size of m_sequence_counts. */
92	size_t m_max_unique_sequences{};
93	/** Total number of sequences counted */
94	size_t m_counted{};
95	/** Number of sequences counted after max unique sequences recorded */
96	size_t m_counted_at_max{};
97	/** Map of truncated sequences to sequence counts. */
98	map m_counts{};
99	};
100
101	duplication_statistics::summary::summary()	×
NEW 102	: labels(DUPLICATION_LABELS.begin(), DUPLICATION_LABELS.end())	×
103	, total_sequences(DUPLICATION_LEVELS)	×
104	, unique_sequences(DUPLICATION_LEVELS)	×
105	, unique_frac()	×
106	{
107	}
108
109	duplication_statistics::duplication_statistics(size_t max_unique_sequences)	×
NEW 110	: m_sequence_counts()	×
111	{
NEW 112	if (max_unique_sequences) {	×
NEW 113	m_sequence_counts = std::make_unique<string_counts>(max_unique_sequences);	×
114	}
115	}
116
NEW 117	duplication_statistics::~duplication_statistics()	×
118	{
NEW 119	m_sequence_counts.reset();	×
120	}
121
122	void
123	duplication_statistics::process(const fastq& read)	×
124	{
NEW 125	if (m_sequence_counts) {	×
126	if (read.length() > 75) {	×
NEW 127	m_sequence_counts->add(read.sequence().substr(0, 50));	×
128	} else {
NEW 129	m_sequence_counts->add(read.sequence());	×
130	}
131	}
132	}
133
134	duplication_statistics::summary
135	duplication_statistics::summarize() const	×
136	{
137	// Histogram of sequence duplication counts
138	robin_hood::unordered_flat_map<size_t, size_t> histogram;	×
NEW 139	for (const auto& it : *m_sequence_counts) {	×
140	histogram[it.second]++;	×
141	}
142
143	duplication_statistics::summary result;	×
144
145	result.total_sequences.resize_up_to(DUPLICATION_LEVELS);	×
146	result.unique_sequences.resize_up_to(DUPLICATION_LEVELS);	×
147
148	for (const auto& it : histogram) {	×
149	const auto level = it.first;	×
150	const auto count = correct_count(level, it.second);	×
151
152	const auto slot = duplication_level(level);	×
153	result.total_sequences.inc(slot, count * level);	×
154	result.unique_sequences.inc(slot, count);	×
155	}
156
157	const auto unique_count = result.unique_sequences.sum();	×
158	const auto total_count = result.total_sequences.sum();	×
159
160	if (total_count) {	×
161	result.unique_frac = unique_count / static_cast<double>(total_count);	×
162	result.total_sequences = result.total_sequences / total_count;	×
163	result.unique_sequences = result.unique_sequences / unique_count;	×
164	} else {
165	result.unique_frac = 1.0;	×
166	}
167
168	return result;	×
169	}
170
171	double
172	duplication_statistics::correct_count(size_t bin, size_t count) const	×
173	{
174	// Adapted from
175	// https://github.com/s-andrews/FastQC/blob/v0.11.9/uk/ac/babraham/FastQC/Modules/DuplicationLevel.java#L158
176
NEW 177	const auto sequences_counted = m_sequence_counts->counted();	×
NEW 178	const auto sequences_counted_at_max = m_sequence_counts->counted_at_max();	×
179
180	// See if we can bail out early
NEW 181	if (sequences_counted_at_max == sequences_counted) {	×
182	return count;	×
183	}
184
185	// If there aren't enough sequences left to hide another sequence with this
186	// count then we can also skip the calculation
NEW 187	if (sequences_counted - count < sequences_counted_at_max) {	×
188	return count;	×
189	}
190
191	// If not then we need to see what the likelihood is that we had another
192	// sequence with this number of observations which we would have missed.
193
194	// We'll start by working out the probability of NOT seeing a sequence with
195	// this duplication level within the first countAtLimit sequences of
196	// count. This is easier than calculating the probability of
197	// seeing it.
198	double p_not_seeing_at_limit = 1.0;	×
199
200	// To save doing long calculations which are never going to produce anything
201	// meaningful we'll set a limit to our p-value calculation. This is the
202	// probability below which we won't increase our count by 0.01 of an
203	// observation. Once we're below this we stop caring about the corrected
204	// value since it's going to be so close to the observed value that we can
205	// just return that instead.
206	const double limit_of_caring = 1.0 - (count / (count + 0.01));	×
207
NEW 208	for (size_t i = 0; i < sequences_counted_at_max; i++) {	×
NEW 209	p_not_seeing_at_limit *= ((sequences_counted - i) - bin) /	×
210	static_cast<double>(sequences_counted - i);
211
212	if (p_not_seeing_at_limit < limit_of_caring) {	×
213	p_not_seeing_at_limit = 0;
214	break;
215	}
216	}
217
218	// Scale by the chance of seeing
219	return count / (1 - p_not_seeing_at_limit);	×
220	}
221
222	////////////////////////////////////////////////////////////////////////////////
223
224	namespace {
225
226	void
227	smoothed_gc_count(rates& distribution, size_t count, size_t length)	×
228	{
229	// counts are smoothed across adjacent (percentage) bins
230	const auto lower = std::max<double>(0, 100 * (count - 0.5) / length);	×
231	const auto upper = std::min<double>(100, 100 * (count + 0.5) / length);	×
232
233	const size_t lower_i = lower;	×
234	const size_t upper_i = upper;	×
235
236	const auto lower_f = lower - lower_i;	×
237	const auto upper_f = upper - upper_i;	×
238
239	if (lower_i == upper_i) {	×
240	// Count falls in a single bin
241	distribution.inc(lower_i, (upper - lower));	×
242	} else {
243	// Increment first bin/partially overlapped adjacent bins
244	distribution.inc(lower_i, (1 - lower_f));	×
245	distribution.inc(upper_i, upper_f);	×
246
247	// Ranges are half open, except for the final bin
248	const auto final_i = count == length ? 101 : upper_i;	×
249	for (size_t i = lower + 1; i < final_i; ++i) {	×
250	distribution.inc(i);	×
251	}
252	}
253	}
254
255	} // namespace
256
257	fastq_statistics::fastq_statistics(double sample_rate)	×
258	: fastq_statistics(sample_rate, prng_seed())	×
259	{
260	}
261
262	fastq_statistics::fastq_statistics(double sample_rate, uint32_t seed)	×
263	: m_sample_rate(sample_rate)	×
264	, m_rng(seed)	×
265	, m_quality_dist(PHRED_SCORE_MAX + 1)	×
266	, m_gc_content_dist(101)	×
267	{
268	}
269
270	void
271	fastq_statistics::process(const fastq& read, size_t num_input_reads)	×
272	{
273	m_number_of_input_reads += num_input_reads;	×
274	m_number_of_output_reads++;	×
275
276	const auto sequence_len = read.length();	×
277	m_length_dist.resize_up_to(sequence_len + 1);	×
278	m_length_dist.inc(sequence_len);	×
279
280	if (std::generate_canonical<float, 32>(m_rng) <= m_sample_rate) {	×
281	m_nucleotide_pos.resize_up_to(sequence_len);	×
282	m_quality_pos.resize_up_to(sequence_len);	×
283
284	m_number_of_sampled_reads++;	×
285
286	const std::string_view sequence = read.sequence();	×
287	const std::string_view qualities = read.qualities();	×
288
289	indexed_count<ACGTN> nucls;	×
290	for (size_t i = 0; i < sequence_len; ++i) {	×
291	const auto nuc = sequence[i];	×
292
293	nucls.inc(nuc);	×
294	m_nucleotide_pos.inc_unsafe(nuc, i);	×
295
296	const auto quality = qualities[i] - PHRED_OFFSET_MIN;	×
297	m_quality_pos.inc_unsafe(nuc, i, quality);	×
298	m_quality_dist.inc(quality);	×
299	}
300
301	auto n_at = nucls.get('A') + nucls.get('T');	×
302	auto n_gc = nucls.get('G') + nucls.get('C');	×
303	if (n_at \|\| n_gc) {	×
304	// uncalled bases are not considered for the length of the reads
305	smoothed_gc_count(m_gc_content_dist, n_gc, n_gc + n_at);	×
306	}
307	}
308	}
309
310	fastq_statistics&
311	fastq_statistics::operator+=(const fastq_statistics& other)	×
312	{
313	m_number_of_input_reads += other.m_number_of_input_reads;	×
314	m_number_of_output_reads += other.m_number_of_output_reads;	×
315	m_number_of_sampled_reads += other.m_number_of_sampled_reads;	×
316	m_length_dist += other.m_length_dist;	×
317	m_quality_dist += other.m_quality_dist;	×
318	m_gc_content_dist += other.m_gc_content_dist;	×
319	m_nucleotide_pos += other.m_nucleotide_pos;	×
320	m_quality_pos += other.m_quality_pos;	×
321
322	return *this;	×
323	}
324
325	trimming_statistics::trimming_statistics(double sample_rate)	×
326	: singleton(std::make_shared<fastq_statistics>(sample_rate))	×
327	, merged(std::make_shared<fastq_statistics>(sample_rate))	×
328	, discarded(std::make_shared<fastq_statistics>(sample_rate))	×
329	{
330	// Synchronize sampling of mate 1 and mate 2 reads
331	const auto seed = prng_seed();	×
332	read_1 = std::make_shared<fastq_statistics>(sample_rate, seed);	×
333	read_2 = std::make_shared<fastq_statistics>(sample_rate, seed);	×
334	}
335
336	trimming_statistics&
337	trimming_statistics::operator+=(const trimming_statistics& other)	×
338	{
339	read_1 += other.read_1;	×
340	read_2 += other.read_2;	×
341	singleton += other.singleton;	×
342	merged += other.merged;	×
343	discarded += other.discarded;	×
344
345	insert_sizes += other.insert_sizes;	×
346	adapter_trimmed_reads += other.adapter_trimmed_reads;	×
347	adapter_trimmed_bases += other.adapter_trimmed_bases;	×
348	reads_merged += other.reads_merged;	×
349	terminal_pre_trimmed += other.terminal_pre_trimmed;	×
350	terminal_post_trimmed += other.terminal_post_trimmed;	×
351	poly_x_pre_trimmed_reads += other.poly_x_pre_trimmed_reads;	×
352	poly_x_pre_trimmed_bases += other.poly_x_pre_trimmed_bases;	×
353	poly_x_post_trimmed_reads += other.poly_x_post_trimmed_reads;	×
354	poly_x_post_trimmed_bases += other.poly_x_post_trimmed_bases;	×
355	low_quality_trimmed += other.low_quality_trimmed;	×
356	total_trimmed += other.total_trimmed;	×
357	filtered_min_length += other.filtered_min_length;	×
358	filtered_max_length += other.filtered_max_length;	×
359	filtered_ambiguous += other.filtered_ambiguous;	×
360	filtered_mean_quality += other.filtered_mean_quality;	×
361	filtered_low_complexity += other.filtered_low_complexity;	×
362
363	return *this;	×
364	}
365
366	demux_statistics::demux_statistics(double sample_rate)	×
367	: unidentified_stats_1(std::make_shared<fastq_statistics>(sample_rate))	×
368	, unidentified_stats_2(std::make_shared<fastq_statistics>(sample_rate))	×
369	{
370	}
371
372	size_t
373	demux_statistics::total() const	×
374	{
375	size_t total = unidentified + ambiguous;	×
376	for (const auto& count : samples) {	×
377	total += count.sum();	×
378	}
379
380	return total;	×
381	}
382
383	statistics::statistics(double sample_rate)	×
384	: input_1(std::make_shared<fastq_statistics>(sample_rate))	×
385	, input_2(std::make_shared<fastq_statistics>(sample_rate))	×
386	, demultiplexing(std::make_shared<demux_statistics>(sample_rate))	×
387	{
388	}
389
390	statistics_builder::statistics_builder()	×
391	: m_sample_count(0)	×
392	, m_sample_rate(1.0)	×
393	, m_max_unique(0)	×
394	, m_adapter_id(0)	×
395	{
396	}
397
398	statistics_builder&
399	statistics_builder::sample_rate(double rate)	×
400	{
401	m_sample_rate = rate;	×
402
403	return *this;	×
404	}
405
406	statistics_builder&
407	statistics_builder::demultiplexing(size_t samples)	×
408	{
409	m_sample_count = samples;	×
410
411	return *this;	×
412	}
413
414	statistics_builder&
415	statistics_builder::estimate_duplication(size_t max_unique)	×
416	{
417	m_max_unique = max_unique;	×
418
419	return *this;	×
420	}
421
422	/** Enable adapter identification */
423	statistics_builder&
424	statistics_builder::adapter_identification(size_t max_length)	×
425	{
426	m_adapter_id = max_length;	×
427
428	return *this;	×
429	}
430
431	statistics
432	statistics_builder::initialize() const	×
433	{
434	statistics stats(m_sample_rate);	×
435
436	stats.duplication_1 = std::make_shared<duplication_statistics>(m_max_unique);	×
437	stats.duplication_2 = std::make_shared<duplication_statistics>(m_max_unique);	×
438
439	if (m_adapter_id) {	×
440	stats.adapter_id = std::make_shared<adapter_id_statistics>(m_adapter_id);	×
441	}
442
443	return stats;	×
444	}	×
445
446	} // namespace adapterremoval

MikkelSchubert / adapterremoval / #117

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous