MikkelSchubert / adapterremoval / #45

Committed 20 Sep 2024 06:49PM UTC coverage: 26.244% (-49.2%) from 75.443%

Build # #45

Build Type

push

travis-ci

Committed by

web-flow

Commit Message

attempt to fix coveralls run

Run Details

2458 of 9366 relevant lines covered (26.24%)

4362.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/src/statistics.cpp

/*************************************************************************\
 * AdapterRemoval - cleaning next-generation sequencing reads            *
 *                                                                       *
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
 * Copyright (C) 2010-17 by Simon Andrews                                *
 *                                                                       *
 * This program is free software: you can redistribute it and/or modify  *
 * it under the terms of the GNU General Public License as published by  *
 * the Free Software Foundation, either version 3 of the License, or     *
 * (at your option) any later version.                                   *
 *                                                                       *
 * This program is distributed in the hope that it will be useful,       *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 * GNU General Public License for more details.                          *
 *                                                                       *
 * You should have received a copy of the GNU General Public License     *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
\*************************************************************************/
#include "statistics.hpp"
#include "adapter_id.hpp" // for adapter_id_statistics
#include "debug.hpp"      // for AR_REQUIRE
#include "fastq.hpp"      // for ACGTN, fastq, ACGT
#include "fastq_enc.hpp"  // for PHRED_OFFSET_MIN, PHRED_SCORE_MAX
#include "simd.hpp"       // for size_t
#include "utilities.hpp"  // for prng_seed
#include <algorithm>      // for max, min
#include <cstdlib>        // for size_t
#include <memory>         // for make_shared, __shared_ptr_access, __shared_...
#include <string>         // for basic_string, string
#include <string_view>    // for string_view

namespace adapterremoval {

////////////////////////////////////////////////////////////////////////////////

const size_t DUPLICATION_LEVELS = 16;
const string_vec DUPLICATION_LABELS = { "1",    "2",   "3",   "4",
                                        "5",    "6",   "7",   "8",
                                        "9",    ">10", ">50", ">100",
                                        ">500", ">1k", ">5k", ">10k" };

size_t
duplication_level(size_t count)
{
  if (count > 10000) {
    return 15;
  } else if (count > 5000) {
    return 14;
  } else if (count > 1000) {
    return 13;
  } else if (count > 500) {
    return 12;
  } else if (count > 100) {
    return 11;
  } else if (count > 50) {
    return 10;
  } else if (count > 10) {
    return 9;
  } else {
    return count - 1;
  }
}

duplication_statistics::summary::summary()
  : labels(DUPLICATION_LABELS)
  , total_sequences(DUPLICATION_LEVELS)
  , unique_sequences(DUPLICATION_LEVELS)
  , unique_frac()
{
}

duplication_statistics::duplication_statistics(size_t max_unique_sequences)
  : m_max_unique_sequences(max_unique_sequences)
  , m_sequence_counts()
  , m_sequences_counted()
  , m_sequences_counted_at_max()
{
  m_sequence_counts.reserve(max_unique_sequences);
}

void
duplication_statistics::process(const fastq& read)
{
  if (m_max_unique_sequences) {
    if (read.length() > 75) {
      insert(read.sequence().substr(0, 50));
    } else {
      insert(read.sequence());
    }
  }
}

duplication_statistics::summary
duplication_statistics::summarize() const
{
  // Histogram of sequence duplication counts
  robin_hood::unordered_flat_map<size_t, size_t> histogram;
  for (const auto& it : m_sequence_counts) {
    histogram[it.second]++;
  }

  duplication_statistics::summary result;

  result.total_sequences.resize_up_to(DUPLICATION_LEVELS);
  result.unique_sequences.resize_up_to(DUPLICATION_LEVELS);

  for (const auto& it : histogram) {
    const auto level = it.first;
    const auto count = correct_count(level, it.second);

    const auto slot = duplication_level(level);
    result.total_sequences.inc(slot, count * level);
    result.unique_sequences.inc(slot, count);
  }

  const auto unique_count = result.unique_sequences.sum();
  const auto total_count = result.total_sequences.sum();

  if (total_count) {
    result.unique_frac = unique_count / static_cast<double>(total_count);
    result.total_sequences = result.total_sequences / total_count;
    result.unique_sequences = result.unique_sequences / unique_count;
  } else {
    result.unique_frac = 1.0;
  }

  return result;
}

double
duplication_statistics::correct_count(size_t bin, size_t count) const
{
  // Adapted from
  //   https://github.com/s-andrews/FastQC/blob/v0.11.9/uk/ac/babraham/FastQC/Modules/DuplicationLevel.java#L158

  // See if we can bail out early
  if (m_sequences_counted_at_max == m_sequences_counted) {
    return count;
  }

  // If there aren't enough sequences left to hide another sequence with this
  // count then we can also skip the calculation
  if (m_sequences_counted - count < m_sequences_counted_at_max) {
    return count;
  }

  // If not then we need to see what the likelihood is that we had another
  // sequence with this number of observations which we would have missed.

  // We'll start by working out the probability of NOT seeing a sequence with
  // this duplication level within the first countAtLimit sequences of
  // count.  This is easier than calculating the probability of
  // seeing it.
  double p_not_seeing_at_limit = 1.0;

  // To save doing long calculations which are never going to produce anything
  // meaningful we'll set a limit to our p-value calculation.  This is the
  // probability below which we won't increase our count by 0.01 of an
  // observation.  Once we're below this we stop caring about the corrected
  // value since it's going to be so close to the observed value that we can
  // just return that instead.
  const double limit_of_caring = 1.0 - (count / (count + 0.01));

  for (size_t i = 0; i < m_sequences_counted_at_max; i++) {
    p_not_seeing_at_limit *= ((m_sequences_counted - i) - bin) /
                             static_cast<double>(m_sequences_counted - i);

    if (p_not_seeing_at_limit < limit_of_caring) {
      p_not_seeing_at_limit = 0;
      break;
    }
  }

  // Scale by the chance of seeing
  return count / (1 - p_not_seeing_at_limit);
}

void
duplication_statistics::insert(const std::string& key)
{
  m_sequences_counted++;

  const auto unique_seqs = m_sequence_counts.size();
  if (unique_seqs >= m_max_unique_sequences) {
    auto it = m_sequence_counts.find(key);
    if (it != m_sequence_counts.end()) {
      it->second++;
    }
  } else {
    m_sequence_counts[key]++;
    m_sequences_counted_at_max++;
  }
}

////////////////////////////////////////////////////////////////////////////////

namespace {

void
smoothed_gc_count(rates& distribution, size_t count, size_t length)
{
  // counts are smoothed across adjacent (percentage) bins
  const auto lower = std::max<double>(0, 100 * (count - 0.5) / length);
  const auto upper = std::min<double>(100, 100 * (count + 0.5) / length);

  const size_t lower_i = lower;
  const size_t upper_i = upper;

  const auto lower_f = lower - lower_i;
  const auto upper_f = upper - upper_i;

  if (lower_i == upper_i) {
    // Count falls in a single bin
    distribution.inc(lower_i, (upper - lower));
  } else {
    // Increment first bin/partially overlapped adjacent bins
    distribution.inc(lower_i, (1 - lower_f));
    distribution.inc(upper_i, upper_f);

    // Ranges are half open, except for the final bin
    const auto final_i = count == length ? 101 : upper_i;
    for (size_t i = lower + 1; i < final_i; ++i) {
      distribution.inc(i);
    }
  }
}

} // namespace

fastq_statistics::fastq_statistics(double sample_rate)
  : fastq_statistics(sample_rate, prng_seed())
{
}

fastq_statistics::fastq_statistics(double sample_rate, uint32_t seed)
  : m_sample_rate(sample_rate)
  , m_rng(seed)
  , m_quality_dist(PHRED_SCORE_MAX + 1)
  , m_gc_content_dist(101)
{
}

void
fastq_statistics::process(const fastq& read, size_t num_input_reads)
{
  m_number_of_input_reads += num_input_reads;
  m_number_of_output_reads++;

  const auto sequence_len = read.length();
  m_length_dist.resize_up_to(sequence_len + 1);
  m_length_dist.inc(sequence_len);

  if (std::generate_canonical<float, 32>(m_rng) <= m_sample_rate) {
    m_nucleotide_pos.resize_up_to(sequence_len);
    m_quality_pos.resize_up_to(sequence_len);

    m_number_of_sampled_reads++;

    const std::string_view sequence = read.sequence();
    const std::string_view qualities = read.qualities();

    indexed_count<ACGTN> nucls;
    for (size_t i = 0; i < sequence_len; ++i) {
      const auto nuc = sequence[i];

      nucls.inc(nuc);
      m_nucleotide_pos.inc_unsafe(nuc, i);

      const auto quality = qualities[i] - PHRED_OFFSET_MIN;
      m_quality_pos.inc_unsafe(nuc, i, quality);
      m_quality_dist.inc(quality);
    }

    auto n_at = nucls.get('A') + nucls.get('T');
    auto n_gc = nucls.get('G') + nucls.get('C');
    if (n_at || n_gc) {
      // uncalled bases are not considered for the length of the reads
      smoothed_gc_count(m_gc_content_dist, n_gc, n_gc + n_at);
    }
  }

  if (m_duplication) {
    m_duplication->process(read);
  }
}

void
fastq_statistics::init_duplication_stats(size_t max_unique_sequences)
{
  AR_REQUIRE(!m_duplication);
  m_duplication =
    std::make_shared<duplication_statistics>(max_unique_sequences);
}

fastq_statistics&
fastq_statistics::operator+=(const fastq_statistics& other)
{
  m_number_of_input_reads += other.m_number_of_input_reads;
  m_number_of_output_reads += other.m_number_of_output_reads;
  m_number_of_sampled_reads += other.m_number_of_sampled_reads;
  m_length_dist += other.m_length_dist;
  m_quality_dist += other.m_quality_dist;
  m_gc_content_dist += other.m_gc_content_dist;
  m_nucleotide_pos += other.m_nucleotide_pos;
  m_quality_pos += other.m_quality_pos;

  return *this;
}

trimming_statistics::trimming_statistics(double sample_rate)
  : singleton(std::make_shared<fastq_statistics>(sample_rate))
  , merged(std::make_shared<fastq_statistics>(sample_rate))
  , discarded(std::make_shared<fastq_statistics>(sample_rate))
{
  // Synchronize sampling of mate 1 and mate 2 reads
  const auto seed = prng_seed();
  read_1 = std::make_shared<fastq_statistics>(sample_rate, seed);
  read_2 = std::make_shared<fastq_statistics>(sample_rate, seed);
}

trimming_statistics&
trimming_statistics::operator+=(const trimming_statistics& other)
{
  *read_1 += *other.read_1;
  *read_2 += *other.read_2;
  *singleton += *other.singleton;
  *merged += *other.merged;
  *discarded += *other.discarded;

  insert_sizes += other.insert_sizes;
  adapter_trimmed_reads += other.adapter_trimmed_reads;
  adapter_trimmed_bases += other.adapter_trimmed_bases;
  reads_merged += other.reads_merged;
  terminal_pre_trimmed += other.terminal_pre_trimmed;
  terminal_post_trimmed += other.terminal_post_trimmed;
  poly_x_pre_trimmed_reads += other.poly_x_pre_trimmed_reads;
  poly_x_pre_trimmed_bases += other.poly_x_pre_trimmed_bases;
  poly_x_post_trimmed_reads += other.poly_x_post_trimmed_reads;
  poly_x_post_trimmed_bases += other.poly_x_post_trimmed_bases;
  low_quality_trimmed += other.low_quality_trimmed;
  total_trimmed += other.total_trimmed;
  filtered_min_length += other.filtered_min_length;
  filtered_max_length += other.filtered_max_length;
  filtered_ambiguous += other.filtered_ambiguous;
  filtered_mean_quality += other.filtered_mean_quality;
  filtered_low_complexity += other.filtered_low_complexity;

  return *this;
}

demux_statistics::demux_statistics(double sample_rate)
  : unidentified_stats_1(std::make_shared<fastq_statistics>(sample_rate))
  , unidentified_stats_2(std::make_shared<fastq_statistics>(sample_rate))
{
}

size_t
demux_statistics::total() const
{
  size_t total = unidentified + ambiguous;
  for (const auto count : samples) {
    total += count;
  }

  return total;
}

statistics::statistics(double sample_rate)
  : input_1(std::make_shared<fastq_statistics>(sample_rate))
  , input_2(std::make_shared<fastq_statistics>(sample_rate))
  , demultiplexing(std::make_shared<demux_statistics>(sample_rate))
{
}

statistics_builder::statistics_builder()
  : m_sample_count(0)
  , m_sample_rate(1.0)
  , m_max_unique(0)
  , m_adapter_id(0)
{
}

statistics_builder&
statistics_builder::sample_rate(double rate)
{
  m_sample_rate = rate;

  return *this;
}

statistics_builder&
statistics_builder::demultiplexing(size_t samples)
{
  m_sample_count = samples;

  return *this;
}

statistics_builder&
statistics_builder::estimate_duplication(size_t max_unique)
{
  m_max_unique = max_unique;

  return *this;
}

/** Enable adapter identification */
statistics_builder&
statistics_builder::adapter_identification(size_t max_length)
{
  m_adapter_id = max_length;

  return *this;
}

statistics
statistics_builder::initialize() const
{
  statistics stats(m_sample_rate);
  if (m_sample_count) {
    stats.demultiplexing->samples.resize(m_sample_count);
  }

  stats.input_1->init_duplication_stats(m_max_unique);
  stats.input_2->init_duplication_stats(m_max_unique);

  if (m_adapter_id) {
    stats.adapter_id = std::make_shared<adapter_id_statistics>(m_adapter_id);
  }

  return stats;
}

} // namespace adapterremoval

1	/*************************************************************************\
2	* AdapterRemoval - cleaning next-generation sequencing reads *
3	* *
4	* Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk *
5	* Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com *
6	* Copyright (C) 2010-17 by Simon Andrews *
7	* *
8	* This program is free software: you can redistribute it and/or modify *
9	* it under the terms of the GNU General Public License as published by *
10	* the Free Software Foundation, either version 3 of the License, or *
11	* (at your option) any later version. *
12	* *
13	* This program is distributed in the hope that it will be useful, *
14	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16	* GNU General Public License for more details. *
17	* *
18	* You should have received a copy of the GNU General Public License *
19	* along with this program. If not, see <http://www.gnu.org/licenses/>. *
20	\*************************************************************************/
21	#include "statistics.hpp"
22	#include "adapter_id.hpp" // for adapter_id_statistics
23	#include "debug.hpp" // for AR_REQUIRE
24	#include "fastq.hpp" // for ACGTN, fastq, ACGT
25	#include "fastq_enc.hpp" // for PHRED_OFFSET_MIN, PHRED_SCORE_MAX
26	#include "simd.hpp" // for size_t
27	#include "utilities.hpp" // for prng_seed
28	#include <algorithm> // for max, min
29	#include <cstdlib> // for size_t
30	#include <memory> // for make_shared, __shared_ptr_access, __shared_...
31	#include <string> // for basic_string, string
32	#include <string_view> // for string_view
33
34	namespace adapterremoval {
35
36	////////////////////////////////////////////////////////////////////////////////
37
38	const size_t DUPLICATION_LEVELS = 16;
39	const string_vec DUPLICATION_LABELS = { "1", "2", "3", "4",
40	"5", "6", "7", "8",
41	"9", ">10", ">50", ">100",
42	">500", ">1k", ">5k", ">10k" };
43
44	size_t
45	duplication_level(size_t count)	×
46	{
47	if (count > 10000) {	×
48	return 15;
49	} else if (count > 5000) {	×
50	return 14;
51	} else if (count > 1000) {	×
52	return 13;
53	} else if (count > 500) {	×
54	return 12;
55	} else if (count > 100) {	×
56	return 11;
57	} else if (count > 50) {	×
58	return 10;
59	} else if (count > 10) {	×
60	return 9;
61	} else {
62	return count - 1;	×
63	}
64	}
65
66	duplication_statistics::summary::summary()	×
67	: labels(DUPLICATION_LABELS)	×
68	, total_sequences(DUPLICATION_LEVELS)	×
69	, unique_sequences(DUPLICATION_LEVELS)	×
70	, unique_frac()	×
71	{
72	}
73
74	duplication_statistics::duplication_statistics(size_t max_unique_sequences)	×
75	: m_max_unique_sequences(max_unique_sequences)	×
76	, m_sequence_counts()	×
77	, m_sequences_counted()	×
78	, m_sequences_counted_at_max()	×
79	{
80	m_sequence_counts.reserve(max_unique_sequences);	×
81	}
82
83	void
84	duplication_statistics::process(const fastq& read)	×
85	{
86	if (m_max_unique_sequences) {	×
87	if (read.length() > 75) {	×
88	insert(read.sequence().substr(0, 50));	×
89	} else {
90	insert(read.sequence());	×
91	}
92	}
93	}
94
95	duplication_statistics::summary
96	duplication_statistics::summarize() const	×
97	{
98	// Histogram of sequence duplication counts
99	robin_hood::unordered_flat_map<size_t, size_t> histogram;	×
100	for (const auto& it : m_sequence_counts) {	×
101	histogram[it.second]++;	×
102	}
103
104	duplication_statistics::summary result;	×
105
106	result.total_sequences.resize_up_to(DUPLICATION_LEVELS);	×
107	result.unique_sequences.resize_up_to(DUPLICATION_LEVELS);	×
108
109	for (const auto& it : histogram) {	×
110	const auto level = it.first;	×
111	const auto count = correct_count(level, it.second);	×
112
113	const auto slot = duplication_level(level);	×
114	result.total_sequences.inc(slot, count * level);	×
115	result.unique_sequences.inc(slot, count);	×
116	}
117
118	const auto unique_count = result.unique_sequences.sum();	×
119	const auto total_count = result.total_sequences.sum();	×
120
121	if (total_count) {	×
122	result.unique_frac = unique_count / static_cast<double>(total_count);	×
123	result.total_sequences = result.total_sequences / total_count;	×
124	result.unique_sequences = result.unique_sequences / unique_count;	×
125	} else {
126	result.unique_frac = 1.0;	×
127	}
128
129	return result;	×
130	}
131
132	double
133	duplication_statistics::correct_count(size_t bin, size_t count) const	×
134	{
135	// Adapted from
136	// https://github.com/s-andrews/FastQC/blob/v0.11.9/uk/ac/babraham/FastQC/Modules/DuplicationLevel.java#L158
137
138	// See if we can bail out early
139	if (m_sequences_counted_at_max == m_sequences_counted) {	×
140	return count;	×
141	}
142
143	// If there aren't enough sequences left to hide another sequence with this
144	// count then we can also skip the calculation
145	if (m_sequences_counted - count < m_sequences_counted_at_max) {	×
146	return count;	×
147	}
148
149	// If not then we need to see what the likelihood is that we had another
150	// sequence with this number of observations which we would have missed.
151
152	// We'll start by working out the probability of NOT seeing a sequence with
153	// this duplication level within the first countAtLimit sequences of
154	// count. This is easier than calculating the probability of
155	// seeing it.
156	double p_not_seeing_at_limit = 1.0;	×
157
158	// To save doing long calculations which are never going to produce anything
159	// meaningful we'll set a limit to our p-value calculation. This is the
160	// probability below which we won't increase our count by 0.01 of an
161	// observation. Once we're below this we stop caring about the corrected
162	// value since it's going to be so close to the observed value that we can
163	// just return that instead.
164	const double limit_of_caring = 1.0 - (count / (count + 0.01));	×
165
166	for (size_t i = 0; i < m_sequences_counted_at_max; i++) {	×
167	p_not_seeing_at_limit *= ((m_sequences_counted - i) - bin) /	×
168	static_cast<double>(m_sequences_counted - i);
169
170	if (p_not_seeing_at_limit < limit_of_caring) {	×
171	p_not_seeing_at_limit = 0;
172	break;
173	}
174	}
175
176	// Scale by the chance of seeing
177	return count / (1 - p_not_seeing_at_limit);	×
178	}
179
180	void
181	duplication_statistics::insert(const std::string& key)	×
182	{
183	m_sequences_counted++;	×
184
185	const auto unique_seqs = m_sequence_counts.size();	×
186	if (unique_seqs >= m_max_unique_sequences) {	×
187	auto it = m_sequence_counts.find(key);	×
188	if (it != m_sequence_counts.end()) {	×
189	it->second++;	×
190	}
191	} else {
192	m_sequence_counts[key]++;	×
193	m_sequences_counted_at_max++;	×
194	}
195	}
196
197	////////////////////////////////////////////////////////////////////////////////
198
199	namespace {
200
201	void
202	smoothed_gc_count(rates& distribution, size_t count, size_t length)	×
203	{
204	// counts are smoothed across adjacent (percentage) bins
205	const auto lower = std::max<double>(0, 100 * (count - 0.5) / length);	×
206	const auto upper = std::min<double>(100, 100 * (count + 0.5) / length);	×
207
208	const size_t lower_i = lower;	×
209	const size_t upper_i = upper;	×
210
211	const auto lower_f = lower - lower_i;	×
212	const auto upper_f = upper - upper_i;	×
213
214	if (lower_i == upper_i) {	×
215	// Count falls in a single bin
216	distribution.inc(lower_i, (upper - lower));	×
217	} else {
218	// Increment first bin/partially overlapped adjacent bins
219	distribution.inc(lower_i, (1 - lower_f));	×
220	distribution.inc(upper_i, upper_f);	×
221
222	// Ranges are half open, except for the final bin
223	const auto final_i = count == length ? 101 : upper_i;	×
224	for (size_t i = lower + 1; i < final_i; ++i) {	×
225	distribution.inc(i);	×
226	}
227	}
228	}
229
230	} // namespace
231
232	fastq_statistics::fastq_statistics(double sample_rate)	×
233	: fastq_statistics(sample_rate, prng_seed())	×
234	{
235	}
236
237	fastq_statistics::fastq_statistics(double sample_rate, uint32_t seed)	×
238	: m_sample_rate(sample_rate)	×
239	, m_rng(seed)	×
240	, m_quality_dist(PHRED_SCORE_MAX + 1)	×
241	, m_gc_content_dist(101)	×
242	{
243	}
244
245	void
246	fastq_statistics::process(const fastq& read, size_t num_input_reads)	×
247	{
248	m_number_of_input_reads += num_input_reads;	×
249	m_number_of_output_reads++;	×
250
251	const auto sequence_len = read.length();	×
252	m_length_dist.resize_up_to(sequence_len + 1);	×
253	m_length_dist.inc(sequence_len);	×
254
255	if (std::generate_canonical<float, 32>(m_rng) <= m_sample_rate) {	×
256	m_nucleotide_pos.resize_up_to(sequence_len);	×
257	m_quality_pos.resize_up_to(sequence_len);	×
258
259	m_number_of_sampled_reads++;	×
260
261	const std::string_view sequence = read.sequence();	×
262	const std::string_view qualities = read.qualities();	×
263
264	indexed_count<ACGTN> nucls;	×
265	for (size_t i = 0; i < sequence_len; ++i) {	×
266	const auto nuc = sequence[i];	×
267
268	nucls.inc(nuc);	×
269	m_nucleotide_pos.inc_unsafe(nuc, i);	×
270
271	const auto quality = qualities[i] - PHRED_OFFSET_MIN;	×
272	m_quality_pos.inc_unsafe(nuc, i, quality);	×
273	m_quality_dist.inc(quality);	×
274	}
275
276	auto n_at = nucls.get('A') + nucls.get('T');	×
277	auto n_gc = nucls.get('G') + nucls.get('C');	×
278	if (n_at \|\| n_gc) {	×
279	// uncalled bases are not considered for the length of the reads
280	smoothed_gc_count(m_gc_content_dist, n_gc, n_gc + n_at);	×
281	}
282	}
283
284	if (m_duplication) {	×
285	m_duplication->process(read);	×
286	}
287	}
288
289	void
290	fastq_statistics::init_duplication_stats(size_t max_unique_sequences)	×
291	{
292	AR_REQUIRE(!m_duplication);	×
293	m_duplication =	×
294	std::make_shared<duplication_statistics>(max_unique_sequences);	×
295	}
296
297	fastq_statistics&
298	fastq_statistics::operator+=(const fastq_statistics& other)	×
299	{
300	m_number_of_input_reads += other.m_number_of_input_reads;	×
301	m_number_of_output_reads += other.m_number_of_output_reads;	×
302	m_number_of_sampled_reads += other.m_number_of_sampled_reads;	×
303	m_length_dist += other.m_length_dist;	×
304	m_quality_dist += other.m_quality_dist;	×
305	m_gc_content_dist += other.m_gc_content_dist;	×
306	m_nucleotide_pos += other.m_nucleotide_pos;	×
307	m_quality_pos += other.m_quality_pos;	×
308
309	return *this;	×
310	}
311
312	trimming_statistics::trimming_statistics(double sample_rate)	×
313	: singleton(std::make_shared<fastq_statistics>(sample_rate))	×
314	, merged(std::make_shared<fastq_statistics>(sample_rate))	×
315	, discarded(std::make_shared<fastq_statistics>(sample_rate))	×
316	{
317	// Synchronize sampling of mate 1 and mate 2 reads
318	const auto seed = prng_seed();	×
319	read_1 = std::make_shared<fastq_statistics>(sample_rate, seed);	×
320	read_2 = std::make_shared<fastq_statistics>(sample_rate, seed);	×
321	}
322
323	trimming_statistics&
324	trimming_statistics::operator+=(const trimming_statistics& other)	×
325	{
326	read_1 += other.read_1;	×
327	read_2 += other.read_2;	×
328	singleton += other.singleton;	×
329	merged += other.merged;	×
330	discarded += other.discarded;	×
331
332	insert_sizes += other.insert_sizes;	×
333	adapter_trimmed_reads += other.adapter_trimmed_reads;	×
334	adapter_trimmed_bases += other.adapter_trimmed_bases;	×
335	reads_merged += other.reads_merged;	×
336	terminal_pre_trimmed += other.terminal_pre_trimmed;	×
337	terminal_post_trimmed += other.terminal_post_trimmed;	×
338	poly_x_pre_trimmed_reads += other.poly_x_pre_trimmed_reads;	×
339	poly_x_pre_trimmed_bases += other.poly_x_pre_trimmed_bases;	×
340	poly_x_post_trimmed_reads += other.poly_x_post_trimmed_reads;	×
341	poly_x_post_trimmed_bases += other.poly_x_post_trimmed_bases;	×
342	low_quality_trimmed += other.low_quality_trimmed;	×
343	total_trimmed += other.total_trimmed;	×
344	filtered_min_length += other.filtered_min_length;	×
345	filtered_max_length += other.filtered_max_length;	×
346	filtered_ambiguous += other.filtered_ambiguous;	×
347	filtered_mean_quality += other.filtered_mean_quality;	×
348	filtered_low_complexity += other.filtered_low_complexity;	×
349
350	return *this;	×
351	}
352
353	demux_statistics::demux_statistics(double sample_rate)	×
354	: unidentified_stats_1(std::make_shared<fastq_statistics>(sample_rate))	×
355	, unidentified_stats_2(std::make_shared<fastq_statistics>(sample_rate))	×
356	{
357	}
358
359	size_t
360	demux_statistics::total() const	×
361	{
362	size_t total = unidentified + ambiguous;	×
363	for (const auto count : samples) {	×
364	total += count;	×
365	}
366
367	return total;	×
368	}
369
370	statistics::statistics(double sample_rate)	×
371	: input_1(std::make_shared<fastq_statistics>(sample_rate))	×
372	, input_2(std::make_shared<fastq_statistics>(sample_rate))	×
373	, demultiplexing(std::make_shared<demux_statistics>(sample_rate))	×
374	{
375	}
376
377	statistics_builder::statistics_builder()	×
378	: m_sample_count(0)	×
379	, m_sample_rate(1.0)	×
380	, m_max_unique(0)	×
381	, m_adapter_id(0)	×
382	{
383	}
384
385	statistics_builder&
386	statistics_builder::sample_rate(double rate)	×
387	{
388	m_sample_rate = rate;	×
389
390	return *this;	×
391	}
392
393	statistics_builder&
394	statistics_builder::demultiplexing(size_t samples)	×
395	{
396	m_sample_count = samples;	×
397
398	return *this;	×
399	}
400
401	statistics_builder&
402	statistics_builder::estimate_duplication(size_t max_unique)	×
403	{
404	m_max_unique = max_unique;	×
405
406	return *this;	×
407	}
408
409	/** Enable adapter identification */
410	statistics_builder&
411	statistics_builder::adapter_identification(size_t max_length)	×
412	{
413	m_adapter_id = max_length;	×
414
415	return *this;	×
416	}
417
418	statistics
419	statistics_builder::initialize() const	×
420	{
421	statistics stats(m_sample_rate);	×
422	if (m_sample_count) {	×
423	stats.demultiplexing->samples.resize(m_sample_count);	×
424	}
425
426	stats.input_1->init_duplication_stats(m_max_unique);	×
427	stats.input_2->init_duplication_stats(m_max_unique);	×
428
429	if (m_adapter_id) {	×
430	stats.adapter_id = std::make_shared<adapter_id_statistics>(m_adapter_id);	×
431	}
432
433	return stats;	×
434	}	×
435
436	} // namespace adapterremoval	×

MikkelSchubert / adapterremoval / #45

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous