MikkelSchubert / adapterremoval / #45

Committed 20 Sep 2024 06:49PM UTC coverage: 26.244% (-49.2%) from 75.443%

Build # #45

Build Type

push

travis-ci

Committed by

web-flow

Commit Message

attempt to fix coveralls run

Run Details

2458 of 9366 relevant lines covered (26.24%)

4362.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/src/output.cpp

/*************************************************************************\
 * AdapterRemoval - cleaning next-generation sequencing reads            *
 *                                                                       *
 * Copyright (C) 2024 by Mikkel Schubert - mikkelsch@gmail.com           *
 *                                                                       *
 * This program is free software: you can redistribute it and/or modify  *
 * it under the terms of the GNU General Public License as published by  *
 * the Free Software Foundation, either version 3 of the License, or     *
 * (at your option) any later version.                                   *
 *                                                                       *
 * This program is distributed in the hope that it will be useful,       *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 * GNU General Public License for more details.                          *
 *                                                                       *
 * You should have received a copy of the GNU General Public License     *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
\*************************************************************************/
#include "output.hpp"     // declarations
#include "buffer.hpp"     // for buffer
#include "debug.hpp"      // for AR_REQUIRE, AR_FAIL
#include "fastq.hpp"      // for fastq
#include "fastq_io.hpp"   // for write_fastq, split_fastq, gzip_split_fastq
#include "scheduler.hpp"  // for analytical_chunk
#include "serializer.hpp" // for fastq_serializer
#include "strutils.hpp"   // for ends_with
#include <limits>         // for numeric_limits

namespace adapterremoval {

class userconfig;

namespace {

/**
 * Returns the last buffer belonging to `chunk`. If the chunk does not contain
 * any buffers yet, an empty buffer is appended first and that buffer is
 * returned. The chunk pointer itself is required to be set.
 */
buffer&
get_buffer(chunk_ptr& chunk)
{
  AR_REQUIRE(chunk);

  auto& buffers = chunk->buffers;
  if (buffers.empty()) {
    // Make sure that there is always a buffer to return
    buffers.emplace_back();
  }

  return buffers.back();
}

} // namespace

////////////////////////////////////////////////////////////////////////////////
// Implementations for `sample_output_files`

// Sentinel stored in `m_offsets` for read types that have no output file, and
// used as the initial `step` value of newly registered output files
const size_t sample_output_files::disabled = std::numeric_limits<size_t>::max();

/**
 * Registers the scheduler steps used to write `file` and returns the ID of the
 * last step that was added.
 *
 * A `write_fastq` step is always added. For the gzip and BAM based formats a
 * `gzip_split_fastq` step and then a `split_fastq` step are added as well, each
 * being constructed with the ID of the step added before it (presumably the
 * step that it forwards its output to; confirm against `fastq_io.hpp`).
 *
 * The file must not be DEV_NULL; unknown formats trigger AR_FAIL.
 */
size_t
add_write_step(scheduler& sch,
               const userconfig& config,
               const output_file& file)
{
  AR_REQUIRE(file.name != DEV_NULL);

  const size_t writer = sch.add<write_fastq>(config, file);
  size_t last_added = writer;

  switch (file.format) {
    // These formats only need the `write_fastq` step
    case output_format::fastq:
    case output_format::sam:
      break;

    // These formats get two additional steps chained onto the writer
    case output_format::fastq_gzip:
    case output_format::sam_gzip:
    case output_format::bam:
    case output_format::ubam: {
      const size_t gzip = sch.add<gzip_split_fastq>(config, file, writer);
      last_added = sch.add<split_fastq>(config, file, gzip);
      break;
    }

    default:
      AR_FAIL("invalid output format");
  }

  return last_added;
}

/** Creates an empty mapping in which every read type is marked as disabled. */
sample_output_files::sample_output_files()
{
  for (auto& offset : m_offsets) {
    offset = sample_output_files::disabled;
  }
}

/**
 * Assigns an output file to the read type `rtype`, which must not have been
 * assigned a file already.
 *
 * Files named DEV_NULL are not registered, meaning that the read type remains
 * disabled. If a file with the same name has already been registered, then the
 * read type is pointed at the existing entry, which is required to be equal to
 * `file`; otherwise the file is appended to the list of output files with its
 * step left as `disabled`.
 */
void
sample_output_files::set_file(const read_type rtype, output_file file)
{
  const auto index = static_cast<size_t>(rtype);
  AR_REQUIRE(m_offsets.at(index) == sample_output_files::disabled);

  // If the file type isn't being saved, then there is no need to process the
  // reads. This saves time especially when output compression is enabled.
  if (file.name == DEV_NULL) {
    return;
  }

  // FIXME: This assumes that filesystem is case sensitive
  auto it = m_output.begin();
  while (it != m_output.end() && !(file.name == it->file.name)) {
    ++it;
  }

  if (it != m_output.end()) {
    // Multiple read types may share a file, but only with identical settings
    AR_REQUIRE(file == it->file);
    m_offsets.at(index) = static_cast<size_t>(it - m_output.begin());
    return;
  }

  m_output.push_back({ std::move(file), disabled });
  m_offsets.at(index) = m_output.size() - 1;
}

/**
 * Records `step` for the output file assigned to the read type `rtype`. The
 * read type must have been assigned a file, and that file must not have been
 * given a step already.
 */
void
sample_output_files::set_step(read_type rtype, size_t step)
{
  const auto offset = m_offsets.at(static_cast<size_t>(rtype));
  AR_REQUIRE(offset != disabled);

  // Each output file may be assigned a step exactly once
  AR_REQUIRE(m_output.at(offset).step == disabled);
  m_output.at(offset).step = step;
}

/**
 * Returns the offset stored for the given read type; this is `disabled` for
 * read types that have not been assigned an output file.
 */
size_t
sample_output_files::offset(read_type value) const
{
  const auto index = static_cast<size_t>(value);

  return m_offsets.at(index);
}

////////////////////////////////////////////////////////////////////////////////
// Implementations for `output_files`

// Sentinel for steps that have not been assigned (the largest `size_t` value)
const size_t output_files::disabled = std::numeric_limits<size_t>::max();

/**
 * Parses an output format name, ignoring case. The name "ubam" selects
 * `output_format::ubam`; any other name is prefixed with "." and handed to
 * `parse_extension`. Returns true and updates `sink` if the name was
 * recognized, and returns false otherwise.
 */
bool
output_files::parse_format(std::string_view filename, output_format& sink)
{
  const std::string value = to_lower(std::string(filename));

  // ubam is tested separately, to avoid supporting non-standard file extensions
  if (value != "ubam") {
    return parse_extension("." + value, sink);
  }

  sink = output_format::ubam;
  return true;
}

/**
 * Determines the output format from the extension of `filename`, ignoring
 * case. Returns true and updates `sink` if the filename ends with a known
 * extension; otherwise returns false and leaves `sink` untouched.
 */
bool
output_files::parse_extension(std::string_view filename, output_format& sink)
{
  const std::string value = to_lower(std::string(filename));

  // Compressed variants are tested before their uncompressed counterparts
  if (ends_with(value, ".fq.gz") || ends_with(value, ".fastq.gz")) {
    sink = output_format::fastq_gzip;
    return true;
  }

  if (ends_with(value, ".fq") || ends_with(value, ".fastq")) {
    sink = output_format::fastq;
    return true;
  }

  if (ends_with(value, ".sam.gz")) {
    sink = output_format::sam_gzip;
    return true;
  }

  if (ends_with(value, ".sam")) {
    sink = output_format::sam;
    return true;
  }

  if (ends_with(value, ".bam")) {
    sink = output_format::bam;
    return true;
  }

  return false;
}

/**
 * Returns the file extension, including the leading dot, used for the given
 * output format. Both `bam` and `ubam` map to ".bam". Unknown formats trigger
 * AR_FAIL.
 */
std::string_view
output_files::file_extension(const output_format format)
{
  std::string_view extension;

  switch (format) {
    case output_format::fastq:
      extension = ".fastq";
      break;
    case output_format::fastq_gzip:
      extension = ".fastq.gz";
      break;
    case output_format::sam:
      extension = ".sam";
      break;
    case output_format::sam_gzip:
      extension = ".sam.gz";
      break;
    case output_format::bam:
    case output_format::ubam:
      extension = ".bam";
      break;
    default:
      AR_FAIL("invalid output format");
  }

  return extension;
}

/**
 * Adds write steps to the scheduler for every registered per-sample output
 * file and for the files used for unidentified reads, recording the resulting
 * step IDs. No steps may have been assigned prior to calling this function.
 *
 * Unidentified files named DEV_NULL are skipped, leaving their step as
 * `disabled`. If both unidentified files have the same name, then mate 2
 * reuses the step of mate 1.
 */
void
output_files::add_write_steps(scheduler& sch, const userconfig& config)
{
  AR_REQUIRE(unidentified_1_step == disabled &&
             unidentified_2_step == disabled);

  for (auto& sample_files : m_samples) {
    for (auto& it : sample_files.m_output) {
      AR_REQUIRE(it.step == disabled);
      it.step = add_write_step(sch, config, it.file);
    }
  }

  const bool write_unidentified_1 = unidentified_1.name != DEV_NULL;
  if (write_unidentified_1) {
    unidentified_1_step = add_write_step(sch, config, unidentified_1);
  }

  const bool shared_file = unidentified_1.name == unidentified_2.name;
  if (shared_file) {
    // Both mates are written to the same file and hence use the same step
    unidentified_2_step = unidentified_1_step;
  } else if (unidentified_2.name != DEV_NULL) {
    unidentified_2_step = add_write_step(sch, config, unidentified_2);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Implementations for `processed_reads`

/**
 * Creates one empty chunk and one serializer per output file in `map`; each
 * serializer is constructed from the format of the corresponding file.
 */
processed_reads::processed_reads(const sample_output_files& map)
  : m_map(map)
{
  const size_t n_files = map.size();

  for (size_t idx = 0; idx != n_files; ++idx) {
    m_chunks.push_back(std::make_unique<analytical_chunk>());
    m_serializers.emplace_back(map.format(idx));
  }
}

/** Passes the sample on to the serializer of every output file. */
void
processed_reads::set_sample(const sample& value)
{
  for (auto it = m_serializers.begin(); it != m_serializers.end(); ++it) {
    it->set_sample(value);
  }
}

/**
 * Stores the mate separator, which is later assigned to the chunks returned
 * by `finalize`, and passes it on to the serializer of every output file.
 */
void
processed_reads::set_mate_separator(char value)
{
  m_mate_separator = value;

  for (auto it = m_serializers.begin(); it != m_serializers.end(); ++it) {
    it->set_mate_separator(value);
  }
}

/** Passes the demultiplexing-only flag on to every serializer. */
void
processed_reads::set_demultiplexing_only(bool value)
{
  for (auto it = m_serializers.begin(); it != m_serializers.end(); ++it) {
    it->set_demultiplexing_only(value);
  }
}

void
processed_reads::write_headers(const string_vec& args)
{
  for (size_t i = 0; i < m_chunks.size(); ++i) {
    m_serializers.at(i).header(get_buffer(m_chunks.at(i)), args);
  }
}

void
processed_reads::add(const fastq& read,
                     const read_type type,
                     const fastq_flags flags,
                     const size_t barcode)
{
  const size_t offset = m_map.offset(type);
  if (offset != sample_output_files::disabled) {
    auto& buffer = get_buffer(m_chunks.at(offset));
    m_serializers.at(offset).record(buffer, read, flags, barcode);
    m_chunks.at(offset)->reads++;
  }
}

/**
 * Marks every chunk with the EOF flag and the stored mate separator, and
 * returns the chunks paired with the step assigned to their output file.
 *
 * The chunks are moved into the returned vector, so `m_chunks` no longer owns
 * them afterwards.
 */
chunk_vec
processed_reads::finalize(bool eof)
{
  chunk_vec chunks;

  size_t idx = 0;
  for (auto& chunk : m_chunks) {
    chunk->eof = eof;
    chunk->mate_separator = m_mate_separator;

    chunks.emplace_back(m_map.step(idx), std::move(chunk));
    ++idx;
  }

  return chunks;
}

///////////////////////////////////////////////////////////////////////////////
// Implementations for `post_demux_steps`

// Sentinel for steps that have not been set; identical in value to
// `output_files::disabled`. The `demultiplexed_reads` constructor requires
// that no per-sample step is equal to this value.
const size_t post_demux_steps::disabled = output_files::disabled;

///////////////////////////////////////////////////////////////////////////////
// Implementations for `demultiplexed_reads`

namespace {

/**
 * Moves the cached chunk `ptr` into `output`, paired with the ID of the `step`
 * that is to receive it, and replaces `ptr` with a new, empty chunk.
 *
 * The chunk is only flushed at `eof` or once it contains at least INPUT_READS
 * reads; otherwise it is left in place so that more reads can be added. The
 * flushed chunk is assigned the `eof` flag and the `mate_separator`.
 */
void
flush_chunk(chunk_vec& output,
            chunk_ptr& ptr,
            size_t step,
            const bool eof,
            const char mate_separator)
{
  // NOTE(review): The `demultiplexed_reads` functions in this file append to
  // `reads_1`/`reads_2` but never update the `reads` counter, so testing only
  // `reads` would cache every read until EOF. The number of cached mate 1
  // reads is therefore tested as well; the original test is kept in case
  // `reads` is maintained elsewhere -- confirm against the callers.
  const auto cached_reads = ptr->reads_1.size();
  const bool is_full = ptr->reads >= INPUT_READS ||
                       cached_reads >= static_cast<size_t>(INPUT_READS);

  if (eof || is_full) {
    ptr->eof = eof;
    ptr->mate_separator = mate_separator;
    output.emplace_back(step, std::move(ptr));

    // Subsequent reads are collected in a fresh chunk
    ptr = std::make_unique<analytical_chunk>();
  }
}

} // namespace

/**
 * Creates the cache of demultiplexed reads: one chunk for unidentified reads
 * followed by one chunk per sample in `steps`. Every sample is required to
 * have been assigned a step.
 */
demultiplexed_reads::demultiplexed_reads(const post_demux_steps& steps)
  : m_steps(steps)
{
  // The first chunks should have headers written depending on format
  const auto make_first_chunk = [] {
    auto chunk = std::make_unique<analytical_chunk>();
    chunk->first = true;
    return chunk;
  };

  // Position 0 is used for unidentified reads
  m_cache.push_back(make_first_chunk());

  for (const auto next_step : m_steps.samples) {
    AR_REQUIRE(next_step != post_demux_steps::disabled);

    m_cache.push_back(make_first_chunk());
  }
}

/** Adds a mate 1 read to the chunk of unidentified reads (position 0). */
void
demultiplexed_reads::add_unidentified_1(fastq&& read)
{
  auto& unidentified = *m_cache.at(0);

  unidentified.reads_1.push_back(std::move(read));
}

/** Adds a mate 2 read to the chunk of unidentified reads (position 0). */
void
demultiplexed_reads::add_unidentified_2(fastq&& read)
{
  auto& unidentified = *m_cache.at(0);

  unidentified.reads_2.push_back(std::move(read));
}

/**
 * Adds a mate 1 read and its barcode to the chunk of the given sample. Sample
 * chunks are stored after the chunk of unidentified reads, hence the offset.
 */
void
demultiplexed_reads::add_read_1(fastq&& read, size_t sample, size_t barcode)
{
  const size_t position = sample + 1;
  auto& cached = m_cache.at(position);

  cached->reads_1.push_back(std::move(read));
  cached->barcodes.push_back(barcode);
}

/**
 * Adds a mate 2 read to the chunk of the given sample. Sample chunks are
 * stored after the chunk of unidentified reads, hence the offset.
 */
void
demultiplexed_reads::add_read_2(fastq&& read, size_t sample)
{
  const size_t position = sample + 1;
  auto& cached = m_cache.at(position);

  cached->reads_2.push_back(std::move(read));
}

/**
 * Collects the cached chunks that are ready to be passed on, as decided by
 * `flush_chunk`, each paired with the step that is to receive it. The chunk of
 * unidentified reads is handled first, followed by the samples in order.
 */
chunk_vec
demultiplexed_reads::flush(bool eof, char mate_separator)
{
  chunk_vec output;

  // Unidentified reads; these are treated as a pseudo-sample for simplicity
  flush_chunk(output, m_cache.at(0), m_steps.unidentified, eof, mate_separator);

  // The chunk for sample N is found at position N + 1 in the cache
  for (size_t sample = 0; sample + 1 < m_cache.size(); ++sample) {
    auto& cached = m_cache.at(sample + 1);
    const auto step = m_steps.samples.at(sample);

    flush_chunk(output, cached, step, eof, mate_separator);
  }

  return output;
}

} // namespace adapterremoval

1	/*************************************************************************\
2	* AdapterRemoval - cleaning next-generation sequencing reads *
3	* *
4	* Copyright (C) 2024 by Mikkel Schubert - mikkelsch@gmail.com *
5	* *
6	* This program is free software: you can redistribute it and/or modify *
7	* it under the terms of the GNU General Public License as published by *
8	* the Free Software Foundation, either version 3 of the License, or *
9	* (at your option) any later version. *
10	* *
11	* This program is distributed in the hope that it will be useful, *
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14	* GNU General Public License for more details. *
15	* *
16	* You should have received a copy of the GNU General Public License *
17	* along with this program. If not, see <http://www.gnu.org/licenses/>. *
18	\*************************************************************************/
19	#include "output.hpp" // declarations
20	#include "buffer.hpp" // for buffer
21	#include "debug.hpp" // for AR_REQUIRE, AR_FAIL
22	#include "fastq.hpp" // for fastq
23	#include "fastq_io.hpp" // for write_fastq, split_fastq, gzip_split_fastq
24	#include "scheduler.hpp" // for analytical_chunk
25	#include "serializer.hpp" // for fastq_serialzier
26	#include "strutils.hpp" // for ends_with
27	#include <limits> // for numeric_limits
28
29	namespace adapterremoval {
30
31	class userconfig;
32
33	namespace {
34
35	buffer&
36	get_buffer(chunk_ptr& chunk)	×
37	{
38	AR_REQUIRE(chunk);	×
39	if (chunk->buffers.empty()) {	×
40	chunk->buffers.emplace_back();	×
41	}
42
43	return chunk->buffers.back();	×
44	}
45
46	} // namespace
47
48	////////////////////////////////////////////////////////////////////////////////
49	// Implementations for `sample_output_files`
50
51	const size_t sample_output_files::disabled = std::numeric_limits<size_t>::max();
52
53	size_t
54	add_write_step(scheduler& sch,	×
55	const userconfig& config,
56	const output_file& file)
57	{
58	AR_REQUIRE(file.name != DEV_NULL);	×
59	size_t step_id = sch.add<write_fastq>(config, file);	×
60
61	switch (file.format) {	×
62	case output_format::fastq:
63	case output_format::sam:
64	break;
65	case output_format::fastq_gzip:	×
66	case output_format::sam_gzip:	×
67	case output_format::bam:	×
68	case output_format::ubam: {	×
69	step_id = sch.add<gzip_split_fastq>(config, file, step_id);	×
70	step_id = sch.add<split_fastq>(config, file, step_id);	×
71	break;	×
72	}
73	default:	×
74	AR_FAIL("invalid output format");	×
75	}
76
77	return step_id;	×
78	}
79
80	sample_output_files::sample_output_files()	×
81	{
82	m_offsets.fill(sample_output_files::disabled);	×
83	}
84
85	void
86	sample_output_files::set_file(const read_type rtype, output_file file)	×
87	{
88	const auto index = static_cast<size_t>(rtype);	×
89	AR_REQUIRE(m_offsets.at(index) == sample_output_files::disabled);	×
90
91	// If the file type isn't being saved, then there is no need to process the
92	// reads. This saves time especially when output compression is enabled.
93	if (file.name != DEV_NULL) {	×
94	// FIXME: This assumes that filesystem is case sensitive
95	auto it = m_output.begin();	×
96	for (; it != m_output.end(); ++it) {	×
97	if (file.name == it->file.name) {	×
98	break;
99	}
100	}
101
102	if (it == m_output.end()) {	×
103	m_output.push_back({ std::move(file), disabled });	×
104	m_offsets.at(index) = m_output.size() - 1;	×
105	} else {
106	AR_REQUIRE(file == it->file);	×
107	const auto existing_index = it - m_output.begin();	×
108	m_offsets.at(index) = existing_index;	×
109	}
110	}
111	}
112
113	void
114	sample_output_files::set_step(read_type rtype, size_t step)	×
115	{
116	const auto index = static_cast<size_t>(rtype);	×
117	const auto offset = m_offsets.at(index);	×
118	AR_REQUIRE(offset != disabled);	×
119
120	AR_REQUIRE(m_output.at(offset).step == disabled);	×
121	m_output.at(offset).step = step;	×
122	}
123
124	size_t
125	sample_output_files::offset(read_type value) const	×
126	{
127	return m_offsets.at(static_cast<size_t>(value));	×
128	}
129
130	////////////////////////////////////////////////////////////////////////////////
131	// Implementations for `output_files`
132
133	const size_t output_files::disabled = static_cast<size_t>(-1);
134
135	bool
136	output_files::parse_format(std::string_view filename, output_format& sink)	×
137	{
138	const std::string value = to_lower(std::string(filename));	×
139	// ubam is tested separately, to avoid supporting non-standard file extensions
140	if (value == "ubam") {	×
141	sink = output_format::ubam;	×
142	return true;	×
143	}
144
145	return parse_extension("." + value, sink);	×
146	}
147
148	bool
149	output_files::parse_extension(std::string_view filename, output_format& sink)	×
150	{
151	const std::string value = to_lower(std::string(filename));	×
152	if (ends_with(value, ".fq.gz") \|\| ends_with(value, ".fastq.gz")) {	×
153	sink = output_format::fastq_gzip;	×
154	} else if (ends_with(value, ".fq") \|\| ends_with(value, ".fastq")) {	×
155	sink = output_format::fastq;	×
156	} else if (ends_with(value, ".sam.gz")) {	×
157	sink = output_format::sam_gzip;	×
158	} else if (ends_with(value, ".sam")) {	×
159	sink = output_format::sam;	×
160	} else if (ends_with(value, ".bam")) {	×
161	sink = output_format::bam;	×
162	} else {
163	return false;
164	}
165
166	return true;
167	}
168
169	std::string_view
170	output_files::file_extension(const output_format format)	×
171	{
172	switch (format) {	×
173	case output_format::fastq:	×
174	return ".fastq";	×
175	case output_format::fastq_gzip:	×
176	return ".fastq.gz";	×
177	case output_format::sam:	×
178	return ".sam";	×
179	case output_format::sam_gzip:	×
180	return ".sam.gz";	×
181	case output_format::bam:	×
182	case output_format::ubam:	×
183	return ".bam";	×
184	default:	×
185	AR_FAIL("invalid output format");	×
186	}
187	}
188
189	void
190	output_files::add_write_steps(scheduler& sch, const userconfig& config)	×
191	{
192	AR_REQUIRE(unidentified_1_step == disabled &&	×
193	unidentified_2_step == disabled);
194
195	for (auto& sample : m_samples) {	×
196	for (auto& it : sample.m_output) {	×
197	AR_REQUIRE(it.step == disabled);	×
198	it.step = add_write_step(sch, config, it.file);	×
199	}
200	}
201
202	if (unidentified_1.name != DEV_NULL) {	×
203	unidentified_1_step = add_write_step(sch, config, unidentified_1);	×
204	}
205
206	if (unidentified_1.name == unidentified_2.name) {	×
207	unidentified_2_step = unidentified_1_step;	×
208	} else if (unidentified_2.name != DEV_NULL) {	×
209	unidentified_2_step = add_write_step(sch, config, unidentified_2);	×
210	}
211	}
212
213	////////////////////////////////////////////////////////////////////////////////
214	// Implementations for `processed_reads`
215
216	processed_reads::processed_reads(const sample_output_files& map)	×
217	: m_map(map)	×
218	{
219	for (size_t i = 0; i < map.size(); ++i) {	×
220	m_chunks.emplace_back(std::make_unique<analytical_chunk>());	×
221	m_serializers.emplace_back(map.format(i));	×
222	}
223	}
224
225	void
226	processed_reads::set_sample(const sample& value)	×
227	{
228	for (auto& ser : m_serializers) {	×
229	ser.set_sample(value);	×
230	}
231	}
232
233	void
234	processed_reads::set_mate_separator(char value)	×
235	{
236	m_mate_separator = value;	×
237	for (auto& ser : m_serializers) {	×
238	ser.set_mate_separator(value);	×
239	}
240	}
241
242	void
243	processed_reads::set_demultiplexing_only(bool value)	×
244	{
245	for (auto& ser : m_serializers) {	×
246	ser.set_demultiplexing_only(value);	×
247	}
248	}
249
250	void
251	processed_reads::write_headers(const string_vec& args)	×
252	{
253	for (size_t i = 0; i < m_chunks.size(); ++i) {	×
254	m_serializers.at(i).header(get_buffer(m_chunks.at(i)), args);	×
255	}
256	}
257
258	void
259	processed_reads::add(const fastq& read,	×
260	const read_type type,
261	const fastq_flags flags,
262	const size_t barcode)
263	{
264	const size_t offset = m_map.offset(type);	×
265	if (offset != sample_output_files::disabled) {	×
266	auto& buffer = get_buffer(m_chunks.at(offset));	×
267	m_serializers.at(offset).record(buffer, read, flags, barcode);	×
268	m_chunks.at(offset)->reads++;	×
269	}
270	}
271
272	chunk_vec
273	processed_reads::finalize(bool eof)	×
274	{
275	chunk_vec chunks;	×
276
277	for (size_t i = 0; i < m_chunks.size(); ++i) {	×
278	auto& chunk = m_chunks.at(i);	×
279	chunk->mate_separator = m_mate_separator;	×
280	chunk->eof = eof;	×
281
282	chunks.emplace_back(m_map.step(i), std::move(chunk));	×
283	}
284
285	return chunks;	×
286	}	×
287
288	///////////////////////////////////////////////////////////////////////////////
289	// Implementations for `post_demux_steps`
290
291	const size_t post_demux_steps::disabled = output_files::disabled;
292
293	///////////////////////////////////////////////////////////////////////////////
294	// Implementations for `demultiplexed_reads`
295
296	namespace {
297
298	void
299	flush_chunk(chunk_vec& output,	×
300	chunk_ptr& ptr,
301	size_t step,
302	const bool eof,
303	const char mate_separator)
304	{
305	if (eof \|\| ptr->reads >= INPUT_READS) {	×
306	ptr->eof = eof;	×
307	ptr->mate_separator = mate_separator;	×
308	output.emplace_back(step, std::move(ptr));	×
309	ptr = std::make_unique<analytical_chunk>();	×
310	}
311	}
312
313	} // namespace
314
315	demultiplexed_reads::demultiplexed_reads(const post_demux_steps& steps)	×
316	: m_steps(steps)	×
317	{
318	// Position 0 is used for unidentified reads
319	m_cache.push_back(std::make_unique<analytical_chunk>());	×
320
321	for (const auto next_step : m_steps.samples) {	×
322	AR_REQUIRE(next_step != post_demux_steps::disabled);	×
323
324	m_cache.push_back(std::make_unique<analytical_chunk>());	×
325	}
326
327	// The first chunks should have headers written depending on format
328	for (auto& it : m_cache) {	×
329	it->first = true;	×
330	}
331	}
332
333	void
334	demultiplexed_reads::add_unidentified_1(fastq&& read)	×
335	{
336	m_cache.at(0)->reads_1.push_back(std::move(read));	×
337	}
338
339	void
340	demultiplexed_reads::add_unidentified_2(fastq&& read)	×
341	{
342	m_cache.at(0)->reads_2.push_back(std::move(read));	×
343	}
344
345	void
346	demultiplexed_reads::add_read_1(fastq&& read, size_t sample, size_t barcode)	×
347	{
348	auto& chunk = *m_cache.at(sample + 1);	×
349	chunk.reads_1.push_back(std::move(read));	×
350	chunk.barcodes.push_back(barcode);	×
351	}
352
353	void
354	demultiplexed_reads::add_read_2(fastq&& read, size_t sample)	×
355	{
356	m_cache.at(sample + 1)->reads_2.push_back(std::move(read));	×
357	}
358
359	chunk_vec
360	demultiplexed_reads::flush(bool eof, char mate_separator)	×
361	{
362	chunk_vec output;	×
363
364	// Unidentified reads; these are treated as a pseudo-sample for simplicity
365	flush_chunk(output, m_cache.at(0), m_steps.unidentified, eof, mate_separator);	×
366
367	for (size_t i = 1; i < m_cache.size(); ++i) {	×
368	flush_chunk(	×
369	output, m_cache.at(i), m_steps.samples.at(i - 1), eof, mate_separator);	×
370	}
371
372	return output;	×
373	}	×
374
375	} // namespace adapterremoval	×

MikkelSchubert / adapterremoval / #45

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous