#101

Committed 18 Apr 2025 12:19PM UTC coverage: 67.186% (+1.8%) from 65.404%

Build # #101

Build Type

push

travis-ci

Committed by

web-flow

Commit Message

add meta-data class for read serialization (#127)

This simplifies passing of additional information (to be added) that
influences how reads are serialized.

The read_type enum is furthermore expanded, allowing the file_type
to be derived from the read_type enum by the meta data class.

Test cases were added for the serializers

Run Details

472 of 531 new or added lines in 8 files covered. (88.89%)

2 existing lines in 1 file now uncovered.

9697 of 14433 relevant lines covered (67.19%)

3063.78 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/src/output.cpp

// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2024 Mikkel Schubert <mikkelsch@gmail.com>
#include "output.hpp"     // declarations
#include "buffer.hpp"     // for buffer
#include "debug.hpp"      // for AR_REQUIRE, AR_FAIL
#include "fastq.hpp"      // for fastq
#include "fastq_io.hpp"   // for write_fastq, split_fastq, gzip_split_fastq
#include "scheduler.hpp"  // for analytical_chunk
#include "serializer.hpp" // for read_meta
#include "strutils.hpp"   // for ends_with
#include <limits>         // for numeric_limits

namespace adapterremoval {

class userconfig;

namespace {

// Returns the most recently added buffer of `chunk`; an empty buffer is
// appended first if the chunk does not contain any. `chunk` must be non-null.
buffer&
get_buffer(chunk_ptr& chunk)
{
  AR_REQUIRE(chunk);

  auto& buffers = chunk->buffers;
  if (buffers.empty()) {
    buffers.emplace_back();
  }

  return buffers.back();
}

} // namespace

////////////////////////////////////////////////////////////////////////////////
// Implementations for `sample_output_files`

// Sentinel (the largest `size_t`) used in this file to mark offsets and steps
// that have not been assigned
const size_t sample_output_files::disabled = std::numeric_limits<size_t>::max();

// Registers the scheduler steps needed to write `file` and returns the ID of
// the step that was registered last.
//
// A `write_fastq` step is always registered first. For the gzip/BAM formats a
// `gzip_split_fastq` step and then a `split_fastq` step are registered as
// well, each being handed the ID of the step registered just before it
// (presumably the step it forwards its output to -- confirm in fastq_io.hpp).
// `file` must not refer to DEV_NULL.
size_t
add_write_step(scheduler& sch,
               const userconfig& config,
               const output_file& file)
{
  AR_REQUIRE(file.name != DEV_NULL);
  const size_t writer = sch.add<write_fastq>(config, file);

  switch (file.format) {
    // Uncompressed formats are handled by the writer alone
    case output_format::fastq:
    case output_format::sam:
      return writer;

    case output_format::fastq_gzip:
    case output_format::sam_gzip:
    case output_format::bam:
    case output_format::ubam: {
      const size_t gzip = sch.add<gzip_split_fastq>(config, file, writer);

      return sch.add<split_fastq>(config, file, gzip);
    }

    default:
      AR_FAIL("invalid output format");
  }
}

// Starts out with every read type marked as having no output file
sample_output_files::sample_output_files()
{
  m_offsets.fill(disabled);
}

// Assigns the output file used for reads of type `rtype`; this may be done
// only once per read type.
//
// Files named DEV_NULL are not recorded, leaving the read type disabled.
// Read types that name the same file share a single entry in `m_output`, in
// which case the new `file` must compare equal to the recorded one.
void
sample_output_files::set_file(const read_file rtype, output_file file)
{
  const auto index = static_cast<size_t>(rtype);
  AR_REQUIRE(m_offsets.at(index) == sample_output_files::disabled);

  // If the file type isn't being saved, then there is no need to process the
  // reads. This saves time especially when output compression is enabled.
  if (file.name == DEV_NULL) {
    return;
  }

  // FIXME: This assumes that filesystem is case sensitive
  auto it = m_output.begin();
  while (it != m_output.end() && !(file.name == it->file.name)) {
    ++it;
  }

  if (it != m_output.end()) {
    // Already known file; point this read type at the existing entry
    AR_REQUIRE(file == it->file);
    m_offsets.at(index) = static_cast<size_t>(it - m_output.begin());
  } else {
    m_output.push_back({ std::move(file), disabled });
    m_offsets.at(index) = m_output.size() - 1;
  }
}

// Records `step` for the output file assigned to `rtype`. The read type must
// have a file (see `set_file`) and that file must not already have a step.
void
sample_output_files::set_step(read_file rtype, size_t step)
{
  const auto offset = m_offsets.at(static_cast<size_t>(rtype));
  AR_REQUIRE(offset != disabled);

  auto& entry = m_output.at(offset);
  AR_REQUIRE(m_output.at(offset).step == disabled);
  entry.step = step;
}

// Returns the offset recorded for the given read type, or `disabled` if no
// output file has been set for it
size_t
sample_output_files::offset(read_file value) const
{
  const auto index = static_cast<size_t>(value);

  return m_offsets.at(index);
}

////////////////////////////////////////////////////////////////////////////////
// Implementations for `output_files`

// Sentinel for steps that have not been assigned. `add_write_steps` compares
// it against step values initialized from `sample_output_files::disabled`, so
// the two constants must hold the same value.
const size_t output_files::disabled = std::numeric_limits<size_t>::max();

// Parses a format name such as "fastq" or "sam.gz" (case-insensitively) and
// stores the result in `sink`. Returns false, leaving `sink` untouched, if
// the name is not recognized.
bool
output_files::parse_format(std::string_view filename, output_format& sink)
{
  std::string extension = to_lower(std::string(filename));

  // ubam is tested separately, to avoid supporting non-standard file extensions
  if (extension == "ubam") {
    sink = output_format::ubam;
    return true;
  }

  // Every other name is handled as if it was the extension of a filename
  extension.insert(0, 1, '.');

  return parse_extension(extension, sink);
}

// Determines the output format from the extension of `filename`, ignoring
// case, and stores it in `sink`. Returns false, leaving `sink` untouched, if
// the filename has none of the recognized extensions.
bool
output_files::parse_extension(std::string_view filename, output_format& sink)
{
  const std::string name = to_lower(std::string(filename));

  if (ends_with(name, ".fq.gz") || ends_with(name, ".fastq.gz")) {
    sink = output_format::fastq_gzip;
    return true;
  }

  if (ends_with(name, ".fq") || ends_with(name, ".fastq")) {
    sink = output_format::fastq;
    return true;
  }

  if (ends_with(name, ".sam.gz")) {
    sink = output_format::sam_gzip;
    return true;
  }

  if (ends_with(name, ".sam")) {
    sink = output_format::sam;
    return true;
  }

  if (ends_with(name, ".bam")) {
    sink = output_format::bam;
    return true;
  }

  return false;
}

// Returns the filename extension, including the leading dot, that is used for
// files written in the given format; BAM and uBAM both map to ".bam".
std::string_view
output_files::file_extension(const output_format format)
{
  switch (format) {
    // FASTQ
    case output_format::fastq:
      return ".fastq";
    case output_format::fastq_gzip:
      return ".fastq.gz";

    // SAM
    case output_format::sam:
      return ".sam";
    case output_format::sam_gzip:
      return ".sam.gz";

    // BAM
    case output_format::ubam:
    case output_format::bam:
      return ".bam";

    default:
      AR_FAIL("invalid output format");
  }
}

// Registers write steps for every output file of every sample and for the
// files receiving unidentified reads; may only be called while no steps have
// been assigned.
//
// If both unidentified files have the same name, then the second file simply
// reuses the step of the first. Files named DEV_NULL get no step.
void
output_files::add_write_steps(scheduler& sch, const userconfig& config)
{
  AR_REQUIRE(unidentified_1_step == disabled &&
             unidentified_2_step == disabled);

  // Per-sample output files
  for (auto& sample : m_samples) {
    for (auto& it : sample.m_output) {
      AR_REQUIRE(it.step == disabled);
      it.step = add_write_step(sch, config, it.file);
    }
  }

  // Unidentified mate 1 reads
  if (unidentified_1.name != DEV_NULL) {
    unidentified_1_step = add_write_step(sch, config, unidentified_1);
  }

  // Unidentified mate 2 reads
  if (unidentified_1.name == unidentified_2.name) {
    unidentified_2_step = unidentified_1_step;
    return;
  }

  if (unidentified_2.name != DEV_NULL) {
    unidentified_2_step = add_write_step(sch, config, unidentified_2);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Implementations for `processed_reads`

// Creates one empty chunk and one serializer, using the corresponding output
// format, for each of the `map.size()` entries in `map`
processed_reads::processed_reads(const sample_output_files& map)
  : m_map(map)
{
  size_t index = 0;
  while (index < map.size()) {
    m_chunks.emplace_back(std::make_unique<analytical_chunk>());
    m_serializers.emplace_back(map.format(index));
    ++index;
  }
}

// Forwards the sample to every serializer
void
processed_reads::set_sample(const sample& value)
{
  for (auto it = m_serializers.begin(); it != m_serializers.end(); ++it) {
    it->set_sample(value);
  }
}

// Forwards the mate separator to every serializer and stores it, so that
// `finalize` can attach it to the outgoing chunks
void
processed_reads::set_mate_separator(char value)
{
  m_mate_separator = value;

  for (auto it = m_serializers.begin(); it != m_serializers.end(); ++it) {
    it->set_mate_separator(value);
  }
}

// Forwards the demultiplexing-only flag to every serializer
void
processed_reads::set_demultiplexing_only(bool value)
{
  for (auto it = m_serializers.begin(); it != m_serializers.end(); ++it) {
    it->set_demultiplexing_only(value);
  }
}

void
processed_reads::write_headers(const string_vec& args)
{
  for (size_t i = 0; i < m_chunks.size(); ++i) {
    m_serializers.at(i).header(get_buffer(m_chunks.at(i)), args);
  }
}

void
processed_reads::add(const fastq& read, const read_meta& meta)
{
  const size_t offset = m_map.offset(meta.get_file());
  if (offset != sample_output_files::disabled) {
    auto& buffer = get_buffer(m_chunks.at(offset));
    m_serializers.at(offset).record(buffer, read, meta);
    m_chunks.at(offset)->reads++;
  }
}

// Hands over all chunks, each paired with the step recorded for its output
// file and stamped with the mate separator and the `eof` flag.
//
// The chunks are moved out of `m_chunks`, which afterwards contains only
// moved-from pointers.
chunk_vec
processed_reads::finalize(bool eof)
{
  chunk_vec result;

  size_t index = 0;
  for (auto& chunk : m_chunks) {
    chunk->mate_separator = m_mate_separator;
    chunk->eof = eof;

    result.emplace_back(m_map.step(index), std::move(chunk));
    ++index;
  }

  return result;
}

///////////////////////////////////////////////////////////////////////////////
// Implementations for `post_demux_steps`

// Sentinel for steps that have not been assigned; defined to be the same
// value as `output_files::disabled`
const size_t post_demux_steps::disabled = output_files::disabled;

///////////////////////////////////////////////////////////////////////////////
// Implementations for `demultiplexed_reads`

namespace {

// Moves `ptr` into `output`, addressed to `step`, once the input is exhausted
// (`eof`) or the chunk holds at least `INPUT_READS` reads. The emitted chunk
// is stamped with `eof` and `mate_separator`, and `ptr` is replaced with a
// fresh, empty chunk. Otherwise `ptr` is left alone to keep accumulating.
//
// The `demultiplexed_reads::add_*` functions in this file append to
// `reads_1`/`reads_2` without advancing the `reads` counter, so testing only
// `reads` would postpone every flush until EOF and keep all demultiplexed
// reads in memory. The number of cached mate 1 reads is therefore tested too.
// NOTE(review): confirm that nothing outside this file advances `reads` for
// the cached chunks; if something does, the extra test is merely redundant.
void
flush_chunk(chunk_vec& output,
            chunk_ptr& ptr,
            size_t step,
            const bool eof,
            const char mate_separator)
{
  if (eof || ptr->reads >= INPUT_READS ||
      ptr->reads_1.size() >= INPUT_READS) {
    ptr->eof = eof;
    ptr->mate_separator = mate_separator;
    output.emplace_back(step, std::move(ptr));
    ptr = std::make_unique<analytical_chunk>();
  }
}

} // namespace

// Sets up the cache with one chunk for unidentified reads followed by one
// chunk per sample step in `steps`; every sample step must be enabled
demultiplexed_reads::demultiplexed_reads(const post_demux_steps& steps)
  : m_steps(steps)
{
  // Position 0 is used for unidentified reads
  m_cache.push_back(std::make_unique<analytical_chunk>());

  // Positions 1 and up are used for the samples, in order
  for (const auto next_step : m_steps.samples) {
    AR_REQUIRE(next_step != post_demux_steps::disabled);

    m_cache.push_back(std::make_unique<analytical_chunk>());
  }

  // The first chunks should have headers written depending on format
  for (auto it = m_cache.begin(); it != m_cache.end(); ++it) {
    (*it)->first = true;
  }
}

// Caches a mate 1 read that could not be assigned to any sample
void
demultiplexed_reads::add_unidentified_1(fastq&& read)
{
  auto& unidentified = *m_cache.at(0);

  unidentified.reads_1.push_back(std::move(read));
}

// Caches a mate 2 read that could not be assigned to any sample
void
demultiplexed_reads::add_unidentified_2(fastq&& read)
{
  auto& unidentified = *m_cache.at(0);

  unidentified.reads_2.push_back(std::move(read));
}

// Caches a mate 1 read for the given (0-based) sample, along with the barcode
// value supplied for it
void
demultiplexed_reads::add_read_1(fastq&& read, size_t sample, size_t barcode)
{
  // Samples are offset by one, since position 0 holds unidentified reads
  const auto& target = m_cache.at(sample + 1);

  target->reads_1.push_back(std::move(read));
  target->barcodes.push_back(barcode);
}

// Caches a mate 2 read for the given (0-based) sample
void
demultiplexed_reads::add_read_2(fastq&& read, size_t sample)
{
  // Samples are offset by one, since position 0 holds unidentified reads
  const auto& target = m_cache.at(sample + 1);

  target->reads_2.push_back(std::move(read));
}

// Offers every cached chunk to `flush_chunk`, which decides whether or not the
// chunk is emitted, and returns the chunks that were emitted together with
// the steps that they are addressed to
chunk_vec
demultiplexed_reads::flush(bool eof, char mate_separator)
{
  chunk_vec flushed;

  // Unidentified reads; these are treated as a pseudo-sample for simplicity
  flush_chunk(flushed,
              m_cache.at(0),
              m_steps.unidentified,
              eof,
              mate_separator);

  // Chunk N + 1 belongs to sample N
  size_t sample = 0;
  while (sample + 1 < m_cache.size()) {
    flush_chunk(flushed,
                m_cache.at(sample + 1),
                m_steps.samples.at(sample),
                eof,
                mate_separator);
    ++sample;
  }

  return flushed;
}

} // namespace adapterremoval

1	// SPDX-License-Identifier: GPL-3.0-or-later
2	// SPDX-FileCopyrightText: 2024 Mikkel Schubert <mikkelsch@gmail.com>
3	#include "output.hpp" // declarations
4	#include "buffer.hpp" // for buffer
5	#include "debug.hpp" // for AR_REQUIRE, AR_FAIL
6	#include "fastq.hpp" // for fastq
7	#include "fastq_io.hpp" // for write_fastq, split_fastq, gzip_split_fastq
8	#include "scheduler.hpp" // for analytical_chunk
9	#include "serializer.hpp" // for read_meta
10	#include "strutils.hpp" // for ends_with
11	#include <limits> // for numeric_limits
12
13	namespace adapterremoval {
14
15	class userconfig;
16
17	namespace {
18
19	buffer&
20	get_buffer(chunk_ptr& chunk)	×
21	{
22	AR_REQUIRE(chunk);	×
23	if (chunk->buffers.empty()) {	×
24	chunk->buffers.emplace_back();	×
25	}
26
27	return chunk->buffers.back();	×
28	}
29
30	} // namespace
31
32	////////////////////////////////////////////////////////////////////////////////
33	// Implementations for `sample_output_files`
34
35	const size_t sample_output_files::disabled = std::numeric_limits<size_t>::max();
36
37	size_t
38	add_write_step(scheduler& sch,	×
39	const userconfig& config,
40	const output_file& file)
41	{
42	AR_REQUIRE(file.name != DEV_NULL);	×
43	size_t step_id = sch.add<write_fastq>(config, file);	×
44
45	switch (file.format) {	×
46	case output_format::fastq:
47	case output_format::sam:
48	break;
49	case output_format::fastq_gzip:	×
50	case output_format::sam_gzip:	×
51	case output_format::bam:	×
52	case output_format::ubam: {	×
53	step_id = sch.add<gzip_split_fastq>(config, file, step_id);	×
54	step_id = sch.add<split_fastq>(config, file, step_id);	×
55	break;	×
56	}
57	default:	×
58	AR_FAIL("invalid output format");	×
59	}
60
61	return step_id;	×
62	}
63
64	sample_output_files::sample_output_files()	×
65	{
66	m_offsets.fill(sample_output_files::disabled);	×
67	}
68
69	void
70	sample_output_files::set_file(const read_file rtype, output_file file)	×
71	{
72	const auto index = static_cast<size_t>(rtype);	×
73	AR_REQUIRE(m_offsets.at(index) == sample_output_files::disabled);	×
74
75	// If the file type isn't being saved, then there is no need to process the
76	// reads. This saves time especially when output compression is enabled.
77	if (file.name != DEV_NULL) {	×
78	// FIXME: This assumes that filesystem is case sensitive
79	auto it = m_output.begin();	×
80	for (; it != m_output.end(); ++it) {	×
81	if (file.name == it->file.name) {	×
82	break;
83	}
84	}
85
86	if (it == m_output.end()) {	×
87	m_output.push_back({ std::move(file), disabled });	×
88	m_offsets.at(index) = m_output.size() - 1;	×
89	} else {
90	AR_REQUIRE(file == it->file);	×
91	const auto existing_index = it - m_output.begin();	×
92	m_offsets.at(index) = existing_index;	×
93	}
94	}
95	}
96
97	void
98	sample_output_files::set_step(read_file rtype, size_t step)	×
99	{
100	const auto index = static_cast<size_t>(rtype);	×
101	const auto offset = m_offsets.at(index);	×
102	AR_REQUIRE(offset != disabled);	×
103
104	AR_REQUIRE(m_output.at(offset).step == disabled);	×
105	m_output.at(offset).step = step;	×
106	}
107
108	size_t
109	sample_output_files::offset(read_file value) const	×
110	{
111	return m_offsets.at(static_cast<size_t>(value));	×
112	}
113
114	////////////////////////////////////////////////////////////////////////////////
115	// Implementations for `output_files`
116
117	const size_t output_files::disabled = static_cast<size_t>(-1);
118
119	bool
120	output_files::parse_format(std::string_view filename, output_format& sink)	×
121	{
122	const std::string value = to_lower(std::string(filename));	×
123	// ubam is tested separately, to avoid supporting non-standard file extensions
124	if (value == "ubam") {	×
125	sink = output_format::ubam;	×
126	return true;	×
127	}
128
129	return parse_extension("." + value, sink);	×
130	}
131
132	bool
133	output_files::parse_extension(std::string_view filename, output_format& sink)	×
134	{
135	const std::string value = to_lower(std::string(filename));	×
136	if (ends_with(value, ".fq.gz") \|\| ends_with(value, ".fastq.gz")) {	×
137	sink = output_format::fastq_gzip;	×
138	} else if (ends_with(value, ".fq") \|\| ends_with(value, ".fastq")) {	×
139	sink = output_format::fastq;	×
140	} else if (ends_with(value, ".sam.gz")) {	×
141	sink = output_format::sam_gzip;	×
142	} else if (ends_with(value, ".sam")) {	×
143	sink = output_format::sam;	×
144	} else if (ends_with(value, ".bam")) {	×
145	sink = output_format::bam;	×
146	} else {
147	return false;
148	}
149
150	return true;
151	}
152
153	std::string_view
154	output_files::file_extension(const output_format format)	×
155	{
156	switch (format) {	×
157	case output_format::fastq:	×
158	return ".fastq";	×
159	case output_format::fastq_gzip:	×
160	return ".fastq.gz";	×
161	case output_format::sam:	×
162	return ".sam";	×
163	case output_format::sam_gzip:	×
164	return ".sam.gz";	×
165	case output_format::bam:	×
166	case output_format::ubam:	×
167	return ".bam";	×
168	default:	×
169	AR_FAIL("invalid output format");	×
170	}
171	}
172
173	void
174	output_files::add_write_steps(scheduler& sch, const userconfig& config)	×
175	{
176	AR_REQUIRE(unidentified_1_step == disabled &&	×
177	unidentified_2_step == disabled);
178
179	for (auto& sample : m_samples) {	×
180	for (auto& it : sample.m_output) {	×
181	AR_REQUIRE(it.step == disabled);	×
182	it.step = add_write_step(sch, config, it.file);	×
183	}
184	}
185
186	if (unidentified_1.name != DEV_NULL) {	×
187	unidentified_1_step = add_write_step(sch, config, unidentified_1);	×
188	}
189
190	if (unidentified_1.name == unidentified_2.name) {	×
191	unidentified_2_step = unidentified_1_step;	×
192	} else if (unidentified_2.name != DEV_NULL) {	×
193	unidentified_2_step = add_write_step(sch, config, unidentified_2);	×
194	}
195	}
196
197	////////////////////////////////////////////////////////////////////////////////
198	// Implementations for `processed_reads`
199
200	processed_reads::processed_reads(const sample_output_files& map)	×
201	: m_map(map)	×
202	{
203	for (size_t i = 0; i < map.size(); ++i) {	×
204	m_chunks.emplace_back(std::make_unique<analytical_chunk>());	×
205	m_serializers.emplace_back(map.format(i));	×
206	}
207	}
208
209	void
210	processed_reads::set_sample(const sample& value)	×
211	{
212	for (auto& it : m_serializers) {	×
213	it.set_sample(value);	×
214	}
215	}
216
217	void
218	processed_reads::set_mate_separator(char value)	×
219	{
220	m_mate_separator = value;	×
221	for (auto& it : m_serializers) {	×
222	it.set_mate_separator(value);	×
223	}
224	}
225
226	void
227	processed_reads::set_demultiplexing_only(bool value)	×
228	{
229	for (auto& it : m_serializers) {	×
230	it.set_demultiplexing_only(value);	×
231	}
232	}
233
234	void
235	processed_reads::write_headers(const string_vec& args)	×
236	{
237	for (size_t i = 0; i < m_chunks.size(); ++i) {	×
238	m_serializers.at(i).header(get_buffer(m_chunks.at(i)), args);	×
239	}
240	}
241
242	void
NEW 243	processed_reads::add(const fastq& read, const read_meta& meta)	×
244	{
NEW 245	const size_t offset = m_map.offset(meta.get_file());	×
246	if (offset != sample_output_files::disabled) {	×
247	auto& buffer = get_buffer(m_chunks.at(offset));	×
NEW 248	m_serializers.at(offset).record(buffer, read, meta);	×
249	m_chunks.at(offset)->reads++;	×
250	}
251	}
252
253	chunk_vec
254	processed_reads::finalize(bool eof)	×
255	{
256	chunk_vec chunks;	×
257
258	for (size_t i = 0; i < m_chunks.size(); ++i) {	×
259	auto& chunk = m_chunks.at(i);	×
260	chunk->mate_separator = m_mate_separator;	×
261	chunk->eof = eof;	×
262
263	chunks.emplace_back(m_map.step(i), std::move(chunk));	×
264	}
265
266	return chunks;	×
267	}	×
268
269	///////////////////////////////////////////////////////////////////////////////
270	// Implementations for `post_demux_steps`
271
272	const size_t post_demux_steps::disabled = output_files::disabled;
273
274	///////////////////////////////////////////////////////////////////////////////
275	// Implementations for `demultiplexed_reads`
276
277	namespace {
278
279	void
280	flush_chunk(chunk_vec& output,	×
281	chunk_ptr& ptr,
282	size_t step,
283	const bool eof,
284	const char mate_separator)
285	{
286	if (eof \|\| ptr->reads >= INPUT_READS) {	×
287	ptr->eof = eof;	×
288	ptr->mate_separator = mate_separator;	×
289	output.emplace_back(step, std::move(ptr));	×
290	ptr = std::make_unique<analytical_chunk>();	×
291	}
292	}
293
294	} // namespace
295
296	demultiplexed_reads::demultiplexed_reads(const post_demux_steps& steps)	×
297	: m_steps(steps)	×
298	{
299	// Position 0 is used for unidentified reads
300	m_cache.push_back(std::make_unique<analytical_chunk>());	×
301
302	for (const auto next_step : m_steps.samples) {	×
303	AR_REQUIRE(next_step != post_demux_steps::disabled);	×
304
305	m_cache.push_back(std::make_unique<analytical_chunk>());	×
306	}
307
308	// The first chunks should have headers written depending on format
309	for (auto& it : m_cache) {	×
310	it->first = true;	×
311	}
312	}
313
314	void
315	demultiplexed_reads::add_unidentified_1(fastq&& read)	×
316	{
317	m_cache.at(0)->reads_1.push_back(std::move(read));	×
318	}
319
320	void
321	demultiplexed_reads::add_unidentified_2(fastq&& read)	×
322	{
323	m_cache.at(0)->reads_2.push_back(std::move(read));	×
324	}
325
326	void
327	demultiplexed_reads::add_read_1(fastq&& read, size_t sample, size_t barcode)	×
328	{
329	auto& chunk = *m_cache.at(sample + 1);	×
330	chunk.reads_1.push_back(std::move(read));	×
331	chunk.barcodes.push_back(barcode);	×
332	}
333
334	void
335	demultiplexed_reads::add_read_2(fastq&& read, size_t sample)	×
336	{
337	m_cache.at(sample + 1)->reads_2.push_back(std::move(read));	×
338	}
339
340	chunk_vec
341	demultiplexed_reads::flush(bool eof, char mate_separator)	×
342	{
343	chunk_vec output;	×
344
345	// Unidentified reads; these are treated as a pseudo-sample for simplicity
346	flush_chunk(output, m_cache.at(0), m_steps.unidentified, eof, mate_separator);	×
347
348	for (size_t i = 1; i < m_cache.size(); ++i) {	×
349	flush_chunk(output,	×
350	m_cache.at(i),	×
351	m_steps.samples.at(i - 1),	×
352	eof,
353	mate_separator);
354	}
355
356	return output;	×
357	}	×
358
359	} // namespace adapterremoval

MikkelSchubert / adapterremoval / #101

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous