• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #101

18 Apr 2025 12:19PM UTC coverage: 67.186% (+1.8%) from 65.404%
#101

push

travis-ci

web-flow
add meta-data class for read serialization (#127)

This simplifies passing of additional information (to be added) that
influences how reads are serialized.

The read_type enum is furthermore expanded, allowing the file_type
to be derived from the read_type enum by the meta-data class.

Test cases were added for the serializers

472 of 531 new or added lines in 8 files covered. (88.89%)

2 existing lines in 1 file now uncovered.

9697 of 14433 relevant lines covered (67.19%)

3063.78 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/output.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2024 Mikkel Schubert <mikkelsch@gmail.com>
3
#include "output.hpp"     // declarations
4
#include "buffer.hpp"     // for buffer
5
#include "debug.hpp"      // for AR_REQUIRE, AR_FAIL
6
#include "fastq.hpp"      // for fastq
7
#include "fastq_io.hpp"   // for write_fastq, split_fastq, gzip_split_fastq
8
#include "scheduler.hpp"  // for analytical_chunk
9
#include "serializer.hpp" // for read_meta
10
#include "strutils.hpp"   // for ends_with
11
#include <limits>         // for numeric_limits
12

13
namespace adapterremoval {
14

15
class userconfig;
16

17
namespace {
18

19
// Returns the last buffer of `chunk`, first adding a default-constructed
// buffer if the chunk currently has none. The chunk pointer must be set.
buffer&
get_buffer(chunk_ptr& chunk)
{
  AR_REQUIRE(chunk);

  auto& buffers = chunk->buffers;
  if (buffers.empty()) {
    buffers.emplace_back();
  }

  return buffers.back();
}
29

30
} // namespace
31

32
////////////////////////////////////////////////////////////////////////////////
33
// Implementations for `sample_output_files`
34

35
// Sentinel value marking an offset or step that has not been assigned
const size_t sample_output_files::disabled = std::numeric_limits<size_t>::max();
36

37
size_t
38
add_write_step(scheduler& sch,
×
39
               const userconfig& config,
40
               const output_file& file)
41
{
42
  AR_REQUIRE(file.name != DEV_NULL);
×
43
  size_t step_id = sch.add<write_fastq>(config, file);
×
44

45
  switch (file.format) {
×
46
    case output_format::fastq:
47
    case output_format::sam:
48
      break;
49
    case output_format::fastq_gzip:
×
50
    case output_format::sam_gzip:
×
51
    case output_format::bam:
×
52
    case output_format::ubam: {
×
53
      step_id = sch.add<gzip_split_fastq>(config, file, step_id);
×
54
      step_id = sch.add<split_fastq>(config, file, step_id);
×
55
      break;
×
56
    }
57
    default:
×
58
      AR_FAIL("invalid output format");
×
59
  }
60

61
  return step_id;
×
62
}
63

64
// Creates a mapping in which every read type is initially disabled
sample_output_files::sample_output_files()
{
  m_offsets.fill(disabled);
}
68

69
void
70
sample_output_files::set_file(const read_file rtype, output_file file)
×
71
{
72
  const auto index = static_cast<size_t>(rtype);
×
73
  AR_REQUIRE(m_offsets.at(index) == sample_output_files::disabled);
×
74

75
  // If the file type isn't being saved, then there is no need to process the
76
  // reads. This saves time especially when output compression is enabled.
77
  if (file.name != DEV_NULL) {
×
78
    // FIXME: This assumes that filesystem is case sensitive
79
    auto it = m_output.begin();
×
80
    for (; it != m_output.end(); ++it) {
×
81
      if (file.name == it->file.name) {
×
82
        break;
83
      }
84
    }
85

86
    if (it == m_output.end()) {
×
87
      m_output.push_back({ std::move(file), disabled });
×
88
      m_offsets.at(index) = m_output.size() - 1;
×
89
    } else {
90
      AR_REQUIRE(file == it->file);
×
91
      const auto existing_index = it - m_output.begin();
×
92
      m_offsets.at(index) = existing_index;
×
93
    }
94
  }
95
}
96

97
void
98
sample_output_files::set_step(read_file rtype, size_t step)
×
99
{
100
  const auto index = static_cast<size_t>(rtype);
×
101
  const auto offset = m_offsets.at(index);
×
102
  AR_REQUIRE(offset != disabled);
×
103

104
  AR_REQUIRE(m_output.at(offset).step == disabled);
×
105
  m_output.at(offset).step = step;
×
106
}
107

108
size_t
109
sample_output_files::offset(read_file value) const
×
110
{
111
  return m_offsets.at(static_cast<size_t>(value));
×
112
}
113

114
////////////////////////////////////////////////////////////////////////////////
115
// Implementations for `output_files`
116

117
// Sentinel value marking a step that has not been assigned; this is the same
// value as before (the maximum `size_t`), spelled via `std::numeric_limits`
const size_t output_files::disabled = std::numeric_limits<size_t>::max();
118

119
bool
120
output_files::parse_format(std::string_view filename, output_format& sink)
×
121
{
122
  const std::string value = to_lower(std::string(filename));
×
123
  // ubam is tested separately, to avoid supporting non-standard file extensions
124
  if (value == "ubam") {
×
125
    sink = output_format::ubam;
×
126
    return true;
×
127
  }
128

129
  return parse_extension("." + value, sink);
×
130
}
131

132
bool
133
output_files::parse_extension(std::string_view filename, output_format& sink)
×
134
{
135
  const std::string value = to_lower(std::string(filename));
×
136
  if (ends_with(value, ".fq.gz") || ends_with(value, ".fastq.gz")) {
×
137
    sink = output_format::fastq_gzip;
×
138
  } else if (ends_with(value, ".fq") || ends_with(value, ".fastq")) {
×
139
    sink = output_format::fastq;
×
140
  } else if (ends_with(value, ".sam.gz")) {
×
141
    sink = output_format::sam_gzip;
×
142
  } else if (ends_with(value, ".sam")) {
×
143
    sink = output_format::sam;
×
144
  } else if (ends_with(value, ".bam")) {
×
145
    sink = output_format::bam;
×
146
  } else {
147
    return false;
148
  }
149

150
  return true;
151
}
152

153
std::string_view
154
output_files::file_extension(const output_format format)
×
155
{
156
  switch (format) {
×
157
    case output_format::fastq:
×
158
      return ".fastq";
×
159
    case output_format::fastq_gzip:
×
160
      return ".fastq.gz";
×
161
    case output_format::sam:
×
162
      return ".sam";
×
163
    case output_format::sam_gzip:
×
164
      return ".sam.gz";
×
165
    case output_format::bam:
×
166
    case output_format::ubam:
×
167
      return ".bam";
×
168
    default:
×
169
      AR_FAIL("invalid output format");
×
170
  }
171
}
172

173
void
174
output_files::add_write_steps(scheduler& sch, const userconfig& config)
×
175
{
176
  AR_REQUIRE(unidentified_1_step == disabled &&
×
177
             unidentified_2_step == disabled);
178

179
  for (auto& sample : m_samples) {
×
180
    for (auto& it : sample.m_output) {
×
181
      AR_REQUIRE(it.step == disabled);
×
182
      it.step = add_write_step(sch, config, it.file);
×
183
    }
184
  }
185

186
  if (unidentified_1.name != DEV_NULL) {
×
187
    unidentified_1_step = add_write_step(sch, config, unidentified_1);
×
188
  }
189

190
  if (unidentified_1.name == unidentified_2.name) {
×
191
    unidentified_2_step = unidentified_1_step;
×
192
  } else if (unidentified_2.name != DEV_NULL) {
×
193
    unidentified_2_step = add_write_step(sch, config, unidentified_2);
×
194
  }
195
}
196

197
////////////////////////////////////////////////////////////////////////////////
198
// Implementations for `processed_reads`
199

200
// Creates one chunk and one serializer per output file in `map`; the
// serializers are constructed from the format of the corresponding file
processed_reads::processed_reads(const sample_output_files& map)
  : m_map(map)
{
  const size_t count = map.size();

  for (size_t idx = 0; idx != count; ++idx) {
    m_chunks.emplace_back(std::make_unique<analytical_chunk>());
    m_serializers.emplace_back(map.format(idx));
  }
}
208

209
void
210
processed_reads::set_sample(const sample& value)
×
211
{
212
  for (auto& it : m_serializers) {
×
213
    it.set_sample(value);
×
214
  }
215
}
216

217
void
218
processed_reads::set_mate_separator(char value)
×
219
{
220
  m_mate_separator = value;
×
221
  for (auto& it : m_serializers) {
×
222
    it.set_mate_separator(value);
×
223
  }
224
}
225

226
void
227
processed_reads::set_demultiplexing_only(bool value)
×
228
{
229
  for (auto& it : m_serializers) {
×
230
    it.set_demultiplexing_only(value);
×
231
  }
232
}
233

234
void
235
processed_reads::write_headers(const string_vec& args)
×
236
{
237
  for (size_t i = 0; i < m_chunks.size(); ++i) {
×
238
    m_serializers.at(i).header(get_buffer(m_chunks.at(i)), args);
×
239
  }
240
}
241

242
void
NEW
243
processed_reads::add(const fastq& read, const read_meta& meta)
×
244
{
NEW
245
  const size_t offset = m_map.offset(meta.get_file());
×
246
  if (offset != sample_output_files::disabled) {
×
247
    auto& buffer = get_buffer(m_chunks.at(offset));
×
NEW
248
    m_serializers.at(offset).record(buffer, read, meta);
×
249
    m_chunks.at(offset)->reads++;
×
250
  }
251
}
252

253
// Returns every chunk paired with the step of its output file, after setting
// the mate separator and EOF flag on each. The chunks are moved out of
// `m_chunks` in the process.
chunk_vec
processed_reads::finalize(bool eof)
{
  chunk_vec chunks;

  size_t index = 0;
  for (auto& chunk : m_chunks) {
    chunk->mate_separator = m_mate_separator;
    chunk->eof = eof;

    chunks.emplace_back(m_map.step(index), std::move(chunk));
    ++index;
  }

  return chunks;
}
×
268

269
///////////////////////////////////////////////////////////////////////////////
270
// Implementations for `post_demux_steps`
271

272
// Uses the same sentinel value as `output_files::disabled`
const size_t post_demux_steps::disabled = output_files::disabled;
273

274
///////////////////////////////////////////////////////////////////////////////
275
// Implementations for `demultiplexed_reads`
276

277
namespace {
278

279
void
280
flush_chunk(chunk_vec& output,
×
281
            chunk_ptr& ptr,
282
            size_t step,
283
            const bool eof,
284
            const char mate_separator)
285
{
286
  if (eof || ptr->reads >= INPUT_READS) {
×
287
    ptr->eof = eof;
×
288
    ptr->mate_separator = mate_separator;
×
289
    output.emplace_back(step, std::move(ptr));
×
290
    ptr = std::make_unique<analytical_chunk>();
×
291
  }
292
}
293

294
} // namespace
295

296
// Creates the cache of chunks used to collect demultiplexed reads: one chunk
// for unidentified reads followed by one chunk per sample step, all of which
// must be enabled
demultiplexed_reads::demultiplexed_reads(const post_demux_steps& steps)
  : m_steps(steps)
{
  // Position 0 is used for unidentified reads
  m_cache.push_back(std::make_unique<analytical_chunk>());

  for (const auto next_step : m_steps.samples) {
    AR_REQUIRE(next_step != post_demux_steps::disabled);

    m_cache.push_back(std::make_unique<analytical_chunk>());
  }

  // The first chunks should have headers written depending on format
  for (auto& chunk : m_cache) {
    chunk->first = true;
  }
}
313

314
void
315
demultiplexed_reads::add_unidentified_1(fastq&& read)
×
316
{
317
  m_cache.at(0)->reads_1.push_back(std::move(read));
×
318
}
319

320
void
321
demultiplexed_reads::add_unidentified_2(fastq&& read)
×
322
{
323
  m_cache.at(0)->reads_2.push_back(std::move(read));
×
324
}
325

326
void
327
demultiplexed_reads::add_read_1(fastq&& read, size_t sample, size_t barcode)
×
328
{
329
  auto& chunk = *m_cache.at(sample + 1);
×
330
  chunk.reads_1.push_back(std::move(read));
×
331
  chunk.barcodes.push_back(barcode);
×
332
}
333

334
void
335
demultiplexed_reads::add_read_2(fastq&& read, size_t sample)
×
336
{
337
  m_cache.at(sample + 1)->reads_2.push_back(std::move(read));
×
338
}
339

340
// Collects the cached chunks that are ready to be passed on, paired with the
// steps they should be sent to; see `flush_chunk` for the criteria used
chunk_vec
demultiplexed_reads::flush(bool eof, char mate_separator)
{
  chunk_vec output;

  // Unidentified reads; these are treated as a pseudo-sample for simplicity
  flush_chunk(output, m_cache.at(0), m_steps.unidentified, eof, mate_separator);

  // The chunk for sample N is found at position N + 1 in the cache
  for (size_t sample = 0; sample + 1 < m_cache.size(); ++sample) {
    flush_chunk(output,
                m_cache.at(sample + 1),
                m_steps.samples.at(sample),
                eof,
                mate_separator);
  }

  return output;
}
×
358

359
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc