• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #45

20 Sep 2024 06:49PM UTC coverage: 26.244% (-49.2%) from 75.443%
#45

push

travis-ci

web-flow
attempt to fix coveralls run

2458 of 9366 relevant lines covered (26.24%)

4362.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/output.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2024 by Mikkel Schubert - mikkelsch@gmail.com           *
5
 *                                                                       *
6
 * This program is free software: you can redistribute it and/or modify  *
7
 * it under the terms of the GNU General Public License as published by  *
8
 * the Free Software Foundation, either version 3 of the License, or     *
9
 * (at your option) any later version.                                   *
10
 *                                                                       *
11
 * This program is distributed in the hope that it will be useful,       *
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
14
 * GNU General Public License for more details.                          *
15
 *                                                                       *
16
 * You should have received a copy of the GNU General Public License     *
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
18
\*************************************************************************/
19
#include "output.hpp"     // declarations
20
#include "buffer.hpp"     // for buffer
21
#include "debug.hpp"      // for AR_REQUIRE, AR_FAIL
22
#include "fastq.hpp"      // for fastq
23
#include "fastq_io.hpp"   // for write_fastq, split_fastq, gzip_split_fastq
24
#include "scheduler.hpp"  // for analytical_chunk
25
#include "serializer.hpp" // for fastq_serializer
26
#include "strutils.hpp"   // for ends_with
27
#include <limits>         // for numeric_limits
28

29
namespace adapterremoval {
30

31
class userconfig;
32

33
namespace {
34

35
// Returns the last buffer of `chunk`, first appending an empty buffer if the
// chunk does not have any buffers yet. The chunk pointer must not be null.
buffer&
get_buffer(chunk_ptr& chunk)
{
  AR_REQUIRE(chunk);

  auto& buffers = chunk->buffers;
  if (buffers.empty()) {
    buffers.emplace_back();
  }

  return buffers.back();
}
45

46
} // namespace
47

48
////////////////////////////////////////////////////////////////////////////////
49
// Implementations for `sample_output_files`
50

51
// Sentinel (largest `size_t` value) used in this file to mark offsets and steps
// that have not been assigned
const size_t sample_output_files::disabled = std::numeric_limits<size_t>::max();
52

53
size_t
54
add_write_step(scheduler& sch,
×
55
               const userconfig& config,
56
               const output_file& file)
57
{
58
  AR_REQUIRE(file.name != DEV_NULL);
×
59
  size_t step_id = sch.add<write_fastq>(config, file);
×
60

61
  switch (file.format) {
×
62
    case output_format::fastq:
63
    case output_format::sam:
64
      break;
65
    case output_format::fastq_gzip:
×
66
    case output_format::sam_gzip:
×
67
    case output_format::bam:
×
68
    case output_format::ubam: {
×
69
      step_id = sch.add<gzip_split_fastq>(config, file, step_id);
×
70
      step_id = sch.add<split_fastq>(config, file, step_id);
×
71
      break;
×
72
    }
73
    default:
×
74
      AR_FAIL("invalid output format");
×
75
  }
76

77
  return step_id;
×
78
}
79

80
// Creates an empty mapping in which every read type is marked as disabled
sample_output_files::sample_output_files()
{
  m_offsets.fill(disabled);
}
84

85
void
86
sample_output_files::set_file(const read_type rtype, output_file file)
×
87
{
88
  const auto index = static_cast<size_t>(rtype);
×
89
  AR_REQUIRE(m_offsets.at(index) == sample_output_files::disabled);
×
90

91
  // If the file type isn't being saved, then there is no need to process the
92
  // reads. This saves time especially when output compression is enabled.
93
  if (file.name != DEV_NULL) {
×
94
    // FIXME: This assumes that filesystem is case sensitive
95
    auto it = m_output.begin();
×
96
    for (; it != m_output.end(); ++it) {
×
97
      if (file.name == it->file.name) {
×
98
        break;
99
      }
100
    }
101

102
    if (it == m_output.end()) {
×
103
      m_output.push_back({ std::move(file), disabled });
×
104
      m_offsets.at(index) = m_output.size() - 1;
×
105
    } else {
106
      AR_REQUIRE(file == it->file);
×
107
      const auto existing_index = it - m_output.begin();
×
108
      m_offsets.at(index) = existing_index;
×
109
    }
110
  }
111
}
112

113
void
114
sample_output_files::set_step(read_type rtype, size_t step)
×
115
{
116
  const auto index = static_cast<size_t>(rtype);
×
117
  const auto offset = m_offsets.at(index);
×
118
  AR_REQUIRE(offset != disabled);
×
119

120
  AR_REQUIRE(m_output.at(offset).step == disabled);
×
121
  m_output.at(offset).step = step;
×
122
}
123

124
size_t
125
sample_output_files::offset(read_type value) const
×
126
{
127
  return m_offsets.at(static_cast<size_t>(value));
×
128
}
129

130
////////////////////////////////////////////////////////////////////////////////
131
// Implementations for `output_files`
132

133
// Sentinel (largest `size_t` value) marking steps that have not been assigned;
// spelled the same way as `sample_output_files::disabled` for consistency
const size_t output_files::disabled = std::numeric_limits<size_t>::max();
134

135
bool
136
output_files::parse_format(std::string_view filename, output_format& sink)
×
137
{
138
  const std::string value = to_lower(std::string(filename));
×
139
  // ubam is tested separately, to avoid supporting non-standard file extensions
140
  if (value == "ubam") {
×
141
    sink = output_format::ubam;
×
142
    return true;
×
143
  }
144

145
  return parse_extension("." + value, sink);
×
146
}
147

148
bool
149
output_files::parse_extension(std::string_view filename, output_format& sink)
×
150
{
151
  const std::string value = to_lower(std::string(filename));
×
152
  if (ends_with(value, ".fq.gz") || ends_with(value, ".fastq.gz")) {
×
153
    sink = output_format::fastq_gzip;
×
154
  } else if (ends_with(value, ".fq") || ends_with(value, ".fastq")) {
×
155
    sink = output_format::fastq;
×
156
  } else if (ends_with(value, ".sam.gz")) {
×
157
    sink = output_format::sam_gzip;
×
158
  } else if (ends_with(value, ".sam")) {
×
159
    sink = output_format::sam;
×
160
  } else if (ends_with(value, ".bam")) {
×
161
    sink = output_format::bam;
×
162
  } else {
163
    return false;
164
  }
165

166
  return true;
167
}
168

169
std::string_view
170
output_files::file_extension(const output_format format)
×
171
{
172
  switch (format) {
×
173
    case output_format::fastq:
×
174
      return ".fastq";
×
175
    case output_format::fastq_gzip:
×
176
      return ".fastq.gz";
×
177
    case output_format::sam:
×
178
      return ".sam";
×
179
    case output_format::sam_gzip:
×
180
      return ".sam.gz";
×
181
    case output_format::bam:
×
182
    case output_format::ubam:
×
183
      return ".bam";
×
184
    default:
×
185
      AR_FAIL("invalid output format");
×
186
  }
187
}
188

189
void
190
output_files::add_write_steps(scheduler& sch, const userconfig& config)
×
191
{
192
  AR_REQUIRE(unidentified_1_step == disabled &&
×
193
             unidentified_2_step == disabled);
194

195
  for (auto& sample : m_samples) {
×
196
    for (auto& it : sample.m_output) {
×
197
      AR_REQUIRE(it.step == disabled);
×
198
      it.step = add_write_step(sch, config, it.file);
×
199
    }
200
  }
201

202
  if (unidentified_1.name != DEV_NULL) {
×
203
    unidentified_1_step = add_write_step(sch, config, unidentified_1);
×
204
  }
205

206
  if (unidentified_1.name == unidentified_2.name) {
×
207
    unidentified_2_step = unidentified_1_step;
×
208
  } else if (unidentified_2.name != DEV_NULL) {
×
209
    unidentified_2_step = add_write_step(sch, config, unidentified_2);
×
210
  }
211
}
212

213
////////////////////////////////////////////////////////////////////////////////
214
// Implementations for `processed_reads`
215

216
// Creates one empty chunk and one serializer, using the format reported by
// `map`, for each of the `map.size()` entries in `map`
processed_reads::processed_reads(const sample_output_files& map)
  : m_map(map)
{
  for (size_t idx = 0, count = map.size(); idx != count; ++idx) {
    m_chunks.emplace_back(std::make_unique<analytical_chunk>());
    m_serializers.emplace_back(map.format(idx));
  }
}
224

225
void
226
processed_reads::set_sample(const sample& value)
×
227
{
228
  for (auto& ser : m_serializers) {
×
229
    ser.set_sample(value);
×
230
  }
231
}
232

233
void
234
processed_reads::set_mate_separator(char value)
×
235
{
236
  m_mate_separator = value;
×
237
  for (auto& ser : m_serializers) {
×
238
    ser.set_mate_separator(value);
×
239
  }
240
}
241

242
void
243
processed_reads::set_demultiplexing_only(bool value)
×
244
{
245
  for (auto& ser : m_serializers) {
×
246
    ser.set_demultiplexing_only(value);
×
247
  }
248
}
249

250
void
251
processed_reads::write_headers(const string_vec& args)
×
252
{
253
  for (size_t i = 0; i < m_chunks.size(); ++i) {
×
254
    m_serializers.at(i).header(get_buffer(m_chunks.at(i)), args);
×
255
  }
256
}
257

258
void
259
processed_reads::add(const fastq& read,
×
260
                     const read_type type,
261
                     const fastq_flags flags,
262
                     const size_t barcode)
263
{
264
  const size_t offset = m_map.offset(type);
×
265
  if (offset != sample_output_files::disabled) {
×
266
    auto& buffer = get_buffer(m_chunks.at(offset));
×
267
    m_serializers.at(offset).record(buffer, read, flags, barcode);
×
268
    m_chunks.at(offset)->reads++;
×
269
  }
270
}
271

272
// Marks every chunk with the EOF flag and the mate separator and returns the
// chunks paired with the step reported by the output map for that position.
// Ownership of the chunks is moved into the returned vector.
chunk_vec
processed_reads::finalize(bool eof)
{
  chunk_vec result;

  size_t idx = 0;
  for (auto& chunk : m_chunks) {
    chunk->eof = eof;
    chunk->mate_separator = m_mate_separator;

    result.emplace_back(m_map.step(idx), std::move(chunk));
    ++idx;
  }

  return result;
}
×
287

288
///////////////////////////////////////////////////////////////////////////////
289
// Implementations for `post_demux_steps`
290

291
// Sentinel marking unassigned steps; shares its value with
// `output_files::disabled`
const size_t post_demux_steps::disabled = output_files::disabled;
292

293
///////////////////////////////////////////////////////////////////////////////
294
// Implementations for `demultiplexed_reads`
295

296
namespace {
297

298
void
299
flush_chunk(chunk_vec& output,
×
300
            chunk_ptr& ptr,
301
            size_t step,
302
            const bool eof,
303
            const char mate_separator)
304
{
305
  if (eof || ptr->reads >= INPUT_READS) {
×
306
    ptr->eof = eof;
×
307
    ptr->mate_separator = mate_separator;
×
308
    output.emplace_back(step, std::move(ptr));
×
309
    ptr = std::make_unique<analytical_chunk>();
×
310
  }
311
}
312

313
} // namespace
314

315
// Creates the cache of chunks: one chunk for unidentified reads followed by
// one chunk per sample step. Every sample step must have been assigned.
demultiplexed_reads::demultiplexed_reads(const post_demux_steps& steps)
  : m_steps(steps)
{
  // The first chunks should have headers written depending on format
  const auto first_chunk = [] {
    auto chunk = std::make_unique<analytical_chunk>();
    chunk->first = true;
    return chunk;
  };

  // Position 0 is used for unidentified reads
  m_cache.push_back(first_chunk());

  for (const auto next_step : m_steps.samples) {
    AR_REQUIRE(next_step != post_demux_steps::disabled);

    m_cache.push_back(first_chunk());
  }
}
332

333
void
334
demultiplexed_reads::add_unidentified_1(fastq&& read)
×
335
{
336
  m_cache.at(0)->reads_1.push_back(std::move(read));
×
337
}
338

339
void
340
demultiplexed_reads::add_unidentified_2(fastq&& read)
×
341
{
342
  m_cache.at(0)->reads_2.push_back(std::move(read));
×
343
}
344

345
void
346
demultiplexed_reads::add_read_1(fastq&& read, size_t sample, size_t barcode)
×
347
{
348
  auto& chunk = *m_cache.at(sample + 1);
×
349
  chunk.reads_1.push_back(std::move(read));
×
350
  chunk.barcodes.push_back(barcode);
×
351
}
352

353
void
354
demultiplexed_reads::add_read_2(fastq&& read, size_t sample)
×
355
{
356
  m_cache.at(sample + 1)->reads_2.push_back(std::move(read));
×
357
}
358

359
// Collects the cached chunks that `flush_chunk` decides to emit, each paired
// with the step it should be sent to, and returns them
chunk_vec
demultiplexed_reads::flush(bool eof, char mate_separator)
{
  chunk_vec flushed;

  // Unidentified reads; these are treated as a pseudo-sample for simplicity
  flush_chunk(
    flushed, m_cache.at(0), m_steps.unidentified, eof, mate_separator);

  // Sample N is cached at position N + 1
  for (size_t sample = 0; sample + 1 < m_cache.size(); ++sample) {
    flush_chunk(flushed,
                m_cache.at(sample + 1),
                m_steps.samples.at(sample),
                eof,
                mate_separator);
  }

  return flushed;
}
×
374

375
} // namespace adapterremoval
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc