• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #101

18 Apr 2025 12:19PM UTC coverage: 67.186% (+1.8%) from 65.404%
#101

push

travis-ci

web-flow
add meta-data class for read serialization (#127)

This simplifies passing of additional information (to be added) that
influences how reads are serialized.

The read_type enum is furthermore expanded, allowing the file_type
to be derived from the read_type enum by the meta-data class.

Test cases were added for the serializers

472 of 531 new or added lines in 8 files covered. (88.89%)

2 existing lines in 1 file now uncovered.

9697 of 14433 relevant lines covered (67.19%)

3063.78 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.6
/src/serializer.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2024 Mikkel Schubert <mikkelsch@gmail.com>
3
#include "serializer.hpp"  // declarations
4
#include "buffer.hpp"      // for buffer
5
#include "commontypes.hpp" // for output_format
6
#include "debug.hpp"       // for AR_REQUIRE, AR_FAIL
7
#include "errors.hpp"      // for serializing_error
8
#include "fastq.hpp"       // for fastq
9
#include "fastq_enc.hpp"   // for PHRED_OFFSET_MIN
10
#include "main.hpp"        // for VERSION
11
#include "strutils.hpp"    // for join_text
12
#include <sstream>         // for ostringstream
13
#include <string_view>     // for string_view
14

15
namespace adapterremoval {
16

17
using namespace std::literals;
18

19
class userconfig;
20

21
namespace {
22

23
//! Standard header for BAM files prior to compression
24
constexpr std::string_view BAM_HEADER{ "BAM\1", 4 };
25
//! Standard header for SAM/BAM files
26
constexpr std::string_view SAM_HEADER = "@HD\tVN:1.6\tSO:unsorted\n";
27

28
/**
29
 * Flags mapping onto SAM/BAM flags
30
 *
31
 * 0x1 = read paired
32
 * 0x4 = read unmapped
33
 * 0x8 = mate unmapped
34
 * 0x40 = mate 1
35
 * 0x80 = mate 2
36
 * 0x200 = failed QC
37
 */
38

39
std::string_view
40
read_type_to_sam(read_type flags)
42✔
41
{
42
  switch (flags) {
42✔
43
    case read_type::se:
26✔
44
    case read_type::merged:
26✔
45
      return "4";
26✔
46
    case read_type::se_fail:
4✔
47
    case read_type::merged_fail:
4✔
48
      return "516";
4✔
49
    case read_type::pe_1:
4✔
50
    case read_type::singleton_1:
4✔
51
      return "77";
4✔
52
    case read_type::pe_1_fail:
2✔
53
      return "589";
2✔
54
    case read_type::pe_2:
4✔
55
    case read_type::singleton_2:
4✔
56
      return "141";
4✔
57
    case read_type::pe_2_fail:
2✔
58
      return "653";
2✔
NEW
59
    default:                          // GCOVR_EXCL_LINE
×
NEW
60
      AR_FAIL("invalid fastq flags"); // GCOVR_EXCL_LINE
×
61
  }
62
}
63

64
uint16_t
65
read_type_to_bam(read_type flags)
46✔
66
{
67
  switch (flags) {
46✔
68
    case read_type::se:
69
    case read_type::merged:
70
      return 4;
71
    case read_type::se_fail:
72
    case read_type::merged_fail:
73
      return 516;
74
    case read_type::pe_1:
75
    case read_type::singleton_1:
76
      return 77;
77
    case read_type::pe_1_fail:
78
      return 589;
79
    case read_type::pe_2:
80
    case read_type::singleton_2:
81
      return 141;
82
    case read_type::pe_2_fail:
83
      return 653;
NEW
84
    default:                          // GCOVR_EXCL_LINE
×
NEW
85
      AR_FAIL("invalid fastq flags"); // GCOVR_EXCL_LINE
×
86
  }
87
}
88

89
void
90
sequence_to_bam(buffer& buf, const std::string& seq)
46✔
91
{
92
  const auto size = buf.size();
46✔
93

94
  uint8_t pair = 0;
46✔
95
  for (size_t i = 0; i < seq.length(); ++i) {
940✔
96
    pair = (pair << 4) | "\0\1\0\2\10\0\20\4"[seq[i] & 0x7];
424✔
97

98
    if (i % 2) {
424✔
99
      buf.append_u8(pair);
210✔
100
      pair = 0;
210✔
101
    }
102
  }
103

104
  if (seq.length() % 2) {
92✔
105
    buf.append_u8(pair << 4);
4✔
106
  }
107

108
  AR_REQUIRE(buf.size() - size == (seq.length() + 1) / 2);
138✔
109
}
46✔
110

111
void
112
qualities_to_bam(buffer& buf, const std::string& quals)
46✔
113
{
114
  for (const auto c : quals) {
470✔
115
    buf.append_u8(c - PHRED_OFFSET_MIN);
424✔
116
  }
117
}
46✔
118

119
std::string
120
create_sam_header(const string_vec& args, const sample& s)
16✔
121
{
122
  std::string header{ SAM_HEADER };
32✔
123

124
  // @RG
125
  for (const auto& it : s) {
68✔
126
    if (it.has_read_group) {
20✔
127
      header.append(it.read_group_.header());
24✔
128
      header.append("\n");
12✔
129
    }
130
  }
131

132
  // @PG
133
  header.append("@PG\tID:adapterremoval\tPN:adapterremoval\tCL:");
16✔
134
  header.append(join_text(args, " "));
32✔
135
  header.append("\tVN:");
16✔
136
  header.append(VERSION.substr(1)); // version without leading v
32✔
137
  header.append("\n");
16✔
138

139
  return header;
16✔
140
}
×
141

142
std::string_view
143
prepare_name(const fastq& record, const char mate_separator)
96✔
144
{
145
  auto name = record.name(mate_separator);
96✔
146
  if (name.length() > 254) {
96✔
147
    std::ostringstream os;
4✔
148
    os << "Cannot encode read as SAM/BAM; read name is longer than 254 "
4✔
149
       << "characters: len(" << log_escape(name) << ") == " << name.length();
12✔
150

151
    throw serializing_error(os.str());
8✔
152
  }
4✔
153

154
  for (const char c : name) {
1,740✔
155
    if ((c < '!' || c > '?') && (c < 'A' || c > '~')) {
1,560✔
156
      std::ostringstream os;
4✔
157
      os << "Cannot encode read as SAM/BAM; read name contains characters "
4✔
158
         << "other than the allowed [!-?A-~]: " << log_escape(name);
8✔
159

160
      throw serializing_error(os.str());
8✔
161
    }
4✔
162
  }
163

164
  return name;
88✔
165
}
166

167
} // namespace
168

169
///////////////////////////////////////////////////////////////////////////////
170
// Implementations for `fastq_serializer`
171

172
void
173
serializer::fastq_record(buffer& buf,
58✔
174
                         const fastq& record,
175
                         const read_meta& /* meta */,
176
                         const sample_sequences& sequences) const
177
{
178
  buf.append(record.header());
116✔
179
  if (m_demultiplexing_only && !sequences.barcode_1.empty()) {
60✔
180
    buf.append(" BC:");
4✔
181
    buf.append(sequences.barcode_1);
4✔
182
    if (sequences.barcode_2.length()) {
4✔
183
      buf.append_u8('-');
2✔
184
      buf.append(sequences.barcode_2);
6✔
185
    }
186
  }
187
  buf.append_u8('\n');
58✔
188
  buf.append(record.sequence());
116✔
189
  buf.append("\n+\n"sv);
58✔
190
  buf.append(record.qualities());
116✔
191
  buf.append_u8('\n');
58✔
192
}
58✔
193

194
///////////////////////////////////////////////////////////////////////////////
195
// Implementations for `sam_serializer`
196

197
void
198
serializer::sam_header(buffer& buf, const string_vec& args, const sample& s)
8✔
199
{
200
  buf.append(create_sam_header(args, s));
24✔
201
}
8✔
202

203
void
204
serializer::sam_record(buffer& buf,
46✔
205
                       const fastq& record,
206
                       const read_meta& meta,
207
                       const sample_sequences& sequences) const
208
{
209
  buf.append(prepare_name(record, m_mate_separator)); // 1. QNAME
46✔
210
  buf.append_u8('\t');
42✔
211
  buf.append(read_type_to_sam(meta.m_type)); // 2. FLAG
42✔
212
  buf.append("\t"
84✔
213
             "*\t" // 3. RNAME
214
             "0\t" // 4. POS
215
             "0\t" // 5. MAPQ
216
             "*\t" // 6. CIGAR
217
             "*\t" // 7. RNEXT
218
             "0\t" // 8. PNEXT
219
             "0\t" // 9. TLEN
220
  );
221

222
  if (record.length()) {
84✔
223
    buf.append(record.sequence()); // 10. SEQ
80✔
224
    buf.append_u8('\t');
40✔
225
    buf.append(record.qualities()); // 11. QUAL
120✔
226
  } else {
227
    buf.append("*\t" // 10. SEQ
4✔
228
               "*"   // 11. QUAL
229
    );
230
  }
231

232
  if (sequences.has_read_group) {
42✔
233
    buf.append("\tRG:Z:");
16✔
234
    buf.append(sequences.read_group_.id());
24✔
235
  }
236

237
  buf.append("\n");
84✔
238
}
42✔
239

240
///////////////////////////////////////////////////////////////////////////////
241
// Implementations for `bam_serializer`
242

243
void
244
serializer::bam_header(buffer& buf, const string_vec& args, const sample& s)
8✔
245
{
246
  const auto sam_header = create_sam_header(args, s);
8✔
247

248
  buf.append(BAM_HEADER);            // magic
8✔
249
  buf.append_u32(sam_header.size()); // l_text
16✔
250
  buf.append(sam_header);            // terminating NUL not required
8✔
251
  buf.append_u32(0);                 // n_ref
8✔
252
}
8✔
253

254
void
255
serializer::bam_record(buffer& buf,
50✔
256
                       const fastq& record,
257
                       const read_meta& meta,
258
                       const sample_sequences& sequences) const
259
{
260
  const size_t block_size_pos = buf.size();
50✔
261
  buf.append_u32(0);  // block size (preliminary)
50✔
262
  buf.append_i32(-1); // refID
50✔
263
  buf.append_i32(-1); // pos
50✔
264

265
  const auto name = prepare_name(record, m_mate_separator);
50✔
266
  buf.append_u8(name.length() + 1); // l_read_name
46✔
267
  buf.append_u8(0);                 // mapq
46✔
268
  buf.append_u16(4680);             // bin (c.f. specification 4.2.1)
46✔
269
  buf.append_u16(0);                // n_cigar
46✔
270
  buf.append_u16(read_type_to_bam(meta.m_type)); // flags
46✔
271

272
  buf.append_u32(record.length()); // l_seq
92✔
273
  buf.append_i32(-1);              // next_refID
46✔
274
  buf.append_i32(-1);              // next_pos
46✔
275
  buf.append_i32(0);               // tlen
46✔
276

277
  buf.append(name); // read_name + NUL terminator
46✔
278
  buf.append_u8(0);
46✔
279
  // no cigar operations
280
  sequence_to_bam(buf, record.sequence());
92✔
281
  qualities_to_bam(buf, record.qualities());
92✔
282

283
  if (sequences.has_read_group) {
46✔
284
    // RG:Z:${ID} tag
285
    buf.append("RGZ");
16✔
286
    buf.append(sequences.read_group_.id());
16✔
287
    buf.append_u8(0); // NUL
8✔
288
  }
289

290
  const size_t block_size = buf.size() - block_size_pos - 4;
46✔
291
  buf.put_u32(block_size_pos, block_size); // block size (final)
46✔
292
}
46✔
293

294
///////////////////////////////////////////////////////////////////////////////
295
// Implementations for `read_meta`
296

297
/** Returns the kind of output file that a read of type `m_type` maps to. */
read_file
read_meta::get_file() const noexcept
{
  switch (m_type) {
    // Reads that failed are all mapped to the discarded file
    case read_type::se_fail:
    case read_type::pe_1_fail:
    case read_type::pe_2_fail:
    case read_type::merged_fail:
      return read_file::discarded;

    case read_type::merged:
      return read_file::merged;

    case read_type::singleton_1:
    case read_type::singleton_2:
      return read_file::singleton;

    // Single-end reads share the mate 1 file
    case read_type::se:
    case read_type::pe_1:
      return read_file::mate_1;

    case read_type::pe_2:
      return read_file::mate_2;

    default:                         // GCOVR_EXCL_LINE
      AR_FAIL("invalid read flags"); // GCOVR_EXCL_LINE
  }
}
320

321
///////////////////////////////////////////////////////////////////////////////
322
// Implementations for `serializer`
323

324
serializer::serializer(output_format format)
170✔
325
  : m_format(format)
170✔
326
{
327
}
170✔
328

329
void
330
serializer::header(buffer& buf, const string_vec& args) const
22✔
331
{
332
  switch (m_format) {
22✔
333
    case output_format::fastq:
334
    case output_format::fastq_gzip:
335
      // No header
336
      break;
337
    case output_format::sam:
8✔
338
    case output_format::sam_gzip:
8✔
339
      sam_header(buf, args, m_sample);
8✔
340
      break;
8✔
341
    case output_format::bam:
8✔
342
    case output_format::ubam:
8✔
343
      bam_header(buf, args, m_sample);
8✔
344
      break;
8✔
NEW
345
    default:                            // GCOVR_EXCL_LINE
×
NEW
346
      AR_FAIL("invalid output format"); // GCOVR_EXCL_LINE
×
347
  }
348
}
22✔
349

350
void
351
serializer::record(buffer& buf, const fastq& record, read_meta meta) const
154✔
352
{
353
  const auto& sequences = m_sample.at(meta.m_barcode);
154✔
354
  switch (m_format) {
154✔
355
    case output_format::fastq:
58✔
356
    case output_format::fastq_gzip:
58✔
357
      fastq_record(buf, record, meta, sequences);
58✔
358
      break;
58✔
359
    case output_format::sam:
46✔
360
    case output_format::sam_gzip:
46✔
361
      sam_record(buf, record, meta, sequences);
46✔
362
      break;
46✔
363
    case output_format::bam:
50✔
364
    case output_format::ubam:
50✔
365
      bam_record(buf, record, meta, sequences);
50✔
366
      break;
50✔
NEW
367
    default:                            // GCOVR_EXCL_LINE
×
NEW
368
      AR_FAIL("invalid output format"); // GCOVR_EXCL_LINE
×
369
  }
370
}
146✔
371

372
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc