• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #42

24 Aug 2024 03:21PM UTC coverage: 79.763% (+0.2%) from 79.602%
#42

push

travis-ci

MikkelSchubert
update changelog

add v2.3.4 and fix some minor issues

2286 of 2866 relevant lines covered (79.76%)

14253.24 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

6.52
/src/serializer.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2024 by Mikkel Schubert - mikkelsch@gmail.com           *
5
 *                                                                       *
6
 * This program is free software: you can redistribute it and/or modify  *
7
 * it under the terms of the GNU General Public License as published by  *
8
 * the Free Software Foundation, either version 3 of the License, or     *
9
 * (at your option) any later version.                                   *
10
 *                                                                       *
11
 * This program is distributed in the hope that it will be useful,       *
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
14
 * GNU General Public License for more details.                          *
15
 *                                                                       *
16
 * You should have received a copy of the GNU General Public License     *
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
18
\*************************************************************************/
19
#include "serializer.hpp"  // for fastq_serialzier
20
#include "buffer.hpp"      // for buffer
21
#include "commontypes.hpp" // for output_format
22
#include "debug.hpp"       // for AR_REQUIRE, AR_FAIL
23
#include "fastq.hpp"       // for fastq
24
#include "fastq_enc.hpp"   // for PHRED_OFFSET_MIN
25
#include "main.hpp"        // for VERSION
26
#include "strutils.hpp"    // for join_text
27
#include <string_view>     // for string_view
28

29
namespace adapterremoval {
30

31
class userconfig;
32

33
namespace {
34

35
//! Standard header for BAM files prior to compression
36
constexpr std::string_view BAM_HEADER{ "BAM\1", 4 };
37
//! Standard header for SAM/BAM files
38
constexpr std::string_view SAM_HEADER = "@HD\tVN:1.6\tSO:unsorted\n";
39

40
/**
41
 * Flags mapping onto SAM/BAM flags
42
 *
43
 * 0x1 = read paired
44
 * 0x4 = read unmapped
45
 * 0x8 = mate unmapped
46
 * 0x40 = mate 1
47
 * 0x80 = mate 2
48
 * 0x200 = failed QC
49
 */
50

51
std::string_view
52
flags_to_sam(fastq_flags flags)
×
53
{
54
  switch (flags) {
×
55
    case fastq_flags::se:
×
56
      return "4";
×
57
    case fastq_flags::se_fail:
×
58
      return "516";
×
59
    case fastq_flags::pe_1:
×
60
      return "77";
×
61
    case fastq_flags::pe_1_fail:
×
62
      return "589";
×
63
    case fastq_flags::pe_2:
×
64
      return "141";
×
65
    case fastq_flags::pe_2_fail:
×
66
      return "653";
×
67
    default:
×
68
      AR_FAIL("invalid fastq flags");
×
69
  }
70
}
71

72
uint16_t
73
flags_to_bam(fastq_flags flags)
×
74
{
75
  switch (flags) {
×
76
    case fastq_flags::se:
77
      return 4;
78
    case fastq_flags::se_fail:
79
      return 516;
80
    case fastq_flags::pe_1:
81
      return 77;
82
    case fastq_flags::pe_1_fail:
83
      return 589;
84
    case fastq_flags::pe_2:
85
      return 141;
86
    case fastq_flags::pe_2_fail:
87
      return 653;
88
    default:
×
89
      AR_FAIL("invalid fastq flags");
×
90
  }
91
}
92

93
void
94
sequence_to_bam(buffer& buf, const std::string& seq)
×
95
{
96
  const auto size = buf.size();
×
97

98
  uint8_t pair = 0;
×
99
  for (size_t i = 0; i < seq.length(); ++i) {
×
100
    pair = (pair << 4) | "\0\1\0\2\10\0\20\4"[seq[i] & 0x7];
×
101

102
    if (i % 2) {
×
103
      buf.append_u8(pair);
×
104
      pair = 0;
×
105
    }
106
  }
107

108
  if (seq.length() % 2) {
×
109
    buf.append_u8(pair << 4);
×
110
  }
111

112
  AR_REQUIRE(buf.size() - size == (seq.length() + 1) / 2);
×
113
}
114

115
void
116
qualities_to_bam(buffer& buf, const std::string& quals)
×
117
{
118
  for (const auto c : quals) {
×
119
    buf.append_u8(c - PHRED_OFFSET_MIN);
×
120
  }
121
}
122

123
std::string
124
create_sam_header(const string_vec& args, const read_group& rg)
×
125
{
126
  std::string header{ SAM_HEADER };
×
127

128
  // @RG
129
  header.append(rg.header());
×
130
  header.append("\n");
×
131

132
  // @PG
133
  header.append("@PG\tID:adapterremoval\tPN:adapterremoval\tCL:");
×
134
  header.append(join_text(args, " "));
×
135
  header.append("\tVN:");
×
136
  header.append(VERSION.substr(1)); // version without leading v
×
137
  header.append("\n");
×
138

139
  return header;
×
140
}
×
141

142
/** Unescapes the escape sequences "\\" and "\t" in a read-group string */
143
std::string
144
unescape_read_group(std::string_view value)
×
145
{
146
  std::string result;
×
147

148
  bool in_escape = false;
×
149
  for (auto c : value) {
×
150
    if (in_escape) {
×
151
      if (c == '\\') {
×
152
        result.push_back('\\');
×
153
      } else if (c == 't') {
×
154
        result.push_back('\t');
×
155
      } else {
156
        throw std::invalid_argument("invalid escape sequence " +
×
157
                                    log_escape(std::string("\\") + c));
×
158
      }
159

160
      in_escape = false;
161
    } else if (c == '\\') {
×
162
      in_escape = true;
163
    } else {
164
      result.push_back(c);
×
165
    }
166
  }
167

168
  if (in_escape) {
×
169
    throw std::invalid_argument("incomplete escape sequence at end of string");
×
170
  }
171

172
  return result;
×
173
}
×
174

175
constexpr bool
176
is_ascii_alpha(const char c)
×
177
{
178
  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
×
179
}
180

181
constexpr bool
182
is_ascii_alphanum(const char c)
×
183
{
184
  return is_ascii_alpha(c) || (c >= '0' && c <= '9');
×
185
}
186

187
} // namespace
188

189
///////////////////////////////////////////////////////////////////////////////
190
// Implementations for `read_group`
191

192
read_group::read_group()
2✔
193
  : m_header{ "@RG\tID:1\tPG:adapterremoval" }
4✔
194
  , m_id{ "ID:1" }
4✔
195
{
196
}
2✔
197

198
read_group::read_group(std::string_view value_)
×
199
  : m_header{ "@RG" }
×
200
{
201
  using invalid = std::invalid_argument;
×
202
  auto value = unescape_read_group(value_);
×
203

204
  // It's not unreasonable to except users to try to specify a full @RG line
205
  if (starts_with(value, "@RG\t")) {
×
206
    value = value.substr(4);
×
207
  }
208

209
  std::string program{ "PG:adapterremoval" };
×
210

211
  if (!value.empty()) {
×
212
    for (const auto& field : split_text(value, '\t')) {
×
213
      for (const auto c : field) {
×
214
        if (c < ' ' || c > '~') {
×
215
          throw invalid("only characters in the range ' ' to '~' are allowed");
×
216
        }
217
      }
218

219
      if (field.size() < 4) {
×
220
        throw invalid("tags must be at least 4 characters long");
×
221
      } else if (!is_ascii_alpha(field.at(0))) {
×
222
        throw invalid("first character in tag name must be a letter");
×
223
      } else if (!is_ascii_alphanum(field.at(1))) {
×
224
        throw invalid(
×
225
          "second character in tag name must be a letter or number");
×
226
      } else if (field.at(2) != ':') {
×
227
        throw invalid("third character in tag must be a colon");
×
228
      } else if (starts_with(field, "ID:")) {
×
229
        if (!m_id.empty()) {
×
230
          throw invalid("multiple ID tags found");
×
231
        }
232

233
        m_id = field.substr(3);
×
234
      } else if (starts_with(field, "PG:")) {
×
235
        // Allow the user to override the PG field, just in case
236
        program = field;
×
237
      }
238
    }
239
  }
240

241
  // Generic read-group key if the user did not supply one
242
  if (m_id.empty()) {
×
243
    m_id = "1";
×
244
    m_header += "\tID:1";
×
245
  }
246

247
  if (!value.empty()) {
×
248
    m_header += "\t";
×
249
    m_header += value;
×
250
  }
251

252
  m_header.append("\t");
×
253
  m_header.append(program);
×
254
}
255

256
///////////////////////////////////////////////////////////////////////////////
257
// Implementations for `fastq_serializer`
258

259
void
260
fastq_serializer::header(buffer& /* buf */,
×
261
                         const string_vec& /* args */,
262
                         const read_group& /* rg */)
263
{
264
}
265

266
void
267
fastq_serializer::record(buffer& buf,
2✔
268
                         const fastq& record,
269
                         fastq_flags /* flags */,
270
                         char /* mate_separator */,
271
                         const read_group& /* rg */)
272
{
273
  buf.append(record.header());
6✔
274
  buf.append_u8('\n');
2✔
275
  buf.append(record.sequence());
6✔
276
  buf.append("\n+\n", 3);
2✔
277
  buf.append(record.qualities());
6✔
278
  buf.append_u8('\n');
2✔
279
}
2✔
280

281
///////////////////////////////////////////////////////////////////////////////
282
// Implementations for `sam_serializer`
283

284
void
285
sam_serializer::header(buffer& buf,
×
286
                       const string_vec& args,
287
                       const read_group& rg)
288
{
289
  buf.append(create_sam_header(args, rg));
×
290
}
291

292
void
293
sam_serializer::record(buffer& buf,
×
294
                       const fastq& record,
295
                       fastq_flags flags,
296
                       char mate_separator,
297
                       const read_group& rg)
298
{
299
  buf.append(record.name(mate_separator)); // 1. QNAME
×
300
  buf.append_u8('\t');
×
301
  buf.append(flags_to_sam(flags)); // 2. FLAG
×
302
  buf.append("\t"
×
303
             "*\t" // 3. RNAME
304
             "0\t" // 4. POS
305
             "0\t" // 5. MAPQ
306
             "*\t" // 6. CIGAR
307
             "*\t" // 7. RNEXT
308
             "0\t" // 8. PNEXT
309
             "0\t" // 9. TLEN
310
  );
311
  if (record.length()) {
×
312
    buf.append(record.sequence()); // 10. SEQ
×
313
    buf.append_u8('\t');
×
314
    buf.append(record.qualities()); // 11. QUAL
×
315
  } else {
316
    buf.append("*\t" // 10. SEQ
×
317
               "*"   // 11. QUAL
318
    );
319
  }
320

321
  buf.append("\tRG:Z:");
×
322
  buf.append(rg.id());
×
323
  buf.append("\tPG:Z:adapterremoval\n");
×
324
}
325

326
///////////////////////////////////////////////////////////////////////////////
327
// Implementations for `bam_serializer`
328

329
void
330
bam_serializer::header(buffer& buf,
×
331
                       const string_vec& args,
332
                       const read_group& rg)
333
{
334
  const auto sam_header = create_sam_header(args, rg);
×
335

336
  buf.append(BAM_HEADER);            // magic
×
337
  buf.append_u32(sam_header.size()); // l_text
×
338
  buf.append(sam_header);            // terminating NUL not required
×
339
  buf.append_u32(0);                 // n_ref
×
340
}
341

342
void
343
bam_serializer::record(buffer& buf,
×
344
                       const fastq& record,
345
                       fastq_flags flags,
346
                       char mate_separator,
347
                       const read_group& rg)
348
{
349
  const size_t block_size_pos = buf.size();
×
350
  buf.append_u32(0);  // block size (preliminary)
×
351
  buf.append_i32(-1); // refID
×
352
  buf.append_i32(-1); // pos
×
353

354
  const auto name = record.name(mate_separator).substr(0, 255);
×
355
  buf.append_u8(name.length() + 1);    // l_read_name
×
356
  buf.append_u8(0);                    // mapq
×
357
  buf.append_u16(4680);                // bin (c.f. specification 4.2.1)
×
358
  buf.append_u16(0);                   // n_cigar
×
359
  buf.append_u16(flags_to_bam(flags)); // flags
×
360

361
  buf.append_u32(record.length()); // l_seq
×
362
  buf.append_i32(-1);              // next_refID
×
363
  buf.append_i32(-1);              // next_pos
×
364
  buf.append_i32(0);               // tlen
×
365

366
  buf.append(name); // read_name + NUL terminator
×
367
  buf.append_u8(0);
×
368
  // no cigar operations
369
  sequence_to_bam(buf, record.sequence());
×
370
  qualities_to_bam(buf, record.qualities());
×
371

372
  // PG:Z:adapterremoval tag
373
  buf.append("RGZ");
×
374
  buf.append(rg.id());
×
375
  buf.append_u8(0); // NUL
×
376

377
  // PG:Z:adapterremoval tag
378
  buf.append("PGZadapterremoval");
×
379
  buf.append_u8(0); // NUL
×
380

381
  const size_t block_size = buf.size() - block_size_pos - 4;
×
382
  buf.put_u32(block_size_pos, block_size); // block size (final)
×
383
}
384

385
///////////////////////////////////////////////////////////////////////////////
386
// Implementations for `serializer`
387

388
serializer::serializer(output_format format)
×
389
{
390
  switch (format) {
×
391
    case output_format::fastq:
×
392
    case output_format::fastq_gzip:
×
393
      m_header = fastq_serializer::header;
×
394
      m_record = fastq_serializer::record;
×
395
      break;
×
396
    case output_format::sam:
×
397
    case output_format::sam_gzip:
×
398
      m_header = sam_serializer::header;
×
399
      m_record = sam_serializer::record;
×
400
      break;
×
401
    case output_format::bam:
×
402
    case output_format::ubam:
×
403
      m_header = bam_serializer::header;
×
404
      m_record = bam_serializer::record;
×
405
      break;
×
406
    default:
×
407
      AR_FAIL("invalid output format");
×
408
  }
409
}
410

411
void
412
serializer::header(buffer& buf, const string_vec& args) const
×
413
{
414
  m_header(buf, args, m_read_group);
×
415
}
416

417
void
418
serializer::record(buffer& buf,
×
419
                   const fastq& record,
420
                   const fastq_flags flags) const
421
{
422
  m_record(buf, record, flags, m_mate_separator, m_read_group);
×
423
}
424

425
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc