• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #39

20 Aug 2024 01:31PM UTC coverage: 79.602% (-3.8%) from 83.365%
#39

push

travis-ci

MikkelSchubert
update schema URL to use github pages

2279 of 2863 relevant lines covered (79.6%)

14257.45 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.58
/src/fastq_enc.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "fastq_enc.hpp" // header
21
#include "debug.hpp"     // for AR_FAIL, AR_REQUIRE
22
#include "errors.hpp"    // for fastq_error
23
#include "strutils.hpp"  // for shell_escape
24
#include <algorithm>     // for min, max
25
#include <array>         // for array
26
#include <cmath>         // for log10, pow, round
27
#include <sstream>       // for ostringstream
28

29
namespace adapterremoval {
30

31
namespace {
32

33
//! Offset used by Phred+33 and SAM encodings
34
const int PHRED_33_OFFSET_MIN = '!';
35
//! The maximum ASCII value allowed for Phred+33 encoded quality scores. This is
36
//! a generous maximum, taking into account that instruments are assigning
37
//! higher and higher quality scores for Phred+33 data.
38
const int PHRED_33_OFFSET_MAX = 'N';
39
//! Minimum Phred score allowed
40
// const int PHRED_33_SCORE_MIN = 0;
41
//! Maximum Phred score allowed
42
const int PHRED_33_SCORE_MAX = PHRED_33_OFFSET_MAX - PHRED_33_OFFSET_MIN;
43

44
//! Offset used by Phred+64 encodings
45
const int PHRED_64_OFFSET_MIN = '@';
46
//! The maximum ASCII value allowed for encoded Phred scores
47
const int PHRED_64_OFFSET_MAX = '~';
48
//! Minimum Phred+64 score allowed
49
const int PHRED_64_SCORE_MIN = 0;
50
//! Maximum Phred+64 score allowed
51
// const int PHRED_64_SCORE_MAX = PHRED_64_OFFSET_MAX - PHRED_64_OFFSET_MIN;
52

53
//! Offset used by Solexa encoding quality scores
54
const int SOLEXA_OFFSET_MIN = '@';
55
//! The maximum ASCII value allowed for encoded Solexa scores
56
const int SOLEXA_OFFSET_MAX = 'h';
57
//! Minimum Phred encoded score allowed
58
const int SOLEXA_SCORE_MIN = -5;
59
//! Maximum Phred encoded score allowed
60
const int SOLEXA_SCORE_MAX = SOLEXA_OFFSET_MAX - SOLEXA_OFFSET_MIN;
61

62
///////////////////////////////////////////////////////////////////////////////
63
// Pre-calculation of Solexa <-> Phred conversions
64

65
std::array<char, 256>
66
calc_solexa_to_phred33()
1✔
67
{
68
  std::array<char, 256> scores = { 0 };
1✔
69

70
  for (int i = SOLEXA_SCORE_MIN; i <= SOLEXA_SCORE_MAX; ++i) {
47✔
71
    const double score = round(10.0 * log10(1.0 + pow(10, (i / 10.0))));
46✔
72
    const int transformed =
46✔
73
      std::max<int>(PHRED_SCORE_MIN, std::min<int>(PHRED_SCORE_MAX, score));
138✔
74
    scores.at(i + SOLEXA_OFFSET_MIN) = PHRED_OFFSET_MIN + transformed;
92✔
75
  }
76

77
  return scores;
1✔
78
}
79

80
const auto g_solexa_to_phred33 = calc_solexa_to_phred33();
81

82
///////////////////////////////////////////////////////////////////////////////
83

84
/** Returns the quality score in a form that is human readable */
85
std::string
86
escape_raw_score(char raw)
20✔
87
{
88
  return shell_escape(std::string(1, raw));
80✔
89
}
90

91
[[noreturn]] void
92
throw_invalid_phred_33(const char raw)
1✔
93
{
94
  const int score = raw - PHRED_33_OFFSET_MIN;
1✔
95
  const int alt_score = raw - PHRED_64_OFFSET_MIN;
1✔
96
  const int max_score = PHRED_33_SCORE_MAX;
1✔
97
  const auto esc = escape_raw_score(raw);
1✔
98
  const auto esc_max = escape_raw_score(PHRED_33_OFFSET_MAX);
1✔
99

100
  std::ostringstream ss;
1✔
101
  ss << "Found Phred+33 encoded quality score of " << score << " (encoded as "
1✔
102
     << esc << "), which is greater than the expected maximum score of "
2✔
103
     << max_score << " (encoded as " << esc_max << "). This suggests that "
1✔
104
     << "input may be Phred+64 encoded.\n\n"
105

106
     << "If the quality scores are actually Phred+64 encoded, which would "
107
     << "mean that the Phred quality score is " << alt_score
2✔
108
     << ", then use the '--quality-format 64' command-line option.\n\n"
109

110
     << "If the quality scores are Phred+33 encoded, but have higher than "
111
        "expected scores, then use '--quality-format sam' to permit the full "
112
        "range of quality scores.\n\n"
113

114
     << "See the documentation for more information.";
1✔
115

116
  throw fastq_error(ss.str());
4✔
117
}
3✔
118

119
[[noreturn]] void
120
throw_invalid_phred_64(const char raw)
3✔
121
{
122
  AR_REQUIRE(raw < SOLEXA_OFFSET_MIN,
3✔
123
             "invalid_phred called on valid PHRED score");
124

125
  const int score = raw - PHRED_64_OFFSET_MIN;
3✔
126
  const int alt_score = raw - PHRED_33_OFFSET_MIN;
3✔
127
  const int min_score = PHRED_64_SCORE_MIN;
3✔
128
  const auto esc = escape_raw_score(raw);
3✔
129
  const auto esc_min = escape_raw_score(PHRED_64_OFFSET_MIN);
3✔
130

131
  std::ostringstream ss;
3✔
132
  ss << "Found Phred+64 encoded quality score of " << score << " (encoded as "
3✔
133
     << esc << "), which is less than the expected minimum score of "
6✔
134
     << min_score << " (encoded as " << esc_min << "). This suggests that "
3✔
135
     << "input may be Phred+33 encoded.\n\n"
136

137
     << "If the quality scores are actually Phred+33 encoded, which would "
138
     << "mean that the Phred quality score is " << alt_score
6✔
139
     << ", then omit the '--quality-format' command-line option or use "
140
     << "'--quality-format 33' to explicitly set the format.\n\n";
3✔
141

142
  if (score >= SOLEXA_SCORE_MIN) {
3✔
143
    ss << "The quality score could also be the older Solexa format, which has "
2✔
144
       << "a minimum score of -5, but data of this type is rare. If it is "
145
       << "actually Solexa encoded FASTQ data, then use the '--quality-format "
146
       << "solexa' command-line option.\n\n";
2✔
147
  }
148

149
  ss << "See the documentation for more information.";
3✔
150

151
  throw fastq_error(ss.str());
12✔
152
}
9✔
153

154
[[noreturn]] void
155
throw_invalid_solexa(const char raw)
2✔
156
{
157
  const int score = raw - SOLEXA_OFFSET_MIN;
2✔
158
  const int alt_score = raw - PHRED_33_OFFSET_MIN;
2✔
159
  const int min_score = SOLEXA_SCORE_MIN;
2✔
160
  const auto esc = escape_raw_score(raw);
2✔
161
  const auto esc_min = escape_raw_score(SOLEXA_OFFSET_MIN + SOLEXA_SCORE_MIN);
2✔
162
  const auto esc_max = escape_raw_score(SOLEXA_OFFSET_MAX);
2✔
163

164
  std::ostringstream ss;
2✔
165
  if (score < SOLEXA_SCORE_MIN) {
2✔
166
    ss << "Found Solexa encoded quality score of " << score << " (encoded as "
1✔
167
       << esc << "), which is less than the expected minimum score of "
2✔
168
       << min_score << " (encoded as " << esc_min << "). This suggests that "
1✔
169
       << "input may be Phred+33 encoded.\n\n"
170

171
       << "If the quality scores are actually Phred+33 encoded, which would "
172
       << "mean that the Phred quality score is " << alt_score
2✔
173
       << ", then omit the '--quality-format' command-line option or use "
174
       << "'--quality-format 33' to explicitly set the format.\n\n"
175

176
       << "See the documentation for more information.";
1✔
177
  } else if (raw > SOLEXA_OFFSET_MAX) {
1✔
178
    ss << "Found Solexa encoded quality score of " << score << " (encoded as "
1✔
179
       << esc << "), which is greater than the expected maximum score of "
2✔
180
       << SOLEXA_SCORE_MAX << " (encoded as " << esc_max << ").\n\n"
1✔
181

182
       << "See the documentation for more information.";
2✔
183
  } else {
184
    AR_FAIL("invalid_phred called on valid PHRED score");
×
185
  }
186

187
  throw fastq_error(ss.str());
8✔
188
}
8✔
189

190
[[noreturn]] void
191
throw_invalid_score(const quality_encoding encoding, const char raw_score)
12✔
192
{
193
  if (raw_score < PHRED_OFFSET_MIN || raw_score > PHRED_OFFSET_MAX) {
12✔
194
    std::ostringstream ss;
6✔
195

196
    ss << "Found raw FASTQ quality score of " << static_cast<int>(raw_score)
6✔
197
       << " (" << escape_raw_score(raw_score) << "). This is outside of the "
12✔
198
       << "range of valid ASCII encoded quality scores (" << PHRED_OFFSET_MIN
12✔
199
       << " to " << PHRED_OFFSET_MAX << "), meaning that the input file is "
6✔
200
       << "either corrupt or not in FASTQ format!";
6✔
201

202
    throw fastq_error(ss.str());
24✔
203
  }
6✔
204

205
  switch (encoding) {
6✔
206
    case quality_encoding::phred_33:
1✔
207
      throw_invalid_phred_33(raw_score);
1✔
208

209
    case quality_encoding::phred_64:
3✔
210
      throw_invalid_phred_64(raw_score);
3✔
211

212
    case quality_encoding::solexa:
2✔
213
      throw_invalid_solexa(raw_score);
2✔
214

215
    case quality_encoding::sam:
×
216
      AR_FAIL("This case should have been handled by initial check");
×
217

218
    default:
×
219
      AR_FAIL("Invalid quality encoding");
×
220
  }
221
}
222

223
[[noreturn]] void
224
throw_invalid_base(char c)
4✔
225
{
226
  std::ostringstream stream;
4✔
227

228
  switch (c) {
4✔
229
    case 'B': // C / G / T
1✔
230
    case 'D': // A / G / T
1✔
231
    case 'H': // A / C / T
1✔
232
    case 'K': // G / T
1✔
233
    case 'M': // A / C
1✔
234
    case 'R': // A / G
1✔
235
    case 'S': // C / G
1✔
236
    case 'V': // A / C / G
1✔
237
    case 'W': // A / T
1✔
238
    case 'Y': // C / T
1✔
239
      stream << "found degenerate base '" << c << "' in FASTQ sequence, but "
1✔
240
             << "only bases A, C, G, T and N are supported. Use the option "
241
                "--mask-degenerate-bases to convert degenerate bases to N";
1✔
242
      break;
243

244
    case 'U': // Uracils
×
245
      stream << "found uracil (U) in FASTQ sequence, but only bases A, C, G, "
×
246
                "T and N are supported. Use the option --convert-uracils to "
247
                "convert uracils (U) to thymine (T)";
×
248
      break;
249

250
    default:
3✔
251
      stream << "invalid character " << log_escape(std::string(1, c))
18✔
252
             << " found in FASTQ sequence";
6✔
253
      break;
3✔
254
  }
255

256
  throw fastq_error(stream.str());
16✔
257
}
4✔
258

259
void
260
process_nucleotides_strict(std::string& nucleotides)
1,208✔
261
{
262
  for (char& nuc : nucleotides) {
25,947✔
263
    // Fast ASCII letter uppercase
264
    const auto upper_case = nuc & 0xDF;
7,041✔
265
    switch (upper_case) {
7,041✔
266
      case 'A':
7,037✔
267
      case 'C':
7,037✔
268
      case 'G':
7,037✔
269
      case 'T':
7,037✔
270
      case 'N':
7,037✔
271
        nuc = upper_case;
7,037✔
272
        break;
7,037✔
273

274
      default:
4✔
275
        throw_invalid_base(nuc);
4✔
276
    }
277
  }
278
}
1,204✔
279

280
void
281
process_nucleotides_lenient(std::string& nucleotides,
×
282
                            const bool convert_uracil,
283
                            const bool mask_degenerate)
284
{
285
  for (char& nuc : nucleotides) {
×
286
    // Fast ASCII letter uppercase
287
    const auto upper_case = nuc & 0xDF;
×
288
    switch (upper_case) {
×
289
      case 'A':
×
290
      case 'C':
×
291
      case 'G':
×
292
      case 'T':
×
293
      case 'N':
×
294
        nuc = upper_case;
×
295
        break;
×
296

297
      // Uracils
298
      case 'U':
×
299
        if (convert_uracil) {
×
300
          nuc = 'T';
×
301
          break;
×
302
        } else {
303
          throw_invalid_base(nuc);
×
304
        }
305

306
      // IUPAC encoded degenerate bases
307
      case 'B': // C / G / T
×
308
      case 'D': // A / G / T
×
309
      case 'H': // A / C / T
×
310
      case 'K': // G / T
×
311
      case 'M': // A / C
×
312
      case 'R': // A / G
×
313
      case 'S': // C / G
×
314
      case 'V': // A / C / G
×
315
      case 'W': // A / T
×
316
      case 'Y': // C / T
×
317
        if (mask_degenerate) {
×
318
          nuc = 'N';
×
319
          break;
×
320
        } else {
321
          throw_invalid_base(nuc);
×
322
        }
323

324
      default:
×
325
        throw_invalid_base(nuc);
×
326
    }
327
  }
328
}
329

330
} // namespace
331

332
///////////////////////////////////////////////////////////////////////////////
333

334
fastq_encoding::fastq_encoding(quality_encoding encoding,
36✔
335
                               degenerate_encoding degenerate,
336
                               uracil_encoding uracils) noexcept
36✔
337
  : m_mask_degenerate()
36✔
338
  , m_convert_uracil()
36✔
339
  , m_encoding(encoding)
36✔
340
  , m_offset_min()
36✔
341
  , m_offset_max()
36✔
342
{
343
  switch (degenerate) {
36✔
344
    case degenerate_encoding::reject:
36✔
345
      m_mask_degenerate = false;
36✔
346
      break;
36✔
347
    case degenerate_encoding::mask:
×
348
      m_mask_degenerate = true;
×
349
      break;
×
350
    default:
×
351
      AR_FAIL("invalid degenerate encoding value");
×
352
  }
353

354
  switch (uracils) {
36✔
355
    case uracil_encoding::convert:
×
356
      m_convert_uracil = true;
×
357
      break;
×
358
    case uracil_encoding::reject:
36✔
359
      m_convert_uracil = false;
36✔
360
      break;
36✔
361
    default:
×
362
      AR_FAIL("invalid uracil encoding value");
×
363
  }
364

365
  switch (encoding) {
36✔
366
    case quality_encoding::phred_33:
9✔
367
      m_offset_min = PHRED_33_OFFSET_MIN;
9✔
368
      m_offset_max = PHRED_33_OFFSET_MAX;
9✔
369
      break;
9✔
370

371
    case quality_encoding::phred_64:
9✔
372
      m_offset_min = PHRED_64_OFFSET_MIN;
9✔
373
      m_offset_max = PHRED_64_OFFSET_MAX;
9✔
374
      break;
9✔
375

376
    case quality_encoding::solexa:
9✔
377
      m_offset_min = SOLEXA_OFFSET_MIN;
9✔
378
      m_offset_max = SOLEXA_OFFSET_MAX;
9✔
379
      break;
9✔
380

381
    case quality_encoding::sam:
9✔
382
      m_offset_min = PHRED_OFFSET_MIN;
9✔
383
      m_offset_max = PHRED_OFFSET_MAX;
9✔
384
      break;
9✔
385

386
    default:
×
387
      AR_FAIL("unknown quality encoding");
×
388
  }
389
}
36✔
390

391
void
392
fastq_encoding::process_nucleotides(std::string& sequence) const
1,208✔
393
{
394
  if (m_mask_degenerate || m_convert_uracil) {
1,208✔
395
    process_nucleotides_lenient(sequence, m_convert_uracil, m_mask_degenerate);
×
396
  } else {
397
    process_nucleotides_strict(sequence);
1,208✔
398
  }
399
}
1,204✔
400

401
void
402
fastq_encoding::process_qualities(std::string& qualities) const
1,204✔
403
{
404
  if (m_encoding == quality_encoding::solexa) {
1,204✔
405
    for (auto& quality : qualities) {
94✔
406
      const char current = quality;
20✔
407
      // TODO: Handle negative values, e.g. รจ
408
      if (!(quality = g_solexa_to_phred33.at(quality))) {
40✔
409
        throw_invalid_score(m_encoding, current);
2✔
410
      }
411
    }
412
  } else {
413
    for (auto& quality : qualities) {
25,761✔
414
      if (quality < m_offset_min || quality > m_offset_max) {
6,995✔
415
        throw_invalid_score(m_encoding, quality);
10✔
416
      }
417

418
      // Convert Phred+64 to Phred+33 if needed
419
      quality -= m_offset_min - PHRED_33_OFFSET_MIN;
6,985✔
420
    }
421
  }
422
}
1,192✔
423

424
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc