• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #38

07 Aug 2024 07:47PM UTC coverage: 83.365% (-3.5%) from 86.839%
#38

push

travis-ci

MikkelSchubert
additional tests

2190 of 2627 relevant lines covered (83.37%)

15528.72 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.21
/src/fastq_enc.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk            *
5
 * Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com           *
6
 *                                                                       *
7
 * This program is free software: you can redistribute it and/or modify  *
8
 * it under the terms of the GNU General Public License as published by  *
9
 * the Free Software Foundation, either version 3 of the License, or     *
10
 * (at your option) any later version.                                   *
11
 *                                                                       *
12
 * This program is distributed in the hope that it will be useful,       *
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
15
 * GNU General Public License for more details.                          *
16
 *                                                                       *
17
 * You should have received a copy of the GNU General Public License     *
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
19
\*************************************************************************/
20
#include "fastq_enc.hpp" // header
21
#include "debug.hpp"     // for AR_FAIL, AR_REQUIRE
22
#include "errors.hpp"    // for fastq_error
23
#include "strutils.hpp"  // for shell_escape
24
#include <algorithm>     // for min, max
25
#include <array>         // for array
26
#include <cmath>         // for log10, pow, round
27
#include <sstream>       // for ostringstream
28
#include <stdexcept>     // for invalid_argument
29

30
namespace adapterremoval {
31

32
//! Offset used by Phred+33 and SAM encodings
33
const int PHRED_33_OFFSET_MIN = '!';
34
//! The maximum ASCII value allowed for Phred+33 encoded quality scores. This is
35
//! a generous maximum, taking into account that instruments are assigning
36
//! higher and higher quality scores for Phred+33 data.
37
const int PHRED_33_OFFSET_MAX = 'N';
38
//! Minimum Phred score allowed
39
// const int PHRED_33_SCORE_MIN = 0;
40
//! Maximum Phred score allowed
41
const int PHRED_33_SCORE_MAX = PHRED_33_OFFSET_MAX - PHRED_33_OFFSET_MIN;
42

43
//! Offset used by Phred+64 encodings
44
const int PHRED_64_OFFSET_MIN = '@';
45
//! The maximum ASCII value allowed for encoded Phred scores
46
const int PHRED_64_OFFSET_MAX = '~';
47
//! Minimum Phred+64 score allowed
48
const int PHRED_64_SCORE_MIN = 0;
49
//! Maximum Phred+64 score allowed
50
// const int PHRED_64_SCORE_MAX = PHRED_64_OFFSET_MAX - PHRED_64_OFFSET_MIN;
51

52
//! Offset used by Solexa encoding quality scores
53
const int SOLEXA_OFFSET_MIN = '@';
54
//! The maximum ASCII value allowed for encoded Solexa scores
55
const int SOLEXA_OFFSET_MAX = 'h';
56
//! Minimum Phred encoded score allowed
57
const int SOLEXA_SCORE_MIN = -5;
58
//! Maximum Phred encoded score allowed
59
const int SOLEXA_SCORE_MAX = SOLEXA_OFFSET_MAX - SOLEXA_OFFSET_MIN;
60

61
///////////////////////////////////////////////////////////////////////////////
62
// Pre-calculation of Solexa <-> Phred conversions
63

64
std::array<char, 256>
65
calc_solexa_to_phred33()
1✔
66
{
67
  std::array<char, 256> scores = { 0 };
1✔
68

69
  for (int i = SOLEXA_SCORE_MIN; i <= SOLEXA_SCORE_MAX; ++i) {
47✔
70
    const double score = round(10.0 * log10(1.0 + pow(10, (i / 10.0))));
46✔
71
    const int transformed =
46✔
72
      std::max<int>(PHRED_SCORE_MIN, std::min<int>(PHRED_SCORE_MAX, score));
138✔
73
    scores.at(i + SOLEXA_OFFSET_MIN) = PHRED_OFFSET_MIN + transformed;
92✔
74
  }
75

76
  return scores;
1✔
77
}
78

79
const auto g_solexa_to_phred33 = calc_solexa_to_phred33();
80

81
///////////////////////////////////////////////////////////////////////////////
82

83
/** Returns the quality score in a form that is human readable */
84
std::string
85
escape_raw_score(char raw)
20✔
86
{
87
  return shell_escape(std::string(1, raw));
80✔
88
}
89

90
[[noreturn]] void
91
throw_invalid_phred_33(const char raw)
1✔
92
{
93
  const int score = raw - PHRED_33_OFFSET_MIN;
1✔
94
  const int alt_score = raw - PHRED_64_OFFSET_MIN;
1✔
95
  const int max_score = PHRED_33_SCORE_MAX;
1✔
96
  const auto esc = escape_raw_score(raw);
1✔
97
  const auto esc_max = escape_raw_score(PHRED_33_OFFSET_MAX);
1✔
98

99
  std::ostringstream ss;
1✔
100
  ss << "Found Phred+33 encoded quality score of " << score << " (encoded as "
1✔
101
     << esc << "), which is greater than the expected maximum score of "
2✔
102
     << max_score << " (encoded as " << esc_max << "). This suggests that "
1✔
103
     << "input may be Phred+64 encoded.\n\n"
104

105
     << "If the quality scores are actually Phred+64 encoded, which would "
106
     << "mean that the Phred quality score is " << alt_score
2✔
107
     << ", then use the '--quality-format 64' command-line option.\n\n"
108

109
     << "If the quality scores are Phred+33 encoded, but have higher than "
110
        "expected scores, then use '--quality-format sam' to permit the full "
111
        "range of quality scores.\n\n"
112

113
     << "See the documentation for more information.";
1✔
114

115
  throw fastq_error(ss.str());
4✔
116
}
3✔
117

118
[[noreturn]] void
119
throw_invalid_phred_64(const char raw)
3✔
120
{
121
  AR_REQUIRE(raw < SOLEXA_OFFSET_MIN,
3✔
122
             "invalid_phred called on valid PHRED score");
123

124
  const int score = raw - PHRED_64_OFFSET_MIN;
3✔
125
  const int alt_score = raw - PHRED_33_OFFSET_MIN;
3✔
126
  const int min_score = PHRED_64_SCORE_MIN;
3✔
127
  const auto esc = escape_raw_score(raw);
3✔
128
  const auto esc_min = escape_raw_score(PHRED_64_OFFSET_MIN);
3✔
129

130
  std::ostringstream ss;
3✔
131
  ss << "Found Phred+64 encoded quality score of " << score << " (encoded as "
3✔
132
     << esc << "), which is less than the expected minimum score of "
6✔
133
     << min_score << " (encoded as " << esc_min << "). This suggests that "
3✔
134
     << "input may be Phred+33 encoded.\n\n"
135

136
     << "If the quality scores are actually Phred+33 encoded, which would "
137
     << "mean that the Phred quality score is " << alt_score
6✔
138
     << ", then omit the '--quality-format' command-line option or use "
139
     << "'--quality-format 33' to explicitly set the format.\n\n";
3✔
140

141
  if (score >= SOLEXA_SCORE_MIN) {
3✔
142
    ss << "The quality score could also be the older Solexa format, which has "
2✔
143
       << "a minimum score of -5, but data of this type is rare. If it is "
144
       << "actually Solexa encoded FASTQ data, then use the '--quality-format "
145
       << "solexa' command-line option.\n\n";
2✔
146
  }
147

148
  ss << "See the documentation for more information.";
3✔
149

150
  throw fastq_error(ss.str());
12✔
151
}
9✔
152

153
[[noreturn]] void
154
throw_invalid_solexa(const char raw)
2✔
155
{
156
  const int score = raw - SOLEXA_OFFSET_MIN;
2✔
157
  const int alt_score = raw - PHRED_33_OFFSET_MIN;
2✔
158
  const int min_score = SOLEXA_SCORE_MIN;
2✔
159
  const auto esc = escape_raw_score(raw);
2✔
160
  const auto esc_min = escape_raw_score(SOLEXA_OFFSET_MIN + SOLEXA_SCORE_MIN);
2✔
161
  const auto esc_max = escape_raw_score(SOLEXA_OFFSET_MAX);
2✔
162

163
  std::ostringstream ss;
2✔
164
  if (score < SOLEXA_SCORE_MIN) {
2✔
165
    ss << "Found Solexa encoded quality score of " << score << " (encoded as "
1✔
166
       << esc << "), which is less than the expected minimum score of "
2✔
167
       << min_score << " (encoded as " << esc_min << "). This suggests that "
1✔
168
       << "input may be Phred+33 encoded.\n\n"
169

170
       << "If the quality scores are actually Phred+33 encoded, which would "
171
       << "mean that the Phred quality score is " << alt_score
2✔
172
       << ", then omit the '--quality-format' command-line option or use "
173
       << "'--quality-format 33' to explicitly set the format.\n\n"
174

175
       << "See the documentation for more information.";
1✔
176
  } else if (raw > SOLEXA_OFFSET_MAX) {
1✔
177
    ss << "Found Solexa encoded quality score of " << score << " (encoded as "
1✔
178
       << esc << "), which is greater than the expected maximum score of "
2✔
179
       << SOLEXA_SCORE_MAX << " (encoded as " << esc_max << ").\n\n"
1✔
180

181
       << "See the documentation for more information.";
2✔
182
  } else {
183
    AR_FAIL("invalid_phred called on valid PHRED score");
×
184
  }
185

186
  throw fastq_error(ss.str());
8✔
187
}
8✔
188

189
[[noreturn]] void
190
throw_invalid_score(const quality_encoding encoding, const char raw_score)
12✔
191
{
192
  if (raw_score < PHRED_OFFSET_MIN || raw_score > PHRED_OFFSET_MAX) {
12✔
193
    std::ostringstream ss;
6✔
194

195
    ss << "Found raw FASTQ quality score of " << static_cast<int>(raw_score)
6✔
196
       << " (" << escape_raw_score(raw_score) << "). This is outside of the "
12✔
197
       << "range of valid ASCII encoded quality scores (" << PHRED_OFFSET_MIN
12✔
198
       << " to " << PHRED_OFFSET_MAX << "), meaning that the input file is "
6✔
199
       << "either corrupt or not in FASTQ format!";
6✔
200

201
    throw fastq_error(ss.str());
24✔
202
  }
6✔
203

204
  switch (encoding) {
6✔
205
    case quality_encoding::phred_33:
1✔
206
      throw_invalid_phred_33(raw_score);
1✔
207

208
    case quality_encoding::phred_64:
3✔
209
      throw_invalid_phred_64(raw_score);
3✔
210

211
    case quality_encoding::solexa:
2✔
212
      throw_invalid_solexa(raw_score);
2✔
213

214
    case quality_encoding::sam:
×
215
      AR_FAIL("This case should have been handled by initial check");
×
216

217
    default:
×
218
      AR_FAIL("Invalid quality encoding");
×
219
  }
220
}
221

222
///////////////////////////////////////////////////////////////////////////////
223

224
fastq_encoding::fastq_encoding(quality_encoding encoding) noexcept
36✔
225
  : m_encoding(encoding)
36✔
226
  , m_offset_min()
36✔
227
  , m_offset_max()
36✔
228
{
229
  switch (encoding) {
36✔
230
    case quality_encoding::phred_33:
9✔
231
      m_offset_min = PHRED_33_OFFSET_MIN;
9✔
232
      m_offset_max = PHRED_33_OFFSET_MAX;
9✔
233
      break;
9✔
234

235
    case quality_encoding::phred_64:
9✔
236
      m_offset_min = PHRED_64_OFFSET_MIN;
9✔
237
      m_offset_max = PHRED_64_OFFSET_MAX;
9✔
238
      break;
9✔
239

240
    case quality_encoding::solexa:
9✔
241
      m_offset_min = SOLEXA_OFFSET_MIN;
9✔
242
      m_offset_max = SOLEXA_OFFSET_MAX;
9✔
243
      break;
9✔
244

245
    case quality_encoding::sam:
9✔
246
      m_offset_min = PHRED_OFFSET_MIN;
9✔
247
      m_offset_max = PHRED_OFFSET_MAX;
9✔
248
      break;
9✔
249

250
    default:
×
251
      AR_FAIL("unknown encoding");
×
252
  }
253
}
36✔
254

255
void
256
fastq_encoding::decode(std::string& qualities) const
1,204✔
257
{
258
  if (m_encoding == quality_encoding::solexa) {
1,204✔
259
    for (auto& quality : qualities) {
94✔
260
      const char current = quality;
20✔
261
      // TODO: Handle negative values, e.g. รจ
262
      if (!(quality = g_solexa_to_phred33.at(quality))) {
40✔
263
        throw_invalid_score(m_encoding, current);
2✔
264
      }
265
    }
266
  } else {
267
    for (auto& quality : qualities) {
25,761✔
268
      if (quality < m_offset_min || quality > m_offset_max) {
6,995✔
269
        throw_invalid_score(m_encoding, quality);
10✔
270
      }
271

272
      // Convert Phred+64 to Phred+33 if needed
273
      quality -= m_offset_min - PHRED_33_OFFSET_MIN;
6,985✔
274
    }
275
  }
276
}
1,192✔
277

278
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc