• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #46

27 Nov 2024 03:10PM UTC coverage: 27.245% (+1.0%) from 26.244%
#46

push

travis-ci

MikkelSchubert
fix convenience executable make target

2609 of 9576 relevant lines covered (27.25%)

4268.73 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/reports_html.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2022 by Mikkel Schubert - mikkelsch@gmail.com           *
5
 *                                                                       *
6
 * This program is free software: you can redistribute it and/or modify  *
7
 * it under the terms of the GNU General Public License as published by  *
8
 * the Free Software Foundation, either version 3 of the License, or     *
9
 * (at your option) any later version.                                   *
10
 *                                                                       *
11
 * This program is distributed in the hope that it will be useful,       *
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
14
 * GNU General Public License for more details.                          *
15
 *                                                                       *
16
 * You should have received a copy of the GNU General Public License     *
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
18
\*************************************************************************/
19
#include "adapter_id.hpp"            // for adapter_id_statistics
20
#include "counts.hpp"                // for counts, indexed_count, counts_tmpl
21
#include "debug.hpp"                 // for AR_REQUIRE
22
#include "fastq.hpp"                 // for ACGT, ACGT::values, fastq, ACGTN
23
#include "json.hpp"                  // for json_dict, json_list, json_ptr
24
#include "logging.hpp"               // for log_stream, error
25
#include "main.hpp"                  // for VERSION, NAME
26
#include "managed_io.hpp"            // for managed_io
27
#include "output.hpp"                // for DEV_NULL, output_files
28
#include "reports.hpp"               // for write_html_report
29
#include "reports_template_html.hpp" // for html_frequency_plot, html_demultiple...
30
#include "sequence_sets.hpp"         // for adapter_set
31
#include "simd.hpp"                  // for size_t
32
#include "statistics.hpp"            // for fastq_stats_ptr, fastq_statistics
33
#include "strutils.hpp"              // for format_percentage, format_rough...
34
#include "userconfig.hpp"            // for userconfig, ar_command, DEV_NULL
35
#include <algorithm>                 // for max
36
#include <cctype>                    // for toupper
37
#include <cerrno>                    // for errno
38
#include <cmath>                     // for fmod
39
#include <cstdint>                   // for uint64_t
40
#include <cstring>                   // for size_t, strerror
41
#include <iomanip>                   // for operator<<, setprecision, setw
42
#include <memory>                    // for __shared_ptr_access, shared_ptr
43
#include <sstream>                   // for ostringstream
44
#include <string>                    // for string, operator==, to_string
45
#include <utility>                   // for pair
46
#include <vector>                    // for vector
47

48
namespace adapterremoval {
49

50
namespace {
51

52
using fastq_stats_vec = std::vector<fastq_stats_ptr>;
53
using template_ptr = std::unique_ptr<html_template>;
54

55
//! Size chosen to allow fitting two pages side-by-side on a 1920 width display
56
const char* const FIGURE_WIDTH = "736";
57
//! Per figure width for two-column facet figures; approximate
58
const char* const FACET_WIDTH_2 = "351";
59
//! Per figure width for one-column facet figures; approximate
60
const char* const FACET_WIDTH_1 = FIGURE_WIDTH;
61

62
////////////////////////////////////////////////////////////////////////////////
63

64
/** Escapes a string that needs to be embedded in a JS */
65
std::string
66
json_encode(const std::string& s)
×
67
{
68
  return json_token::from_str(s)->to_string();
×
69
}
70

71
/** JSON escaped string */
72
std::string
73
operator""_json(const char* s, size_t length)
×
74
{
75
  return json_encode(std::string(s, length));
×
76
}
77

78
std::string
79
runtime_to_str(double seconds)
×
80
{
81
  std::ostringstream ss;
×
82

83
  if (seconds >= 3600.0) {
×
84
    ss << static_cast<size_t>(seconds / 3600.0) << " "
×
85
       << (seconds >= 7200.0 ? "hours, " : "hour, ") << std::setw(2);
×
86
  }
87

88
  if (seconds >= 60.0) {
×
89
    auto minutes = static_cast<size_t>(std::fmod(seconds, 3600.0) / 60.0);
×
90
    ss << minutes << " "
×
91
       << ((!minutes || minutes >= 120) ? "minutes" : "minute") << ", and "
×
92
       << std::setw(4);
×
93
  }
94

95
  ss << std::fixed << std::setprecision(1) << std::fmod(seconds, 60.0)
×
96
     << " seconds";
×
97

98
  return ss.str();
×
99
}
100

101
std::string
102
mean_of_bp_counts(const counts& count)
×
103
{
104
  auto reads = count.sum();
×
105
  auto bases = count.product();
×
106

107
  if (!reads) {
×
108
    return "NA";
×
109
  }
110

111
  if (bases % reads == 0) {
×
112
    return std::to_string(bases / reads) + " bp";
×
113
  }
114

115
  std::ostringstream ss;
×
116
  ss << std::fixed << std::setprecision(1)
×
117
     << (bases / static_cast<double>(reads)) << " bp";
×
118

119
  return ss.str();
×
120
}
121

122
/**
123
 * VEGA-lite will omit plots if there are no values; this function therefore
124
 * ensures that at least one value is written for a given measurement.
125
 */
126
template<typename T>
127
counts_tmpl<T>
128
require_values(counts_tmpl<T> r, T fallback = T())
×
129
{
130
  if (r.size()) {
×
131
    return r;
×
132
  }
133

134
  return counts_tmpl<T>({ fallback });
×
135
}
136

137
std::string
138
format_average_bases(const reads_and_bases& counts)
×
139
{
140
  const auto reads = counts.reads();
×
141

142
  if (reads) {
×
143
    return format_fraction(counts.bases(), reads, 1) + " bp";
×
144
  } else {
145
    return "NA";
×
146
  }
147
}
148

149
////////////////////////////////////////////////////////////////////////////////
150

151
class io_summary_writer
152
{
153
public:
154
  enum class io
155
  {
156
    input,
157
    output
158
  };
159

160
  io_summary_writer(std::ostream& output, const io type)
×
161
    : m_output(output)
×
162
    , m_type(type)
×
163

164
  {
165
  }
166

167
  void write_head(const std::string& title, const std::string& href)
×
168
  {
169
    html_summary_io_head().set_title(title).set_href(href).write(m_output);
×
170
  }
171

172
  void write_row(const std::string& title, const fastq_statistics& stats)
×
173
  {
174
    const auto n_reads = (m_type == io::input) ? stats.number_of_input_reads()
×
175
                                               : stats.number_of_output_reads();
×
176
    const auto total = stats.quality_dist().sum();
×
177

178
    html_summary_io_row()
×
179
      .set_name(title)
×
180
      .set_n_reads(format_rough_number(n_reads))
×
181
      .set_n_bases(format_rough_number(stats.length_dist().product()))
×
182
      .set_lengths(mean_of_bp_counts(stats.length_dist()))
×
183
      .set_q30(format_percentage(stats.quality_dist().sum(30), total))
×
184
      .set_q20(format_percentage(stats.quality_dist().sum(20), total))
×
185
      .set_ns(format_percentage(stats.nucleotides_pos('N').sum(), total))
×
186
      .set_gc(format_percentage(stats.nucleotides_gc_pos().sum(), total))
×
187
      .write(m_output);
×
188
  }
189

190
  void write_tail() { html_summary_io_tail().write(m_output); }
×
191

192
private:
193
  std::ostream& m_output;
194
  io m_type;
195
};
196

197
std::string
198
build_base_qualities(const fastq_stats_vec& reads, const string_vec& names)
×
199
{
200
  json_list qualities;
×
201

202
  for (size_t i = 0; i < reads.size(); ++i) {
×
203
    const auto& stats = *reads.at(i);
×
204

205
    auto total_quality = stats.qualities_pos();
×
206
    auto total_bases = stats.nucleotides_pos();
×
207

208
    for (const auto nucleotide : ACGT::values) {
×
209
      const auto nucleotides = stats.nucleotides_pos(nucleotide);
×
210
      const auto quality = stats.qualities_pos(nucleotide);
×
211

212
      auto dict = qualities.dict();
×
213
      dict->str("read", names.at(i));
×
214
      dict->i64("offset", 1);
×
215
      dict->str("group", std::string(1, ::toupper(nucleotide)));
×
216
      dict->f64_vec("y", quality / nucleotides);
×
217
    }
218

219
    auto dict = qualities.dict();
×
220
    dict->str("read", names.at(i));
×
221
    dict->i64("offset", 1);
×
222
    dict->str("group", "Mean");
×
223

224
    // Ensure that values get written, to prevent the plot being omitted
225
    dict->f64_vec("y", require_values(total_quality / total_bases));
×
226
  }
227

228
  return qualities.to_string();
×
229
}
230

231
std::string
232
build_quality_distribution(const fastq_stats_vec& reads,
×
233
                           const string_vec& names)
234
{
235
  json_list data;
×
236

237
  for (size_t i = 0; i < reads.size(); ++i) {
×
238
    const auto& stats = reads.at(i);
×
239
    auto count = stats->quality_dist().trim();
×
240
    // A max that should give a uniform look to most data
241
    count.resize_up_to(44);
×
242

243
    const auto m = data.dict();
×
244
    m->str("group", names.at(i));
×
245
    m->i64("offset", 0);
×
246
    m->i64_vec("y", count);
×
247
  }
248

249
  return data.to_string();
×
250
}
251

252
std::string
253
build_base_content(const fastq_stats_vec& reads, const string_vec& names)
×
254
{
255
  json_list content;
×
256

257
  for (size_t i = 0; i < reads.size(); ++i) {
×
258
    const auto& stats = *reads.at(i);
×
259

260
    auto total_bases = stats.nucleotides_pos();
×
261

262
    for (const auto nucleotide : ACGTN::values) {
×
263
      const auto bases = stats.nucleotides_pos(nucleotide);
×
264

265
      const auto dict = content.dict();
×
266
      dict->str("read", names.at(i));
×
267
      dict->i64("offset", 1);
×
268
      dict->str("group", std::string(1, nucleotide));
×
269

270
      // Ensure that values get written, to prevent the plot being omitted
271
      dict->f64_vec("y", require_values(bases / total_bases));
×
272
    }
273

274
    {
×
275
      const auto gc_content = stats.nucleotides_gc_pos();
×
276
      auto dict = content.dict();
×
277
      dict->str("read", names.at(i));
×
278
      dict->i64("offset", 1);
×
279
      dict->str("group", "GC");
×
280

281
      // Ensure that values get written, to prevent the plot being omitted
282
      dict->f64_vec("y", require_values(gc_content / total_bases));
×
283
    }
284
  }
285

286
  return content.to_string();
×
287
}
288

289
////////////////////////////////////////////////////////////////////////////////
290
// Main sections
291

292
void
293
write_html_sampling_note(const userconfig& config,
×
294
                         const std::string& label,
295
                         const fastq_statistics& stats,
296
                         std::ostream& output)
297
{
298
  if (config.report_sample_rate < 1.0) {
×
299
    html_sampling_note()
×
300
      .set_label(label)
×
301
      .set_reads(format_rough_number((stats.number_of_sampled_reads())))
×
302
      .set_pct(format_percentage(stats.number_of_sampled_reads(),
×
303
                                 stats.number_of_input_reads()))
×
304
      .write(output);
×
305
  }
306
}
307

308
void
309
write_html_summary_section(const userconfig& config,
×
310
                           const statistics& stats,
311
                           std::ostream& output)
312
{
313
  html_head().set_title(config.report_title).write(output);
×
314

315
  html_body_start().set_title(config.report_title).write(output);
×
316

317
  // Basic information about the executable / call
318
  {
×
319
    html_summary()
×
320
      .set_date_and_time(userconfig::start_time)
×
321
      .set_version(VERSION)
×
322
      .set_command(shell_escape_command(config.args))
×
323
      .set_runtime(runtime_to_str(config.runtime()))
×
324
      .write(output);
×
325
  }
326

327
  fastq_statistics output_1;
×
328
  fastq_statistics output_2;
×
329
  fastq_statistics merged;
×
330
  fastq_statistics singleton;
×
331
  fastq_statistics discarded;
×
332

333
  for (const auto& it : stats.trimming) {
×
334
    output_1 += *it->read_1;
×
335
    output_2 += *it->read_2;
×
336
    merged += *it->merged;
×
337
    singleton += *it->singleton;
×
338
    discarded += *it->discarded;
×
339
  }
340

341
  if (config.paired_ended_mode) {
×
342
    // Summary statistics for input files
343
    {
×
344
      fastq_statistics totals;
×
345
      totals += *stats.input_1;
×
346
      totals += *stats.input_2;
×
347

348
      io_summary_writer summary(output, io_summary_writer::io::input);
×
349
      summary.write_head("Input", "summary-input");
×
350
      if (config.paired_ended_mode) {
×
351
        summary.write_row("Summary", totals);
×
352
        summary.write_row("File 1", *stats.input_1);
×
353
        summary.write_row("File 2", *stats.input_2);
×
354
      }
355
      summary.write_tail();
×
356

357
      write_html_sampling_note(config, "input", totals, output);
×
358
    }
359

360
    // Summary statistics for output files
361
    if (config.run_type != ar_command::report_only) {
×
362
      fastq_statistics totals;
×
363
      totals += output_1;
×
364
      totals += output_2;
×
365
      totals += merged;
×
366
      totals += singleton;
×
367
      // discarded reads not counted in the output
368
      // totals += discarded;
369

370
      io_summary_writer summary{ output, io_summary_writer::io::output };
×
371
      summary.write_head("Output", "summary-output");
×
372
      summary.write_row("Passed*", totals);
×
373
      if (config.paired_ended_mode) {
×
374
        summary.write_row("File 1", output_1);
×
375
        summary.write_row("File 2", output_2);
×
376

377
        if (config.is_read_merging_enabled()) {
×
378
          summary.write_row("Merged", merged);
×
379
        }
380

381
        if (config.is_any_filtering_enabled()) {
×
382
          summary.write_row("Singleton", singleton);
×
383
        }
384
      }
385

386
      if (config.is_any_filtering_enabled()) {
×
387
        summary.write_row("Discarded*", discarded);
×
388
      }
389
      summary.write_tail();
×
390

391
      write_html_sampling_note(config, "output", totals, output);
×
392

393
      // Note regarding passed / discarded reads
394
      html_output_footnote()
×
395
        .set_symbol("*")
×
396
        .set_html("The <b>Passed</b> column includes all read types except "
×
397
                  "for <b>Discarded</b> reads.")
398
        .write(output);
×
399
    }
400
  } else if (config.run_type == ar_command::report_only) {
×
401
    io_summary_writer summary{ output, io_summary_writer::io::input };
×
402
    summary.write_head("Input summary", "summary-input");
×
403
    summary.write_row("Input", *stats.input_1);
×
404
    summary.write_tail();
×
405

406
    write_html_sampling_note(config, "input", *stats.input_1, output);
×
407
  }
408

409
  else {
410
    io_summary_writer summary{ output, io_summary_writer::io::input };
×
411
    summary.write_head("Input/Output summary", "summary-input-output");
×
412
    summary.write_row("Input", *stats.input_1);
×
413
    summary.write_row("Output", output_1);
×
414
    if (config.is_any_filtering_enabled()) {
×
415
      summary.write_row("Discarded*", discarded);
×
416
    }
417
    summary.write_tail();
×
418

419
    fastq_statistics totals;
×
420
    totals += *stats.input_1;
×
421
    totals += output_1;
×
422

423
    write_html_sampling_note(config, "input/output", totals, output);
×
424

425
    if (config.is_any_filtering_enabled()) {
×
426
      // Note regarding discarded reads in output
427
      html_output_footnote()
×
428
        .set_symbol("*")
×
429
        .set_html("<b>Discarded</b> reads are not included in the "
×
430
                  "<b>Output</b> column.")
431
        .write(output);
×
432
    }
433
  }
434
}
435

436
//! Trimming statistics
437
struct trimming_stats
438
{
439
  size_t id;
440
  //! Processing stage relative to adapter trimming (pre, X, post)
441
  std::string stage;
442
  //! Row label 1 (step)
443
  std::string label_1;
444
  //! Row label 1 (sub-step)
445
  std::string label_2;
446
  //! Whether or not this step is enabled by command-line options
447
  bool enabled;
448
  //! Number of reads/bases trimmed/filtered
449
  reads_and_bases count;
450
};
451

452
void
453
write_html_trimming_stats(std::ostream& output,
×
454
                          const std::vector<trimming_stats>& stats,
455
                          const reads_and_bases& totals)
456
{
457
  size_t n_processing_steps = 0;
×
458
  size_t n_processing_steps_on = 0;
×
459
  size_t n_filtering_steps = 0;
×
460
  size_t n_filtering_steps_on = 0;
×
461

462
  size_t last_id = -1;
×
463
  size_t last_enabled = -1;
×
464
  for (const auto& it : stats) {
×
465
    if (it.id != last_id) {
×
466
      if (it.stage == "Processing") {
×
467
        n_processing_steps++;
×
468
      } else if (it.stage == "Filtering") {
×
469
        n_filtering_steps++;
×
470
      }
471

472
      last_id = it.id;
×
473
    }
474

475
    if (it.enabled && it.id != last_enabled) {
×
476
      if (it.stage == "Processing") {
×
477
        n_processing_steps_on++;
×
478
      } else if (it.stage == "Filtering") {
×
479
        n_filtering_steps_on++;
×
480
      }
481

482
      last_enabled = it.id;
×
483
    }
484
  }
485

486
  html_summary_trimming_head().write(output);
×
487

488
  std::string previous_stage;
×
489
  std::string previous_label_1;
×
490

491
  for (const auto& it : stats) {
×
492
    if (it.enabled) {
×
493
      const auto label_1 = it.label_1 == previous_label_1 ? "" : it.label_1;
×
494
      const auto stage = it.stage == previous_stage ? "" : it.stage;
×
495

496
      previous_stage = it.stage;
×
497
      previous_label_1 = it.label_1;
×
498

499
      html_summary_trimming_row()
×
500
        .set_stage(stage)
×
501
        .set_label_1(label_1)
×
502
        .set_label_2(it.label_2)
×
503
        .set_reads(format_rough_number(it.count.reads()))
×
504
        .set_pct_reads(format_percentage(it.count.reads(), totals.reads()))
×
505
        .set_bases(format_rough_number(it.count.bases()))
×
506
        .set_pct_bases(format_percentage(it.count.bases(), totals.bases()))
×
507
        .set_avg_bases(format_average_bases(it.count))
×
508
        .write(output);
×
509
    }
510
  }
511

512
  html_summary_trimming_tail()
×
513
    .set_n_enabled_filt(std::to_string(n_filtering_steps_on))
×
514
    .set_n_total_filt(std::to_string(n_filtering_steps))
×
515
    .set_n_enabled_proc(std::to_string(n_processing_steps_on))
×
516
    .set_n_total_proc(std::to_string(n_processing_steps))
×
517
    .write(output);
×
518
}
519

520
//! Filtering statistics
521
struct filtering_stats
522
{
523
  //! Filtering step
524
  std::string label;
525
  //! Whether or not this step is enabled by command-line options
526
  bool enabled;
527
  //! Number of reads/bases trimmed/filtered
528
  reads_and_bases count;
529
};
530

531
reads_and_bases
532
summarize_input(const fastq_stats_ptr& ptr)
×
533
{
534
  const auto n_bases = ptr->length_dist().product();
×
535
  AR_REQUIRE(n_bases >= 0);
×
536

537
  return reads_and_bases{ ptr->number_of_input_reads(),
×
538
                          static_cast<uint64_t>(n_bases) };
539
}
540

541
void
542
build_polyx_trimming_rows(std::vector<trimming_stats>& out,
×
543
                          const std::string& polyx_nucleotides,
544
                          const indexed_count<ACGT>& reads,
545
                          const indexed_count<ACGT>& bases,
546
                          const size_t id)
547
{
548
  for (const auto nucleotide : ACGT::values) {
×
549
    out.push_back(
×
550
      { id,
551
        "Processing",
552
        "Poly-X tails",
553
        std::string(1, nucleotide),
554
        polyx_nucleotides.find(nucleotide) != std::string::npos,
×
555
        reads_and_bases(reads.get(nucleotide), bases.get(nucleotide)) });
×
556
  }
557

558
  out.push_back({ id,
×
559
                  "Processing",
560
                  "Poly-X tails",
561
                  "*",
562
                  polyx_nucleotides.size() > 1,
×
563
                  reads_and_bases(reads.sum(), bases.sum()) });
×
564
}
565

566
void
567
write_html_processing_section(const userconfig& config,
×
568
                              const statistics& stats,
569
                              std::ostream& output)
570
{
571
  trimming_statistics totals;
×
572
  for (const auto& it : stats.trimming) {
×
573
    totals += *it;
×
574
  }
575

576
  uint64_t adapter_reads = 0;
×
577
  uint64_t adapter_bases = 0;
×
578

579
  for (size_t i = 0; i < config.samples.adapters().size(); ++i) {
×
580
    adapter_reads += totals.adapter_trimmed_reads.get(i);
×
581
    adapter_bases += totals.adapter_trimmed_bases.get(i);
×
582
  }
583

584
  const auto total_input =
×
585
    summarize_input(stats.input_1) + summarize_input(stats.input_2);
×
586

587
  reads_and_bases total_output;
×
588
  for (const auto& it : stats.trimming) {
×
589
    total_output += summarize_input(it->read_1);
×
590
    total_output += summarize_input(it->read_2);
×
591
    total_output += summarize_input(it->singleton);
×
592
    total_output += summarize_input(it->merged);
×
593
  }
594

595
  // Trimming steps prior to adapter trimming
596
  size_t step_id = 0;
×
597
  std::vector<trimming_stats> trimming = {
×
598
    { step_id++, "Input", "Raw reads", "-", true, total_input },
×
599
    { step_id++,
×
600
      "Processing",
601
      "Terminal bases",
602
      "-",
603
      config.is_terminal_base_pre_trimming_enabled(),
×
604
      totals.terminal_pre_trimmed },
605
  };
606

607
  build_polyx_trimming_rows(trimming,
×
608
                            config.pre_trim_poly_x,
×
609
                            totals.poly_x_pre_trimmed_reads,
610
                            totals.poly_x_pre_trimmed_bases,
611
                            step_id++);
612

613
  trimming.push_back({ step_id++,
×
614
                       "Processing",
615
                       "Adapters",
616
                       "-",
617
                       config.is_adapter_trimming_enabled(),
×
618
                       reads_and_bases(adapter_reads, adapter_bases) });
619

620
  trimming.push_back({ step_id++,
×
621
                       "Processing",
622
                       "Merging",
623
                       "-",
624
                       config.is_read_merging_enabled(),
×
625
                       totals.reads_merged });
626

627
  trimming.push_back({ step_id++,
×
628
                       "Processing",
629
                       "Terminal bases",
630
                       "-",
631
                       config.is_terminal_base_post_trimming_enabled(),
×
632
                       totals.terminal_post_trimmed });
633

634
  build_polyx_trimming_rows(trimming,
×
635
                            config.post_trim_poly_x,
×
636
                            totals.poly_x_post_trimmed_reads,
637
                            totals.poly_x_post_trimmed_bases,
638
                            step_id++);
639

640
  trimming.push_back({ step_id++,
×
641
                       "Processing",
642
                       "Low quality bases",
643
                       "-",
644
                       config.is_low_quality_trimming_enabled(),
×
645
                       totals.low_quality_trimmed });
646

647
  trimming.push_back({ step_id++,
×
648
                       "Filtering",
649
                       "Short reads",
650
                       "-",
651
                       config.is_short_read_filtering_enabled(),
×
652
                       totals.filtered_min_length });
653

654
  trimming.push_back({ step_id++,
×
655
                       "Filtering",
656
                       "Long reads",
657
                       "-",
658
                       config.is_long_read_filtering_enabled(),
×
659
                       totals.filtered_max_length });
660
  trimming.push_back({ step_id++,
×
661
                       "Filtering",
662
                       "Ambiguous bases",
663
                       "-",
664
                       config.is_ambiguous_base_filtering_enabled(),
×
665
                       totals.filtered_ambiguous });
666
  trimming.push_back({ step_id++,
×
667
                       "Filtering",
668
                       "Mean quality",
669
                       "-",
670
                       config.is_mean_quality_filtering_enabled(),
×
671
                       totals.filtered_mean_quality });
672
  trimming.push_back({ step_id++,
×
673
                       "Filtering",
674
                       "Low complexity reads",
675
                       "-",
676
                       config.is_low_complexity_filtering_enabled(),
×
677
                       totals.filtered_low_complexity });
678

679
  trimming.push_back(
×
680
    { step_id++, "Output", "Filtered reads", "-", true, total_output });
×
681

682
  write_html_trimming_stats(output, trimming, total_input);
×
683
}
684

685
void
686
write_html_section_title(const std::string& title, std::ostream& output)
×
687
{
688
  html_h2_tag().set_title(title).set_href(to_lower(title)).write(output);
×
689
}
690

691
void
692
write_html_io_section(const userconfig& config,
×
693
                      std::ostream& output,
694
                      const std::string& title,
695
                      fastq_stats_vec statistics,
696
                      string_vec names,
697
                      const fastq_stats_ptr& merged = fastq_stats_ptr())
698
{
699
  AR_REQUIRE(statistics.size() == names.size());
×
700

701
  write_html_section_title(title, output);
×
702

703
  const char* dynamic_width =
×
704
    config.paired_ended_mode || merged ? FACET_WIDTH_2 : FACET_WIDTH_1;
×
705

706
  html_plot_title()
×
707
    .set_href(to_lower(title) + "-position-qualities")
×
708
    .set_title("Position quality distribution")
×
709
    .write(output);
×
710
  html_facet_line_plot()
×
711
    .set_x_axis(config.is_read_merging_enabled() && merged ? "null"
×
712
                                                           : "Position"_json)
713
    .set_y_axis("Phred score"_json)
×
714
    .set_width(dynamic_width)
×
715
    .set_values(build_base_qualities(statistics, names))
×
716
    .write(output);
×
717

718
  if (config.is_read_merging_enabled() && merged) {
×
719
    html_facet_line_plot()
×
720
      .set_x_axis("Position"_json)
×
721
      .set_y_axis("Phred score"_json)
×
722
      .set_width(FIGURE_WIDTH)
×
723
      .set_values(build_base_qualities({ merged }, { "Merged" }))
×
724
      .write(output);
×
725
  }
726

727
  html_plot_title()
×
728
    .set_href(to_lower(title) + "-nucleotide-content")
×
729
    .set_title("Nucleotide content")
×
730
    .write(output);
×
731
  html_facet_line_plot()
×
732
    .set_x_axis(config.is_read_merging_enabled() && merged ? "null"
×
733
                                                           : "Position"_json)
734
    .set_y_axis("Frequency"_json)
×
735
    .set_width(dynamic_width)
×
736
    .set_values(build_base_content(statistics, names))
×
737
    .write(output);
×
738

739
  if (config.is_read_merging_enabled() && merged) {
×
740
    html_facet_line_plot()
×
741
      .set_x_axis("Position"_json)
×
742
      .set_y_axis("Frequency"_json)
×
743
      .set_width(FIGURE_WIDTH)
×
744
      .set_values(build_base_content({ merged }, { "Merged" }))
×
745
      .write(output);
×
746

747
    // Subsequent plots should include merged reads
748
    names.push_back("Merged");
×
749
    statistics.push_back(merged);
×
750
  }
751

752
  html_plot_title()
×
753
    .set_href(to_lower(title) + "-quality-scores")
×
754
    .set_title("Quality score distribution")
×
755
    .write(output);
×
756
  html_frequency_plot()
×
757
    .set_x_axis("Phred score"_json)
×
758
    .set_y_axis("Frequency"_json)
×
759
    .set_width(FIGURE_WIDTH)
×
760
    .set_values(build_quality_distribution(statistics, names))
×
761
    .write(output);
×
762

763
  {
×
764
    json_list data;
×
765

766
    for (size_t i = 0; i < statistics.size(); ++i) {
×
767
      const auto m = data.dict();
×
768
      m->str("group", names.at(i));
×
769
      m->i64("offset", 0);
×
770
      m->f64_vec("y", statistics.at(i)->gc_content());
×
771
    }
772

773
    html_plot_title()
×
774
      .set_href(to_lower(title) + "-gc-content")
×
775
      .set_title("GC Content")
×
776
      .write(output);
×
777
    html_frequency_plot()
×
778
      .set_x_axis("%GC"_json)
×
779
      .set_y_axis("Frequency"_json)
×
780
      .set_width(FIGURE_WIDTH)
×
781
      .set_values(data.to_string())
×
782
      .write(output);
×
783
  }
784
}
785

786
void
787
write_html_input_section(const userconfig& config,
×
788
                         const statistics& stats,
789
                         std::ostream& output)
790
{
791
  fastq_stats_vec stats_vec = { stats.input_1 };
×
792
  string_vec names = { "File 1" };
×
793

794
  if (config.paired_ended_mode) {
×
795
    stats_vec.push_back(stats.input_2);
×
796
    names.emplace_back("File 2");
×
797
  }
798

799
  write_html_io_section(
×
800
    config, output, "Input", std::move(stats_vec), std::move(names));
×
801
}
802

803
void
804
write_html_analyses_section(const userconfig& config,
×
805
                            const statistics& stats,
806
                            std::ostream& output)
807

808
{
809
  write_html_section_title("Analyses", output);
×
810

811
  // Insert size distribution
812
  if (config.paired_ended_mode) {
×
813
    counts insert_sizes;
×
814
    for (const auto& it : stats.trimming) {
×
815
      insert_sizes += it->insert_sizes;
×
816
    }
817

818
    json_list samples;
×
819
    const auto sample = samples.dict();
×
820
    sample->str("group", "insert_sizes");
×
821
    sample->i64("offset", 0);
×
822
    sample->i64_vec("y", insert_sizes);
×
823

824
    // FIXME: Specify "identified reads" when in demultiplexing mode and
825
    // correct format_percentage to merged / n_identified.
826
    std::ostringstream ss;
×
827
    ss << "Insert sizes inferred for "
×
828
       << format_percentage(insert_sizes.sum(),
×
829
                            stats.input_1->number_of_input_reads())
×
830
       << "% of reads";
×
831

832
    html_plot_title()
×
833
      .set_href("analyses-insert-sizes")
×
834
      .set_title("Insert-size distribution")
×
835
      .write(output);
×
836
    html_plot_sub_title().set_sub_title(ss.str()).write(output);
×
837
    html_frequency_plot()
×
838
      .set_x_axis("Insert size"_json)
×
839
      .set_y_axis("Frequency"_json)
×
840
      .set_legend("null")
×
841
      .set_width(FIGURE_WIDTH)
×
842
      .set_values(samples.to_string())
×
843
      .write(output);
×
844

845
    if (config.run_type == ar_command::report_only) {
×
846
      html_output_note()
×
847
        .set_text(
×
848
          "Insert size distribution inferred using adapter-free alignments.")
849
        .write(output);
×
850
    }
851
  }
852

853
  // Consensus adapter sequence inference
854
  if (config.paired_ended_mode && config.run_type == ar_command::report_only) {
×
855
    AR_REQUIRE(stats.adapter_id);
×
856

857
    const auto adapter_1 = stats.adapter_id->adapter1.summarize();
×
858
    const auto adapter_2 = stats.adapter_id->adapter2.summarize();
×
859

860
    // Consensus adapter sequences
861
    {
×
862
      const auto reference_adapters =
×
863
        config.samples.adapters().to_read_orientation().front();
×
864
      std::string reference_adapter_1{ reference_adapters.first };
×
865
      std::string reference_adapter_2{ reference_adapters.second };
×
866

867
      html_consensus_adapter_head()
×
868
        .set_overlapping_pairs(
×
869
          format_rough_number(stats.adapter_id->aligned_pairs))
×
870
        .set_pairs_with_adapters(
×
871
          format_rough_number(stats.adapter_id->pairs_with_adapters))
×
872
        .write(output);
×
873

874
      html_consensus_adapter_table()
×
875
        .set_name_1("--adapter1")
×
876
        .set_reference_1(reference_adapter_1)
×
877
        .set_alignment_1(adapter_1.compare_with(reference_adapter_1))
×
878
        .set_consensus_1(adapter_1.adapter().sequence())
×
879
        .set_qualities_1(adapter_1.adapter().qualities())
×
880
        .set_name_2("--adapter2")
×
881
        .set_reference_2(reference_adapter_2)
×
882
        .set_alignment_2(adapter_2.compare_with(reference_adapter_2))
×
883
        .set_consensus_2(adapter_2.adapter().sequence())
×
884
        .set_qualities_2(adapter_2.adapter().qualities())
×
885
        .write(output);
×
886
    }
887

888
    // Top N most common 5' kmers in adapter fragments
889
    {
×
890
      const auto& top_kmers_1 = adapter_1.top_kmers();
×
891
      const auto& top_kmers_2 = adapter_2.top_kmers();
×
892

893
      html_consensus_adapter_kmer_head()
×
894
        .set_n_kmers(std::to_string(consensus_adapter_stats::top_n_kmers))
×
895
        .set_kmer_length(std::to_string(consensus_adapter_stats::kmer_length))
×
896
        .write(output);
×
897

898
      const auto kmers = std::max(top_kmers_1.size(), top_kmers_2.size());
×
899
      for (size_t i = 0; i < kmers; ++i) {
×
900
        html_consensus_adapter_kmer_row row;
×
901
        row.set_index(std::to_string(i + 1));
×
902

903
        if (top_kmers_1.size() > i) {
×
904
          const auto& kmer = top_kmers_1.at(i);
×
905

906
          row.set_kmer_1(kmer.first)
×
907
            .set_count_1(format_rough_number(kmer.second))
×
908
            .set_pct_1(format_percentage(kmer.second, adapter_1.total_kmers()));
×
909
        } else {
910
          row.set_kmer_1("").set_count_1("").set_pct_1("");
×
911
        }
912

913
        if (top_kmers_2.size() > i) {
×
914
          const auto& kmer = top_kmers_2.at(i);
×
915

916
          row.set_kmer_2(kmer.first)
×
917
            .set_count_2(format_rough_number(kmer.second))
×
918
            .set_pct_2(format_percentage(kmer.second, adapter_2.total_kmers()));
×
919
        } else {
920
          row.set_kmer_2("").set_count_2("").set_pct_2("");
×
921
        }
922

923
        row.write(output);
×
924
      }
925

926
      html_consensus_adapter_kmer_tail().write(output);
×
927
    }
928
  }
929
}
930

931
std::pair<std::string, std::string>
932
join_barcodes(const sample& s)
×
933
{
934
  string_vec mate_1;
×
935
  string_vec mate_2;
×
936

937
  for (const auto& barcode : s) {
×
938
    mate_1.emplace_back(barcode.barcode_1);
×
939
    mate_2.emplace_back(barcode.barcode_2);
×
940
  }
941

942
  return {
×
943
    join_text(mate_1, "<br/>"),
×
944
    join_text(mate_2, "<br/>"),
×
945
  };
946
}
947

948
void
949
write_html_demultiplexing_section(const userconfig& config,
×
950
                                  const statistics& stats,
951
                                  std::ostream& output)
952

953
{
954
  write_html_section_title("Demultiplexing", output);
×
955

956
  json_list data;
×
957

958
  const size_t input_reads = stats.input_1->number_of_input_reads() +
×
959
                             stats.input_2->number_of_input_reads();
×
960

961
  for (size_t i = 0; i < config.samples.size(); ++i) {
×
962
    auto m = data.dict();
×
963
    m->str("x", config.samples.at(i).name());
×
964

965
    if (input_reads) {
×
966
      m->f64("y", (100.0 * stats.demultiplexing->samples.at(i)) / input_reads);
×
967
    } else {
968
      m->null("y");
×
969
    }
970
  }
971

972
  html_plot_title()
×
973
    .set_href("demux-samples")
×
974
    .set_title("Samples identified")
×
975
    .write(output);
×
976
  html_bar_plot()
×
977
    .set_x_axis("Samples"_json)
×
978
    .set_y_axis("Percent"_json)
×
979
    .set_width(FIGURE_WIDTH)
×
980
    .set_values(data.to_string())
×
981
    .write(output);
×
982

983
  html_demultiplexing_head().write(output);
×
984

985
  {
×
986
    const size_t unidentified = stats.demultiplexing->unidentified;
×
987

988
    fastq_statistics total;
×
989
    total += *stats.demultiplexing->unidentified_stats_1;
×
990
    total += *stats.demultiplexing->unidentified_stats_2;
×
991

992
    const auto output_reads = total.length_dist().sum();
×
993
    const auto output_bp = total.nucleotides_pos().sum();
×
994

995
    html_demultiplexing_row()
×
996
      .set_n("")
×
997
      .set_barcode_1("")
×
998
      .set_barcode_2("")
×
999
      .set_name("<b>Unidentified</b>")
×
1000
      .set_pct(format_percentage(unidentified, input_reads, 2))
×
1001
      .set_reads(format_rough_number(output_reads))
×
1002
      .set_bp(format_rough_number(output_bp))
×
1003
      .set_length(mean_of_bp_counts(total.length_dist()))
×
1004
      .set_gc(format_percentage(total.nucleotides_gc_pos().sum(), output_bp))
×
1005
      .write(output);
×
1006
  }
1007

1008
  size_t sample_idx = 0;
×
1009
  for (const auto& sample : config.samples) {
×
1010
    const auto& sample_stats = *stats.trimming.at(sample_idx);
×
1011

1012
    fastq_statistics total;
×
1013

1014
    total += *sample_stats.read_1;
×
1015
    total += *sample_stats.read_2;
×
1016
    total += *sample_stats.merged;
×
1017
    total += *sample_stats.singleton;
×
1018
    // Not included in overview:
1019
    // total += *sample.discarded;
1020

1021
    const auto output_reads = total.length_dist().sum();
×
1022
    const auto output_bp = total.nucleotides_pos().sum();
×
1023
    const auto barcodes = join_barcodes(sample);
×
1024

1025
    html_demultiplexing_row()
×
1026
      .set_n(std::to_string(sample_idx + 1))
×
1027
      .set_barcode_1(barcodes.first)
×
1028
      .set_barcode_2(barcodes.second)
×
1029
      .set_name(sample.name())
×
1030
      .set_pct(format_percentage(
×
1031
        stats.demultiplexing->samples.at(sample_idx), input_reads, 2))
×
1032
      .set_reads(format_rough_number(output_reads))
×
1033
      .set_bp(format_rough_number(output_bp))
×
1034
      .set_length(mean_of_bp_counts(total.length_dist()))
×
1035
      .set_gc(format_percentage(total.nucleotides_gc_pos().sum(), output_bp))
×
1036
      .write(output);
×
1037
  }
1038

1039
  html_demultiplexing_tail().write(output);
×
1040
}
1041

1042
void
1043
write_html_output_section(const userconfig& config,
×
1044
                          const statistics& stats,
1045
                          std::ostream& output)
1046

1047
{
1048
  fastq_stats_vec stats_vec;
×
1049
  string_vec names;
×
1050

1051
  auto merged = std::make_shared<fastq_statistics>();
×
1052

1053
  {
×
1054
    auto output_1 = std::make_shared<fastq_statistics>();
×
1055
    auto output_2 = std::make_shared<fastq_statistics>();
×
1056
    auto singleton = std::make_shared<fastq_statistics>();
×
1057
    auto discarded = std::make_shared<fastq_statistics>();
×
1058

1059
    for (const auto& it : stats.trimming) {
×
1060
      *output_1 += *it->read_1;
×
1061
      *output_2 += *it->read_2;
×
1062
      *merged += *it->merged;
×
1063
      *singleton += *it->singleton;
×
1064
      *discarded += *it->discarded;
×
1065
    }
1066

1067
    stats_vec.push_back(output_1);
×
1068
    names.emplace_back("Output 1");
×
1069

1070
    if (config.paired_ended_mode) {
×
1071
      stats_vec.push_back(output_2);
×
1072
      names.emplace_back("Output 2");
×
1073

1074
      if (config.is_any_filtering_enabled()) {
×
1075
        stats_vec.push_back(singleton);
×
1076
        names.emplace_back("Singleton");
×
1077
      }
1078
    }
1079

1080
    if (config.is_any_filtering_enabled()) {
×
1081
      stats_vec.push_back(discarded);
×
1082
      names.emplace_back("Discarded");
×
1083
    }
1084
  }
1085

1086
  write_html_io_section(
×
1087
    config, output, "Output", std::move(stats_vec), std::move(names), merged);
×
1088
}
1089

1090
} // namespace
1091

1092
////////////////////////////////////////////////////////////////////////////////
1093

1094
bool
1095
write_html_report(const userconfig& config,
×
1096
                  const statistics& stats,
1097
                  const std::string& filename)
1098
{
1099
  if (filename == DEV_NULL) {
×
1100
    // User disabled the report
1101
    return true;
1102
  }
1103

1104
  std::ostringstream output;
×
1105

1106
  write_html_summary_section(config, stats, output);
×
1107

1108
  if (config.run_type != ar_command::demultiplex_only &&
×
1109
      config.run_type != ar_command::report_only) {
1110
    write_html_processing_section(config, stats, output);
×
1111
  }
1112

1113
  write_html_input_section(config, stats, output);
×
1114

1115
  if (config.paired_ended_mode || config.run_type == ar_command::report_only) {
×
1116
    write_html_analyses_section(config, stats, output);
×
1117
  }
1118

1119
  if (config.is_demultiplexing_enabled()) {
×
1120
    write_html_demultiplexing_section(config, stats, output);
×
1121
  }
1122

1123
  if (config.run_type != ar_command::report_only) {
×
1124
    write_html_output_section(config, stats, output);
×
1125
  }
1126

1127
  html_body_end().write(output);
×
1128

1129
  try {
×
1130
    managed_writer writer{ filename };
×
1131
    writer.write(output.str());
×
1132
    writer.close();
×
1133
  } catch (const std::ios_base::failure& error) {
×
1134
    log::error() << "Error writing JSON report to '" << filename << "':\n"
×
1135
                 << indent_lines(error.what());
×
1136
    return false;
×
1137
  }
×
1138

1139
  return true;
×
1140
}
1141

1142
} // namespace adapterremoval
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc