• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #117

25 May 2025 03:01PM UTC coverage: 66.932% (-0.07%) from 67.006%
#117

push

travis-ci

web-flow
iwyu and reduce build-time inter-dependencies (#144)

26 of 145 new or added lines in 20 files covered. (17.93%)

89 existing lines in 5 files now uncovered.

9738 of 14549 relevant lines covered (66.93%)

3041.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/reports_html.cpp
1
// SPDX-License-Identifier: GPL-3.0-or-later
2
// SPDX-FileCopyrightText: 2022 Mikkel Schubert <mikkelsch@gmail.com>
3
#include "adapter_id.hpp"            // for adapter_id_statistics
4
#include "commontypes.hpp"           // for barcode_orientation, DEV_NULL
5
#include "counts.hpp"                // for counts, indexed_count, counts_tmpl
6
#include "debug.hpp"                 // for AR_REQUIRE
7
#include "errors.hpp"                // for io_error
8
#include "fastq.hpp"                 // for ACGT, ACGT::values, fastq, ACGTN
9
#include "fastq_enc.hpp"             // for ACGT, ACGTN
10
#include "json.hpp"                  // for json_dict, json_list, json_ptr
11
#include "logging.hpp"               // for log_stream, error
12
#include "main.hpp"                  // for VERSION, NAME
13
#include "managed_io.hpp"            // for managed_io
14
#include "reports.hpp"               // for write_html_report
15
#include "reports_template_html.hpp" // for html_frequency_plot, html_demultiple...
16
#include "sequence.hpp"              // for dna_sequence
17
#include "sequence_sets.hpp"         // for adapter_set
18
#include "statistics.hpp"            // for fastq_stats_ptr, fastq_statistics
19
#include "strutils.hpp"              // for format_percentage, format_rough...
20
#include "userconfig.hpp"            // for userconfig, ar_command, DEV_NULL
21
#include <algorithm>                 // for max
22
#include <array>                     // for array
23
#include <cctype>                    // for toupper
24
#include <cmath>                     // for fmod
25
#include <cstdint>                   // for uint64_t
26
#include <cstring>                   // for size_t, strerror
27
#include <iomanip>                   // for operator<<, setprecision, setw
28
#include <memory>                    // for __shared_ptr_access, shared_ptr
29
#include <sstream>                   // for ostringstream
30
#include <string>                    // for string, operator==, to_string
31
#include <string_view>               // for string_view
32
#include <utility>                   // for pair
33
#include <vector>                    // for vector
34

35
namespace adapterremoval {
36

37
namespace {
38

39
using fastq_stats_vec = std::vector<fastq_stats_ptr>;
40
using template_ptr = std::unique_ptr<html_template>;
41

42
//! Size chosen to allow fitting two pages side-by-side on a 1920 width display
43
const char* const FIGURE_WIDTH = "736";
44
//! Per figure width for two-column facet figures; approximate
45
const char* const FACET_WIDTH_2 = "351";
46
//! Per figure width for one-column facet figures; approximate
47
const char* const FACET_WIDTH_1 = FIGURE_WIDTH;
48

49
////////////////////////////////////////////////////////////////////////////////
50

51
/** Escapes a string that needs to be embedded in a JS */
52
std::string
53
json_encode(const std::string& s)
×
54
{
55
  return json_token::from_str(s)->to_string();
×
56
}
57

58
/** JSON escaped string */
59
std::string
60
operator""_json(const char* s, size_t length)
×
61
{
62
  return json_encode(std::string(s, length));
×
63
}
64

65
std::string
66
runtime_to_str(double seconds)
×
67
{
68
  std::ostringstream ss;
×
69

70
  if (seconds >= 3600.0) {
×
71
    ss << static_cast<size_t>(seconds / 3600.0) << " "
×
72
       << (seconds >= 7200.0 ? "hours, " : "hour, ") << std::setw(2);
×
73
  }
74

75
  if (seconds >= 60.0) {
×
76
    auto minutes = static_cast<size_t>(std::fmod(seconds, 3600.0) / 60.0);
×
77
    ss << minutes << " "
×
78
       << ((!minutes || minutes >= 120) ? "minutes" : "minute") << ", and "
×
79
       << std::setw(4);
×
80
  }
81

82
  ss << std::fixed << std::setprecision(1) << std::fmod(seconds, 60.0)
×
83
     << " seconds";
×
84

85
  return ss.str();
×
86
}
87

88
std::string
89
mean_of_bp_counts(const counts& count)
×
90
{
91
  auto reads = count.sum();
×
92
  auto bases = count.product();
×
93

94
  if (!reads) {
×
95
    return "NA";
×
96
  }
97

98
  if (bases % reads == 0) {
×
99
    return std::to_string(bases / reads) + " bp";
×
100
  }
101

102
  std::ostringstream ss;
×
103
  ss << std::fixed << std::setprecision(1)
×
104
     << (bases / static_cast<double>(reads)) << " bp";
×
105

106
  return ss.str();
×
107
}
108

109
/**
110
 * VEGA-lite will omit plots if there are no values; this function therefore
111
 * ensures that at least one value is written for a given measurement.
112
 */
113
template<typename T>
114
counts_tmpl<T>
115
require_values(counts_tmpl<T> r, T fallback = T())
×
116
{
117
  if (r.size()) {
×
118
    return r;
×
119
  }
120

121
  return counts_tmpl<T>({ fallback });
×
122
}
123

124
std::string
125
format_average_bases(const reads_and_bases& counts)
×
126
{
127
  const auto reads = counts.reads();
×
128

129
  if (reads) {
×
130
    return format_fraction(counts.bases(), reads, 1) + " bp";
×
131
  } else {
132
    return "NA";
×
133
  }
134
}
135

136
std::string
137
orientation_to_label(const sample_sequences& it)
×
138
{
139
  switch (it.orientation) {
×
140
    case barcode_orientation::unspecified:
×
141
      return {};
×
142
    case barcode_orientation::forward:
×
143
      return "+";
×
144
    case barcode_orientation::reverse:
×
145
      return "-";
×
146
    default:
×
147
      AR_FAIL("invalid barcode orientation");
×
148
  }
149
}
150

151
////////////////////////////////////////////////////////////////////////////////
152

153
class io_summary_writer
154
{
155
public:
156
  enum class io
157
  {
158
    input,
159
    output
160
  };
161

162
  io_summary_writer(std::ostream& output, const io type)
×
163
    : m_output(output)
×
164
    , m_type(type)
×
165

166
  {
167
  }
168

169
  void write_head(const std::string& title, const std::string& href)
×
170
  {
171
    html_summary_io_head().set_title(title).set_href(href).write(m_output);
×
172
  }
173

174
  void write_row(const std::string& title, const fastq_statistics& stats)
×
175
  {
176
    const auto n_reads = (m_type == io::input) ? stats.number_of_input_reads()
×
177
                                               : stats.number_of_output_reads();
×
178
    const auto total = stats.quality_dist().sum();
×
179

180
    html_summary_io_row()
×
181
      .set_name(title)
×
182
      .set_n_reads(format_rough_number(n_reads))
×
183
      .set_n_bases(format_rough_number(stats.length_dist().product()))
×
184
      .set_lengths(mean_of_bp_counts(stats.length_dist()))
×
185
      .set_q30(format_percentage(stats.quality_dist().sum(30), total))
×
186
      .set_q20(format_percentage(stats.quality_dist().sum(20), total))
×
187
      .set_ns(format_percentage(stats.nucleotides_pos('N').sum(), total))
×
188
      .set_gc(format_percentage(stats.nucleotides_gc_pos().sum(), total))
×
189
      .write(m_output);
×
190
  }
191

192
  void write_tail() { html_summary_io_tail().write(m_output); }
×
193

194
private:
195
  std::ostream& m_output;
196
  io m_type;
197
};
198

199
std::string
200
build_base_qualities(const fastq_stats_vec& reads, const string_vec& names)
×
201
{
202
  json_list qualities;
×
203

204
  for (size_t i = 0; i < reads.size(); ++i) {
×
205
    const auto& stats = *reads.at(i);
×
206

207
    auto total_quality = stats.qualities_pos();
×
208
    auto total_bases = stats.nucleotides_pos();
×
209

210
    for (const auto nucleotide : ACGT::values) {
×
211
      const auto nucleotides = stats.nucleotides_pos(nucleotide);
×
212
      const auto quality = stats.qualities_pos(nucleotide);
×
213

214
      auto dict = qualities.dict();
×
215
      dict->str("read", names.at(i));
×
216
      dict->i64("offset", 1);
×
217
      dict->str("group", std::string(1, ::toupper(nucleotide)));
×
218
      dict->f64_vec("y", quality / nucleotides);
×
219
    }
220

221
    auto dict = qualities.dict();
×
222
    dict->str("read", names.at(i));
×
223
    dict->i64("offset", 1);
×
224
    dict->str("group", "Mean");
×
225

226
    // Ensure that values get written, to prevent the plot being omitted
227
    dict->f64_vec("y", require_values(total_quality / total_bases));
×
228
  }
229

230
  return qualities.to_string();
×
231
}
232

233
std::string
234
build_quality_distribution(const fastq_stats_vec& reads,
×
235
                           const string_vec& names)
236
{
237
  json_list data;
×
238

239
  for (size_t i = 0; i < reads.size(); ++i) {
×
240
    const auto& stats = reads.at(i);
×
241
    auto count = stats->quality_dist().trim();
×
242
    // A max that should give a uniform look to most data
243
    count.resize_up_to(44);
×
244

245
    const auto m = data.dict();
×
246
    m->str("group", names.at(i));
×
247
    m->i64("offset", 0);
×
248
    m->i64_vec("y", count);
×
249
  }
250

251
  return data.to_string();
×
252
}
253

254
std::string
255
build_base_content(const fastq_stats_vec& reads, const string_vec& names)
×
256
{
257
  json_list content;
×
258

259
  for (size_t i = 0; i < reads.size(); ++i) {
×
260
    const auto& stats = *reads.at(i);
×
261

262
    auto total_bases = stats.nucleotides_pos();
×
263

264
    for (const auto nucleotide : ACGTN::values) {
×
265
      const auto bases = stats.nucleotides_pos(nucleotide);
×
266

267
      const auto dict = content.dict();
×
268
      dict->str("read", names.at(i));
×
269
      dict->i64("offset", 1);
×
270
      dict->str("group", std::string(1, nucleotide));
×
271

272
      // Ensure that values get written, to prevent the plot being omitted
273
      dict->f64_vec("y", require_values(bases / total_bases));
×
274
    }
275

276
    {
×
277
      const auto gc_content = stats.nucleotides_gc_pos();
×
278
      auto dict = content.dict();
×
279
      dict->str("read", names.at(i));
×
280
      dict->i64("offset", 1);
×
281
      dict->str("group", "GC");
×
282

283
      // Ensure that values get written, to prevent the plot being omitted
284
      dict->f64_vec("y", require_values(gc_content / total_bases));
×
285
    }
286
  }
287

288
  return content.to_string();
×
289
}
290

291
////////////////////////////////////////////////////////////////////////////////
292
// Main sections
293

294
void
295
write_html_sampling_note(const userconfig& config,
×
296
                         const std::string& label,
297
                         const fastq_statistics& stats,
298
                         std::ostream& output)
299
{
300
  if (config.report_sample_rate < 1.0) {
×
301
    html_sampling_note()
×
302
      .set_label(label)
×
303
      .set_reads(format_rough_number((stats.number_of_sampled_reads())))
×
304
      .set_pct(format_percentage(stats.number_of_sampled_reads(),
×
305
                                 stats.number_of_input_reads()))
×
306
      .write(output);
×
307
  }
308
}
309

310
void
311
write_html_summary_section(const userconfig& config,
×
312
                           const statistics& stats,
313
                           std::ostream& output)
314
{
315
  html_head().set_title(config.report_title).write(output);
×
316

317
  html_body_start().set_title(config.report_title).write(output);
×
318

319
  // Basic information about the executable / call
320
  {
×
321
    html_summary()
×
322
      .set_date_and_time(userconfig::start_time)
×
323
      .set_version(VERSION)
×
324
      .set_command(shell_escape_command(config.args))
×
325
      .set_runtime(runtime_to_str(config.runtime()))
×
326
      .write(output);
×
327
  }
328

329
  fastq_statistics output_1;
×
330
  fastq_statistics output_2;
×
331
  fastq_statistics merged;
×
332
  fastq_statistics singleton;
×
333
  fastq_statistics discarded;
×
334

335
  for (const auto& it : stats.trimming) {
×
336
    output_1 += *it->read_1;
×
337
    output_2 += *it->read_2;
×
338
    merged += *it->merged;
×
339
    singleton += *it->singleton;
×
340
    discarded += *it->discarded;
×
341
  }
342

343
  if (config.paired_ended_mode) {
×
344
    // Summary statistics for input files
345
    {
×
346
      fastq_statistics totals;
×
347
      totals += *stats.input_1;
×
348
      totals += *stats.input_2;
×
349

350
      io_summary_writer summary(output, io_summary_writer::io::input);
×
351
      summary.write_head("Input", "summary-input");
×
352
      if (config.paired_ended_mode) {
×
353
        summary.write_row("Summary", totals);
×
354
        summary.write_row("File 1", *stats.input_1);
×
355
        summary.write_row("File 2", *stats.input_2);
×
356
      }
357
      summary.write_tail();
×
358

359
      write_html_sampling_note(config, "input", totals, output);
×
360
    }
361

362
    // Summary statistics for output files
363
    if (config.run_type != ar_command::report_only) {
×
364
      fastq_statistics totals;
×
365
      totals += output_1;
×
366
      totals += output_2;
×
367
      totals += merged;
×
368
      totals += singleton;
×
369
      // discarded reads not counted in the output
370
      // totals += discarded;
371

372
      io_summary_writer summary{ output, io_summary_writer::io::output };
×
373
      summary.write_head("Output", "summary-output");
×
374
      summary.write_row("Passed*", totals);
×
375
      if (config.paired_ended_mode) {
×
376
        summary.write_row("File 1", output_1);
×
377
        summary.write_row("File 2", output_2);
×
378

379
        if (config.is_read_merging_enabled()) {
×
380
          summary.write_row("Merged", merged);
×
381
        }
382

383
        if (config.is_any_filtering_enabled()) {
×
384
          summary.write_row("Singleton", singleton);
×
385
        }
386
      }
387

388
      if (config.is_any_filtering_enabled()) {
×
389
        summary.write_row("Discarded*", discarded);
×
390
      }
391
      summary.write_tail();
×
392

393
      write_html_sampling_note(config, "output", totals, output);
×
394

395
      // Note regarding passed / discarded reads
396
      html_output_footnote()
×
397
        .set_symbol("*")
×
398
        .set_html("The <b>Passed</b> column includes all read types except "
×
399
                  "for <b>Discarded</b> reads.")
400
        .write(output);
×
401
    }
402
  } else if (config.run_type == ar_command::report_only) {
×
403
    io_summary_writer summary{ output, io_summary_writer::io::input };
×
404
    summary.write_head("Input summary", "summary-input");
×
405
    summary.write_row("Input", *stats.input_1);
×
406
    summary.write_tail();
×
407

408
    write_html_sampling_note(config, "input", *stats.input_1, output);
×
409
  }
410

411
  else {
412
    io_summary_writer summary{ output, io_summary_writer::io::input };
×
413
    summary.write_head("Input/Output summary", "summary-input-output");
×
414
    summary.write_row("Input", *stats.input_1);
×
415
    summary.write_row("Output", output_1);
×
416
    if (config.is_any_filtering_enabled()) {
×
417
      summary.write_row("Discarded*", discarded);
×
418
    }
419
    summary.write_tail();
×
420

421
    fastq_statistics totals;
×
422
    totals += *stats.input_1;
×
423
    totals += output_1;
×
424

425
    write_html_sampling_note(config, "input/output", totals, output);
×
426

427
    if (config.is_any_filtering_enabled()) {
×
428
      // Note regarding discarded reads in output
429
      html_output_footnote()
×
430
        .set_symbol("*")
×
431
        .set_html("<b>Discarded</b> reads are not included in the "
×
432
                  "<b>Output</b> column.")
433
        .write(output);
×
434
    }
435
  }
436
}
437

438
//! Trimming statistics
439
struct trimming_stats
440
{
441
  size_t id;
442
  //! Processing stage relative to adapter trimming (pre, X, post)
443
  std::string stage;
444
  //! Row label 1 (step)
445
  std::string label_1;
446
  //! Row label 1 (sub-step)
447
  std::string label_2;
448
  //! Whether or not this step is enabled by command-line options
449
  bool enabled;
450
  //! Number of reads/bases trimmed/filtered
451
  reads_and_bases count;
452
};
453

454
void
455
write_html_trimming_stats(std::ostream& output,
×
456
                          const std::vector<trimming_stats>& stats,
457
                          const reads_and_bases& totals)
458
{
459
  size_t n_processing_steps = 0;
×
460
  size_t n_processing_steps_on = 0;
×
461
  size_t n_filtering_steps = 0;
×
462
  size_t n_filtering_steps_on = 0;
×
463

464
  size_t last_id = -1;
×
465
  size_t last_enabled = -1;
×
466
  for (const auto& it : stats) {
×
467
    if (it.id != last_id) {
×
468
      if (it.stage == "Processing") {
×
469
        n_processing_steps++;
×
470
      } else if (it.stage == "Filtering") {
×
471
        n_filtering_steps++;
×
472
      }
473

474
      last_id = it.id;
×
475
    }
476

477
    if (it.enabled && it.id != last_enabled) {
×
478
      if (it.stage == "Processing") {
×
479
        n_processing_steps_on++;
×
480
      } else if (it.stage == "Filtering") {
×
481
        n_filtering_steps_on++;
×
482
      }
483

484
      last_enabled = it.id;
×
485
    }
486
  }
487

488
  html_summary_trimming_head().write(output);
×
489

490
  std::string previous_stage;
×
491
  std::string previous_label_1;
×
492

493
  for (const auto& it : stats) {
×
494
    if (it.enabled) {
×
495
      const auto label_1 = it.label_1 == previous_label_1 ? "" : it.label_1;
×
496
      const auto stage = it.stage == previous_stage ? "" : it.stage;
×
497

498
      previous_stage = it.stage;
×
499
      previous_label_1 = it.label_1;
×
500

501
      html_summary_trimming_row()
×
502
        .set_stage(stage)
×
503
        .set_label_1(label_1)
×
504
        .set_label_2(it.label_2)
×
505
        .set_reads(format_rough_number(it.count.reads()))
×
506
        .set_pct_reads(format_percentage(it.count.reads(), totals.reads()))
×
507
        .set_bases(format_rough_number(it.count.bases()))
×
508
        .set_pct_bases(format_percentage(it.count.bases(), totals.bases()))
×
509
        .set_avg_bases(format_average_bases(it.count))
×
510
        .write(output);
×
511
    }
512
  }
513

514
  html_summary_trimming_tail()
×
515
    .set_n_enabled_filt(std::to_string(n_filtering_steps_on))
×
516
    .set_n_total_filt(std::to_string(n_filtering_steps))
×
517
    .set_n_enabled_proc(std::to_string(n_processing_steps_on))
×
518
    .set_n_total_proc(std::to_string(n_processing_steps))
×
519
    .write(output);
×
520
}
521

522
//! Filtering statistics
523
struct filtering_stats
524
{
525
  //! Filtering step
526
  std::string label;
527
  //! Whether or not this step is enabled by command-line options
528
  bool enabled;
529
  //! Number of reads/bases trimmed/filtered
530
  reads_and_bases count;
531
};
532

533
reads_and_bases
534
summarize_input(const fastq_stats_ptr& ptr)
×
535
{
536
  const auto n_bases = ptr->length_dist().product();
×
537
  AR_REQUIRE(n_bases >= 0);
×
538

539
  return reads_and_bases{ ptr->number_of_input_reads(),
×
540
                          static_cast<uint64_t>(n_bases) };
541
}
542

543
void
544
build_polyx_trimming_rows(std::vector<trimming_stats>& out,
×
545
                          const std::string& polyx_nucleotides,
546
                          const indexed_count<ACGT>& reads,
547
                          const indexed_count<ACGT>& bases,
548
                          const size_t id)
549
{
550
  for (const auto nucleotide : ACGT::values) {
×
551
    out.push_back(
×
552
      { id,
553
        "Processing",
554
        "Poly-X tails",
555
        std::string(1, nucleotide),
556
        polyx_nucleotides.find(nucleotide) != std::string::npos,
×
557
        reads_and_bases(reads.get(nucleotide), bases.get(nucleotide)) });
×
558
  }
559

560
  out.push_back({ id,
×
561
                  "Processing",
562
                  "Poly-X tails",
563
                  "*",
564
                  polyx_nucleotides.size() > 1,
×
565
                  reads_and_bases(reads.sum(), bases.sum()) });
×
566
}
567

568
void
569
write_html_processing_section(const userconfig& config,
×
570
                              const statistics& stats,
571
                              std::ostream& output)
572
{
573
  trimming_statistics totals;
×
574
  for (const auto& it : stats.trimming) {
×
575
    totals += *it;
×
576
  }
577

578
  uint64_t adapter_reads = 0;
×
579
  uint64_t adapter_bases = 0;
×
580

NEW
581
  for (size_t i = 0; i < config.samples->adapters().size(); ++i) {
×
582
    adapter_reads += totals.adapter_trimmed_reads.get(i);
×
583
    adapter_bases += totals.adapter_trimmed_bases.get(i);
×
584
  }
585

586
  const auto total_input =
×
587
    summarize_input(stats.input_1) + summarize_input(stats.input_2);
×
588

589
  reads_and_bases total_output;
×
590
  for (const auto& it : stats.trimming) {
×
591
    total_output += summarize_input(it->read_1);
×
592
    total_output += summarize_input(it->read_2);
×
593
    total_output += summarize_input(it->singleton);
×
594
    total_output += summarize_input(it->merged);
×
595
  }
596

597
  // Trimming steps prior to adapter trimming
598
  size_t step_id = 0;
×
599
  std::vector<trimming_stats> trimming = {
×
600
    { step_id++, "Input", "Raw reads", "-", true, total_input },
×
601
    { step_id++,
×
602
      "Processing",
603
      "Terminal bases",
604
      "-",
605
      config.is_terminal_base_pre_trimming_enabled(),
×
606
      totals.terminal_pre_trimmed },
607
  };
608

609
  build_polyx_trimming_rows(trimming,
×
610
                            config.pre_trim_poly_x,
×
611
                            totals.poly_x_pre_trimmed_reads,
612
                            totals.poly_x_pre_trimmed_bases,
613
                            step_id++);
614

615
  trimming.push_back({ step_id++,
×
616
                       "Processing",
617
                       "Adapters",
618
                       "-",
619
                       config.is_adapter_trimming_enabled(),
×
620
                       reads_and_bases(adapter_reads, adapter_bases) });
621

622
  trimming.push_back({ step_id++,
×
623
                       "Processing",
624
                       "Merging",
625
                       "-",
626
                       config.is_read_merging_enabled(),
×
627
                       totals.reads_merged });
628

629
  trimming.push_back({ step_id++,
×
630
                       "Processing",
631
                       "Terminal bases",
632
                       "-",
633
                       config.is_terminal_base_post_trimming_enabled(),
×
634
                       totals.terminal_post_trimmed });
635

636
  build_polyx_trimming_rows(trimming,
×
637
                            config.post_trim_poly_x,
×
638
                            totals.poly_x_post_trimmed_reads,
639
                            totals.poly_x_post_trimmed_bases,
640
                            step_id++);
641

642
  trimming.push_back({ step_id++,
×
643
                       "Processing",
644
                       "Low quality bases",
645
                       "-",
646
                       config.is_low_quality_trimming_enabled(),
×
647
                       totals.low_quality_trimmed });
648

649
  trimming.push_back({ step_id++,
×
650
                       "Filtering",
651
                       "Short reads",
652
                       "-",
653
                       config.is_short_read_filtering_enabled(),
×
654
                       totals.filtered_min_length });
655

656
  trimming.push_back({ step_id++,
×
657
                       "Filtering",
658
                       "Long reads",
659
                       "-",
660
                       config.is_long_read_filtering_enabled(),
×
661
                       totals.filtered_max_length });
662
  trimming.push_back({ step_id++,
×
663
                       "Filtering",
664
                       "Ambiguous bases",
665
                       "-",
666
                       config.is_ambiguous_base_filtering_enabled(),
×
667
                       totals.filtered_ambiguous });
668
  trimming.push_back({ step_id++,
×
669
                       "Filtering",
670
                       "Mean quality",
671
                       "-",
672
                       config.is_mean_quality_filtering_enabled(),
×
673
                       totals.filtered_mean_quality });
674
  trimming.push_back({ step_id++,
×
675
                       "Filtering",
676
                       "Low complexity reads",
677
                       "-",
678
                       config.is_low_complexity_filtering_enabled(),
×
679
                       totals.filtered_low_complexity });
680

681
  trimming.push_back(
×
682
    { step_id++, "Output", "Filtered reads", "-", true, total_output });
×
683

684
  write_html_trimming_stats(output, trimming, total_input);
×
685
}
686

687
void
688
write_html_section_title(const std::string& title, std::ostream& output)
×
689
{
690
  html_h2_tag().set_title(title).set_href(to_lower(title)).write(output);
×
691
}
692

693
void
694
write_html_io_section(const userconfig& config,
×
695
                      std::ostream& output,
696
                      const std::string& title,
697
                      fastq_stats_vec statistics,
698
                      string_vec names,
699
                      const fastq_stats_ptr& merged = fastq_stats_ptr())
700
{
701
  AR_REQUIRE(statistics.size() == names.size());
×
702

703
  write_html_section_title(title, output);
×
704

705
  const char* dynamic_width =
×
706
    config.paired_ended_mode || merged ? FACET_WIDTH_2 : FACET_WIDTH_1;
×
707

708
  html_plot_title()
×
709
    .set_href(to_lower(title) + "-position-qualities")
×
710
    .set_title("Position quality distribution")
×
711
    .write(output);
×
712
  html_facet_line_plot()
×
713
    .set_x_axis(config.is_read_merging_enabled() && merged ? "null"
×
714
                                                           : "Position"_json)
715
    .set_y_axis("Phred score"_json)
×
716
    .set_width(dynamic_width)
×
717
    .set_values(build_base_qualities(statistics, names))
×
718
    .write(output);
×
719

720
  if (config.is_read_merging_enabled() && merged) {
×
721
    html_facet_line_plot()
×
722
      .set_x_axis("Position"_json)
×
723
      .set_y_axis("Phred score"_json)
×
724
      .set_width(FIGURE_WIDTH)
×
725
      .set_values(build_base_qualities({ merged }, { "Merged" }))
×
726
      .write(output);
×
727
  }
728

729
  html_plot_title()
×
730
    .set_href(to_lower(title) + "-nucleotide-content")
×
731
    .set_title("Nucleotide content")
×
732
    .write(output);
×
733
  html_facet_line_plot()
×
734
    .set_x_axis(config.is_read_merging_enabled() && merged ? "null"
×
735
                                                           : "Position"_json)
736
    .set_y_axis("Frequency"_json)
×
737
    .set_width(dynamic_width)
×
738
    .set_values(build_base_content(statistics, names))
×
739
    .write(output);
×
740

741
  if (config.is_read_merging_enabled() && merged) {
×
742
    html_facet_line_plot()
×
743
      .set_x_axis("Position"_json)
×
744
      .set_y_axis("Frequency"_json)
×
745
      .set_width(FIGURE_WIDTH)
×
746
      .set_values(build_base_content({ merged }, { "Merged" }))
×
747
      .write(output);
×
748

749
    // Subsequent plots should include merged reads
750
    names.push_back("Merged");
×
751
    statistics.push_back(merged);
×
752
  }
753

754
  html_plot_title()
×
755
    .set_href(to_lower(title) + "-quality-scores")
×
756
    .set_title("Quality score distribution")
×
757
    .write(output);
×
758
  html_frequency_plot()
×
759
    .set_x_axis("Phred score"_json)
×
760
    .set_y_axis("Frequency"_json)
×
761
    .set_width(FIGURE_WIDTH)
×
762
    .set_values(build_quality_distribution(statistics, names))
×
763
    .write(output);
×
764

765
  {
×
766
    json_list data;
×
767

768
    for (size_t i = 0; i < statistics.size(); ++i) {
×
769
      const auto m = data.dict();
×
770
      m->str("group", names.at(i));
×
771
      m->i64("offset", 0);
×
772
      m->f64_vec("y", statistics.at(i)->gc_content());
×
773
    }
774

775
    html_plot_title()
×
776
      .set_href(to_lower(title) + "-gc-content")
×
777
      .set_title("GC Content")
×
778
      .write(output);
×
779
    html_frequency_plot()
×
780
      .set_x_axis("%GC"_json)
×
781
      .set_y_axis("Frequency"_json)
×
782
      .set_width(FIGURE_WIDTH)
×
783
      .set_values(data.to_string())
×
784
      .write(output);
×
785
  }
786
}
787

788
void
789
write_html_input_section(const userconfig& config,
×
790
                         const statistics& stats,
791
                         std::ostream& output)
792
{
793
  fastq_stats_vec stats_vec = { stats.input_1 };
×
794
  string_vec names = { "File 1" };
×
795

796
  if (config.paired_ended_mode) {
×
797
    stats_vec.push_back(stats.input_2);
×
798
    names.emplace_back("File 2");
×
799
  }
800

801
  write_html_io_section(config,
×
802
                        output,
803
                        "Input",
804
                        std::move(stats_vec),
805
                        std::move(names));
806
}
807

808
void
809
write_html_analyses_section(const userconfig& config,
×
810
                            const statistics& stats,
811
                            std::ostream& output)
812

813
{
814
  write_html_section_title("Analyses", output);
×
815

816
  // Insert size distribution
817
  if (config.paired_ended_mode) {
×
818
    counts insert_sizes;
×
819
    for (const auto& it : stats.trimming) {
×
820
      insert_sizes += it->insert_sizes;
×
821
    }
822

823
    json_list samples;
×
824
    const auto sample = samples.dict();
×
825
    sample->str("group", "insert_sizes");
×
826
    sample->i64("offset", 0);
×
827
    sample->i64_vec("y", insert_sizes);
×
828

829
    // FIXME: Specify "identified reads" when in demultiplexing mode and
830
    // correct format_percentage to merged / n_identified.
831
    std::ostringstream ss;
×
832
    ss << "Insert sizes inferred for "
×
833
       << format_percentage(insert_sizes.sum(),
×
834
                            stats.input_1->number_of_input_reads())
×
835
       << " of reads";
×
836

837
    html_plot_title()
×
838
      .set_href("analyses-insert-sizes")
×
839
      .set_title("Insert-size distribution")
×
840
      .write(output);
×
841
    html_plot_sub_title().set_sub_title(ss.str()).write(output);
×
842
    html_frequency_plot()
×
843
      .set_x_axis("Insert size"_json)
×
844
      .set_y_axis("Frequency"_json)
×
845
      .set_legend("null")
×
846
      .set_width(FIGURE_WIDTH)
×
847
      .set_values(samples.to_string())
×
848
      .write(output);
×
849

850
    if (config.run_type == ar_command::report_only) {
×
851
      html_output_note()
×
852
        .set_text(
×
853
          "Insert size distribution inferred using adapter-free alignments.")
854
        .write(output);
×
855
    }
856
  }
857

858
  if (config.report_duplication) {
×
859
    AR_REQUIRE(stats.duplication_1 && stats.duplication_2);
×
860
    const auto dupes_1 = stats.duplication_1->summarize();
×
861
    const auto dupes_2 = stats.duplication_2->summarize();
×
862
    const auto mean_uniq_frac = (dupes_1.unique_frac + dupes_2.unique_frac) / 2;
×
863

864
    const auto to_percent = [](double value) {
×
865
      std::ostringstream os;
×
866
      os << std::fixed << std::setprecision(1) << (value * 100.0) << " %";
×
867
      return os.str();
×
868
    };
869

870
    html_duplication_head().write(output);
×
871
    if (config.paired_ended_mode) {
×
872
      html_duplication_body_pe()
×
873
        .set_pct_unique(to_percent(mean_uniq_frac))
×
874
        .set_pct_unique_1(to_percent(dupes_1.unique_frac))
×
875
        .set_pct_unique_2(to_percent(dupes_2.unique_frac))
×
876
        .write(output);
×
877
    } else {
878
      html_duplication_body_se()
×
879
        .set_pct_unique(to_percent(dupes_1.unique_frac))
×
880
        .write(output);
×
881
    }
882

883
    const auto add_line = [](json_list& list,
×
884
                             std::string_view read,
885
                             std::string_view group,
886
                             const std::vector<std::string>& labels,
887
                             const rates& values) {
888
      AR_REQUIRE(labels.size() == values.size());
×
889
      for (size_t i = 0; i < labels.size(); ++i) {
×
890
        auto dict = list.dict();
×
891
        dict->str("read", read);
×
892
        dict->str("group", group);
×
893
        dict->str("x", labels.at(i));
×
894
        dict->f64("y", values.get(i));
×
895
      }
896
    };
897

898
    json_list data;
×
899
    const auto add_lines = [add_line, &data](const decltype(dupes_1)& s,
×
900
                                             std::string_view label) {
901
      add_line(data, label, "All", s.labels, s.total_sequences);
×
902
      add_line(data, label, "Unique", s.labels, s.unique_sequences);
×
903
    };
904

905
    add_lines(dupes_1, "File 1");
×
906
    if (config.paired_ended_mode) {
×
907
      add_lines(dupes_2, "File 2");
×
908
    }
909

910
    html_duplication_plot()
×
911
      .set_width(config.paired_ended_mode ? FACET_WIDTH_2 : FACET_WIDTH_1)
×
912
      .set_values(data.to_string())
×
913
      .write(output);
×
914
  }
915

916
  // Consensus adapter sequence inference
917
  if (config.paired_ended_mode && config.run_type == ar_command::report_only) {
×
918
    AR_REQUIRE(stats.adapter_id);
×
919

920
    const auto adapter_1 = stats.adapter_id->adapter1.summarize();
×
921
    const auto adapter_2 = stats.adapter_id->adapter2.summarize();
×
922

923
    // Consensus adapter sequences
924
    {
×
925
      const auto reference_adapters =
×
NEW
926
        config.samples->adapters().to_read_orientation().front();
×
927
      std::string reference_adapter_1{ reference_adapters.first };
×
928
      std::string reference_adapter_2{ reference_adapters.second };
×
929

930
      html_consensus_adapter_head()
×
931
        .set_overlapping_pairs(
×
932
          format_rough_number(stats.adapter_id->aligned_pairs))
×
933
        .set_pairs_with_adapters(
×
934
          format_rough_number(stats.adapter_id->pairs_with_adapters))
×
935
        .write(output);
×
936

937
      html_consensus_adapter_table()
×
938
        .set_name_1("--adapter1")
×
939
        .set_reference_1(reference_adapter_1)
×
940
        .set_alignment_1(adapter_1.compare_with(reference_adapter_1))
×
941
        .set_consensus_1(adapter_1.adapter().sequence())
×
942
        .set_qualities_1(adapter_1.adapter().qualities())
×
943
        .set_name_2("--adapter2")
×
944
        .set_reference_2(reference_adapter_2)
×
945
        .set_alignment_2(adapter_2.compare_with(reference_adapter_2))
×
946
        .set_consensus_2(adapter_2.adapter().sequence())
×
947
        .set_qualities_2(adapter_2.adapter().qualities())
×
948
        .write(output);
×
949
    }
950

951
    // Top N most common 5' kmers in adapter fragments
952
    {
×
953
      const auto& top_kmers_1 = adapter_1.top_kmers();
×
954
      const auto& top_kmers_2 = adapter_2.top_kmers();
×
955

956
      html_consensus_adapter_kmer_head()
×
957
        .set_n_kmers(std::to_string(consensus_adapter_stats::top_n_kmers))
×
958
        .set_kmer_length(std::to_string(consensus_adapter_stats::kmer_length))
×
959
        .write(output);
×
960

961
      const auto kmers = std::max(top_kmers_1.size(), top_kmers_2.size());
×
962
      for (size_t i = 0; i < kmers; ++i) {
×
963
        html_consensus_adapter_kmer_row row;
×
964
        row.set_index(std::to_string(i + 1));
×
965

966
        if (top_kmers_1.size() > i) {
×
967
          const auto& kmer = top_kmers_1.at(i);
×
968

969
          row.set_kmer_1(kmer.first)
×
970
            .set_count_1(format_rough_number(kmer.second))
×
971
            .set_pct_1(format_percentage(kmer.second, adapter_1.total_kmers()));
×
972
        }
973

974
        if (top_kmers_2.size() > i) {
×
975
          const auto& kmer = top_kmers_2.at(i);
×
976

977
          row.set_kmer_2(kmer.first)
×
978
            .set_count_2(format_rough_number(kmer.second))
×
979
            .set_pct_2(format_percentage(kmer.second, adapter_2.total_kmers()));
×
980
        }
981

982
        row.write(output);
×
983
      }
984

985
      html_consensus_adapter_kmer_tail().write(output);
×
986
    }
987
  }
988
}
989

990
void
991
write_html_demultiplexing_barplot(const userconfig& config,
×
992
                                  const statistics& stats,
993
                                  std::ostream& output)
994
{
995
  json_list data;
×
996

997
  const size_t input_reads = stats.input_1->number_of_input_reads() +
×
998
                             stats.input_2->number_of_input_reads();
×
999

NEW
1000
  for (size_t i = 0; i < config.samples->size(); ++i) {
×
NEW
1001
    const auto& sample = config.samples->at(i);
×
1002

1003
    for (size_t j = 0; j < sample.size(); ++j) {
×
1004
      auto count = stats.demultiplexing->samples.at(i).get(j);
×
1005

1006
      const auto& sequences = sample.at(j);
×
1007
      std::string key{ sequences.barcode_1 };
×
1008
      if (!sequences.barcode_2.empty()) {
×
1009
        key.push_back('-');
×
1010
        key.append(sequences.barcode_2);
×
1011
      }
1012

1013
      auto m = data.dict();
×
1014
      m->i64("n", j + 1);
×
1015
      m->str("barcodes", key);
×
1016

1017
      if (sequences.orientation != barcode_orientation::unspecified) {
×
1018
        m->str("orientation", orientation_to_label(sequences));
×
1019
      }
1020

1021
      m->str("sample", sample.name());
×
1022

1023
      if (input_reads) {
×
1024
        m->f64("pct", (100.0 * count) / input_reads);
×
1025
      } else {
1026
        m->null("pct");
×
1027
      }
1028
    }
1029
  }
1030

1031
  html_plot_title()
×
1032
    .set_href("demux-samples")
×
1033
    .set_title("Samples identified")
×
1034
    .write(output);
×
1035
  html_bar_plot()
×
1036
    .set_x_axis("Samples"_json)
×
1037
    .set_y_axis("Percent"_json)
×
1038
    .set_width(FIGURE_WIDTH)
×
1039
    .set_values(data.to_string())
×
1040
    .write(output);
×
1041
}
1042

1043
void
1044
write_html_demultiplexing_table(const userconfig& config,
×
1045
                                const statistics& stats,
1046
                                std::ostream& output,
1047
                                const bool multiple_barcodes,
1048
                                const bool mixed_orientation)
1049
{
1050
  const size_t input_reads = stats.input_1->number_of_input_reads() +
×
1051
                             stats.input_2->number_of_input_reads();
×
1052

1053
  html_demultiplexing_table_head()
×
1054
    .set_orientation(mixed_orientation ? "<th></th>" : "")
×
1055
    .write(output);
×
1056

1057
  {
×
1058
    const size_t unidentified = stats.demultiplexing->unidentified;
×
1059

1060
    fastq_statistics total;
×
1061
    total += *stats.demultiplexing->unidentified_stats_1;
×
1062
    total += *stats.demultiplexing->unidentified_stats_2;
×
1063

1064
    const auto output_reads = total.length_dist().sum();
×
1065
    const auto output_bp = total.nucleotides_pos().sum();
×
1066

1067
    html_demultiplexing_row()
×
1068
      .set_name("<b>Unidentified</b>")
×
1069
      .set_sample_pct(format_percentage(unidentified, input_reads, 2))
×
1070
      .set_reads(format_rough_number(output_reads))
×
1071
      .set_bp(format_rough_number(output_bp))
×
1072
      .set_length(mean_of_bp_counts(total.length_dist()))
×
1073
      .set_gc(format_percentage(total.nucleotides_gc_pos().sum(), output_bp))
×
1074
      .set_orientation(mixed_orientation ? "<td></td>" : "")
×
1075
      .write(output);
×
1076
  }
1077

1078
  size_t sample_idx = 0;
×
NEW
1079
  for (const auto& sample : *config.samples) {
×
1080
    const auto& output_stats = *stats.trimming.at(sample_idx);
×
1081
    const auto& barcode_counts = stats.demultiplexing->samples.at(sample_idx);
×
1082
    const auto sample_reads = barcode_counts.sum();
×
1083

1084
    fastq_statistics total;
×
1085

1086
    total += *output_stats.read_1;
×
1087
    total += *output_stats.read_2;
×
1088
    total += *output_stats.merged;
×
1089
    total += *output_stats.singleton;
×
1090
    // Not included in overview:
1091
    // total += *sample.discarded;
1092

1093
    const auto output_reads = total.length_dist().sum();
×
1094
    const auto output_bp = total.nucleotides_pos().sum();
×
1095

1096
    html_demultiplexing_row row;
×
1097
    if (sample.size() < 2) {
×
1098
      const auto& it = sample.at(0);
×
1099
      row.set_barcode_1(std::string{ it.barcode_1 })
×
1100
        .set_barcode_2(std::string{ it.barcode_2 });
×
1101

1102
      if (mixed_orientation) {
×
1103
        row.set_orientation("<td>" + orientation_to_label(it) + "</td>");
×
1104
      }
1105
    } else {
1106
      const auto cell = "<i>" + std::to_string(sample.size()) + " barcodes</i>";
×
1107
      row.set_barcode_1(cell).set_barcode_2(cell);
×
1108

1109
      if (mixed_orientation) {
×
1110
        row.set_orientation("<td></td>");
×
1111
      }
1112
    }
1113

1114
    row.set_n(std::to_string(sample_idx + 1))
×
1115
      .set_name(sample.name())
×
1116
      .set_sample_pct(format_percentage(sample_reads, input_reads, 2))
×
1117
      .set_reads(format_rough_number(output_reads))
×
1118
      .set_bp(format_rough_number(output_bp))
×
1119
      .set_length(mean_of_bp_counts(total.length_dist()))
×
1120
      .set_gc(format_percentage(total.nucleotides_gc_pos().sum(), output_bp))
×
1121
      .write(output);
×
1122

1123
    if (sample.size() > 1) {
×
1124
      const auto total_count = barcode_counts.sum();
×
1125

1126
      for (size_t j = 0; j < sample.size(); j++) {
×
1127
        const auto& it = sample.at(j);
×
1128
        const auto count = barcode_counts.get(j);
×
1129

1130
        html_demultiplexing_barcode_row barcode_row;
×
1131
        barcode_row.set_barcode_1(std::string{ it.barcode_1 })
×
1132
          .set_barcode_2(std::string{ it.barcode_2 })
×
1133
          .set_barcode_pct_row(format_percentage(count, total_count, 2));
×
1134

1135
        if (mixed_orientation) {
×
1136
          barcode_row.set_orientation("<td>" + orientation_to_label(it) +
×
1137
                                      "</td>");
1138
        }
1139

1140
        barcode_row.write(output);
×
1141
      }
1142
    }
1143

1144
    ++sample_idx;
×
1145
  }
1146

1147
  html_demultiplexing_table_tail().write(output);
×
1148

1149
  if (multiple_barcodes || mixed_orientation) {
×
1150
    html_demultiplexing_toggle().write(output);
×
1151
  }
1152
}
1153

1154
void
1155
write_html_demultiplexing_section(const userconfig& config,
×
1156
                                  const statistics& stats,
1157
                                  std::ostream& output)
1158

1159
{
1160
  bool multiple_barcodes = false;
×
1161
  bool mixed_orientation = false;
×
NEW
1162
  for (const auto& sample : *config.samples) {
×
1163
    multiple_barcodes |= sample.size() > 1;
×
1164
    for (const auto& it : sample) {
×
1165
      mixed_orientation |= it.orientation != barcode_orientation::unspecified;
×
1166
    }
1167
  }
1168

1169
  write_html_section_title("Demultiplexing", output);
×
1170
  html_demultiplexing_head().write(output);
×
1171
  write_html_demultiplexing_barplot(config, stats, output);
×
1172
  write_html_demultiplexing_table(config,
×
1173
                                  stats,
1174
                                  output,
1175
                                  multiple_barcodes,
1176
                                  mixed_orientation);
1177
}
1178

1179
void
1180
write_html_output_section(const userconfig& config,
×
1181
                          const statistics& stats,
1182
                          std::ostream& output)
1183

1184
{
1185
  fastq_stats_vec stats_vec;
×
1186
  string_vec names;
×
1187

1188
  auto merged = std::make_shared<fastq_statistics>();
×
1189

1190
  {
×
1191
    auto output_1 = std::make_shared<fastq_statistics>();
×
1192
    auto output_2 = std::make_shared<fastq_statistics>();
×
1193
    auto singleton = std::make_shared<fastq_statistics>();
×
1194
    auto discarded = std::make_shared<fastq_statistics>();
×
1195

1196
    for (const auto& it : stats.trimming) {
×
1197
      *output_1 += *it->read_1;
×
1198
      *output_2 += *it->read_2;
×
1199
      *merged += *it->merged;
×
1200
      *singleton += *it->singleton;
×
1201
      *discarded += *it->discarded;
×
1202
    }
1203

1204
    stats_vec.push_back(output_1);
×
1205
    names.emplace_back("Output 1");
×
1206

1207
    if (config.paired_ended_mode) {
×
1208
      stats_vec.push_back(output_2);
×
1209
      names.emplace_back("Output 2");
×
1210

1211
      if (config.is_any_filtering_enabled()) {
×
1212
        stats_vec.push_back(singleton);
×
1213
        names.emplace_back("Singleton");
×
1214
      }
1215
    }
1216

1217
    if (config.is_any_filtering_enabled()) {
×
1218
      stats_vec.push_back(discarded);
×
1219
      names.emplace_back("Discarded");
×
1220
    }
1221
  }
1222

1223
  write_html_io_section(config,
×
1224
                        output,
1225
                        "Output",
1226
                        std::move(stats_vec),
1227
                        std::move(names),
1228
                        merged);
1229
}
1230

1231
} // namespace
1232

1233
////////////////////////////////////////////////////////////////////////////////
1234

1235
bool
1236
write_html_report(const userconfig& config,
×
1237
                  const statistics& stats,
1238
                  const std::string& filename)
1239
{
1240
  if (filename == DEV_NULL) {
×
1241
    // User disabled the report
1242
    return true;
1243
  }
1244

1245
  std::ostringstream output;
×
1246

1247
  write_html_summary_section(config, stats, output);
×
1248

1249
  if (config.run_type != ar_command::demultiplex_only &&
×
1250
      config.run_type != ar_command::report_only) {
1251
    write_html_processing_section(config, stats, output);
×
1252
  }
1253

1254
  write_html_input_section(config, stats, output);
×
1255

1256
  if (config.paired_ended_mode || config.report_duplication ||
×
1257
      config.run_type == ar_command::report_only) {
×
1258
    write_html_analyses_section(config, stats, output);
×
1259
  }
1260

1261
  if (config.is_demultiplexing_enabled()) {
×
1262
    write_html_demultiplexing_section(config, stats, output);
×
1263
  }
1264

1265
  if (config.run_type != ar_command::report_only) {
×
1266
    write_html_output_section(config, stats, output);
×
1267
  }
1268

1269
  html_body_end().write(output);
×
1270

1271
  try {
×
1272
    managed_writer writer{ filename };
×
1273
    writer.write(output.str());
×
1274
    writer.close();
×
1275
  } catch (const io_error& error) {
×
1276
    log::error() << "Error writing JSON report to '" << filename << "':\n"
×
1277
                 << indent_lines(error.what());
×
1278
    return false;
×
1279
  }
×
1280

1281
  return true;
×
1282
}
1283

1284
} // namespace adapterremoval
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc