• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MikkelSchubert / adapterremoval / #45

20 Sep 2024 06:49PM UTC coverage: 26.244% (-49.2%) from 75.443%
#45

push

travis-ci

web-flow
attempt to fix coveralls run

2458 of 9366 relevant lines covered (26.24%)

4362.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/reports_html.cpp
1
/*************************************************************************\
2
 * AdapterRemoval - cleaning next-generation sequencing reads            *
3
 *                                                                       *
4
 * Copyright (C) 2022 by Mikkel Schubert - mikkelsch@gmail.com           *
5
 *                                                                       *
6
 * This program is free software: you can redistribute it and/or modify  *
7
 * it under the terms of the GNU General Public License as published by  *
8
 * the Free Software Foundation, either version 3 of the License, or     *
9
 * (at your option) any later version.                                   *
10
 *                                                                       *
11
 * This program is distributed in the hope that it will be useful,       *
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
14
 * GNU General Public License for more details.                          *
15
 *                                                                       *
16
 * You should have received a copy of the GNU General Public License     *
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. *
18
\*************************************************************************/
19
#include "adapter_id.hpp"            // for adapter_id_statistics
20
#include "counts.hpp"                // for counts, indexed_count, counts_tmpl
21
#include "debug.hpp"                 // for AR_REQUIRE
22
#include "fastq.hpp"                 // for ACGT, ACGT::values, fastq, ACGTN
23
#include "json.hpp"                  // for json_dict, json_list, json_ptr
24
#include "logging.hpp"               // for log_stream, error
25
#include "main.hpp"                  // for VERSION, NAME
26
#include "managed_io.hpp"            // for managed_io
27
#include "output.hpp"                // for DEV_NULL, output_files
28
#include "reports.hpp"               // for write_html_report
29
#include "reports_template_html.hpp" // for html_frequency_plot, html_demultiple...
30
#include "sequence_sets.hpp"         // for adapter_set
31
#include "simd.hpp"                  // for size_t
32
#include "statistics.hpp"            // for fastq_stats_ptr, fastq_statistics
33
#include "strutils.hpp"              // for format_percentage, format_rough...
34
#include "userconfig.hpp"            // for userconfig, ar_command, DEV_NULL
35
#include <algorithm>                 // for max
36
#include <cctype>                    // for toupper
37
#include <cerrno>                    // for errno
38
#include <cmath>                     // for fmod
39
#include <cstdint>                   // for uint64_t
40
#include <cstring>                   // for size_t, strerror
41
#include <iomanip>                   // for operator<<, setprecision, setw
42
#include <memory>                    // for __shared_ptr_access, shared_ptr
43
#include <sstream>                   // for ostringstream
44
#include <string>                    // for string, operator==, to_string
45
#include <utility>                   // for pair
46
#include <vector>                    // for vector
47

48
namespace adapterremoval {
49

50
namespace {
51

52
using fastq_stats_vec = std::vector<fastq_stats_ptr>;
53
using template_ptr = std::unique_ptr<html_template>;
54

55
//! Width of full-size figures; selected so that two pages can be shown
//! side-by-side on a display that is 1920 wide
constexpr const char* FIGURE_WIDTH = "736";
//! Approximate width of each figure in two-column facet figures
constexpr const char* FACET_WIDTH_2 = "351";
//! Approximate width of each figure in one-column facet figures
constexpr const char* FACET_WIDTH_1 = FIGURE_WIDTH;
61

62
////////////////////////////////////////////////////////////////////////////////
63

64
/** Escapes a string that needs to be embedded in a JS */
65
std::string
66
json_encode(const std::string& s)
×
67
{
68
  return json_token::from_str(s)->to_string();
×
69
}
70

71
/** JSON escaped string */
72
std::string
73
operator""_json(const char* s, size_t length)
×
74
{
75
  return json_encode(std::string(s, length));
×
76
}
77

78
/**
 * Formats a duration in seconds as human readable text, for example
 * "1 hour,  5 minutes, and  7.5 seconds".
 *
 * Hours are only included for durations of at least one hour, and minutes only
 * for durations of at least one minute. Seconds are always written, using one
 * decimal. Fields that follow a larger unit are space-padded to a fixed width.
 */
std::string
runtime_to_str(double seconds)
{
  std::ostringstream ss;

  if (seconds >= 3600.0) {
    const auto hours = static_cast<size_t>(seconds / 3600.0);

    // The field width applies to the next value written, i.e. the minutes
    ss << hours << " " << (hours == 1 ? "hour, " : "hours, ") << std::setw(2);
  }

  if (seconds >= 60.0) {
    const auto minutes = static_cast<size_t>(std::fmod(seconds, 3600.0) / 60.0);

    // `minutes` is always in the range 0-59; only exactly 1 is singular
    ss << minutes << " " << (minutes == 1 ? "minute" : "minutes") << ", and "
       << std::setw(4);
  }

  ss << std::fixed << std::setprecision(1) << std::fmod(seconds, 60.0)
     << " seconds";

  return ss.str();
}
100

101
std::string
102
mean_of_bp_counts(const counts& count)
×
103
{
104
  auto reads = count.sum();
×
105
  auto bases = count.product();
×
106

107
  if (!reads) {
×
108
    return "NA";
×
109
  }
110

111
  if (bases % reads == 0) {
×
112
    return std::to_string(bases / reads) + " bp";
×
113
  }
114

115
  std::ostringstream ss;
×
116
  ss << std::fixed << std::setprecision(1)
×
117
     << (bases / static_cast<double>(reads)) << " bp";
×
118

119
  return ss.str();
×
120
}
121

122
/**
123
 * VEGA-lite will omit plots if there are no values; this function therefore
124
 * ensures that at least one value is written for a given measurement.
125
 */
126
template<typename T>
127
counts_tmpl<T>
128
require_values(counts_tmpl<T> r, T fallback = T())
×
129
{
130
  if (r.size()) {
×
131
    return r;
×
132
  }
133

134
  return counts_tmpl<T>({ fallback });
×
135
}
136

137
std::string
138
format_average_bases(const reads_and_bases& counts)
×
139
{
140
  const auto reads = counts.reads();
×
141

142
  if (reads) {
×
143
    return format_fraction(counts.bases(), reads, 1) + " bp";
×
144
  } else {
145
    return "NA";
×
146
  }
147
}
148

149
////////////////////////////////////////////////////////////////////////////////
150

151
class io_summary_writer
×
152
{
153
public:
154
  enum class io
155
  {
156
    input,
157
    output
158
  };
159

160
  io_summary_writer(const std::string& title, io type)
×
161
    : m_type(type)
×
162
  {
163
    m_writer.set_title(title);
×
164
  }
165

166
  void set_href(const std::string& value) { m_writer.set_href(value); }
×
167

168
  void write(std::ostream& output) { m_writer.write(output); }
×
169

170
  void add_column(const std::string& title, const fastq_statistics& stats)
×
171
  {
172
    m_writer.add_columns(title);
×
173

174
    const auto n_reads = (m_type == io::input) ? stats.number_of_input_reads()
×
175
                                               : stats.number_of_output_reads();
×
176
    m_writer.add_n_reads(format_rough_number(n_reads));
×
177

178
    m_writer.add_n_bases(format_rough_number(stats.length_dist().product()));
×
179

180
    m_writer.add_lengths(mean_of_bp_counts(stats.length_dist()));
×
181

182
    auto total = stats.quality_dist().sum();
×
183
    m_writer.add_q30(format_percentage(stats.quality_dist().sum(30), total));
×
184
    m_writer.add_q20(format_percentage(stats.quality_dist().sum(20), total));
×
185
    m_writer.add_ns(format_percentage(stats.nucleotides_pos('N').sum(), total));
×
186
    m_writer.add_gc(format_percentage(stats.nucleotides_gc_pos().sum(), total));
×
187
  }
188

189
private:
190
  html_summary_io m_writer{};
191
  io m_type;
192
};
193

194
std::string
195
build_base_qualities(const fastq_stats_vec& reads, const string_vec& names)
×
196
{
197
  json_list qualities;
×
198

199
  for (size_t i = 0; i < reads.size(); ++i) {
×
200
    const auto& stats = *reads.at(i);
×
201

202
    auto total_quality = stats.qualities_pos();
×
203
    auto total_bases = stats.nucleotides_pos();
×
204

205
    for (const auto nucleotide : ACGT::values) {
×
206
      const auto nucleotides = stats.nucleotides_pos(nucleotide);
×
207
      const auto quality = stats.qualities_pos(nucleotide);
×
208

209
      auto dict = qualities.dict();
×
210
      dict->str("read", names.at(i));
×
211
      dict->i64("offset", 1);
×
212
      dict->str("group", std::string(1, ::toupper(nucleotide)));
×
213
      dict->f64_vec("y", quality / nucleotides);
×
214
    }
215

216
    auto dict = qualities.dict();
×
217
    dict->str("read", names.at(i));
×
218
    dict->i64("offset", 1);
×
219
    dict->str("group", "Mean");
×
220

221
    // Ensure that values get written, to prevent the plot being omitted
222
    dict->f64_vec("y", require_values(total_quality / total_bases));
×
223
  }
224

225
  return qualities.to_string();
×
226
}
227

228
std::string
229
build_quality_distribution(const fastq_stats_vec& reads,
×
230
                           const string_vec& names)
231
{
232
  json_list data;
×
233

234
  for (size_t i = 0; i < reads.size(); ++i) {
×
235
    const auto& stats = reads.at(i);
×
236
    auto count = stats->quality_dist().trim();
×
237
    // A max that should give a uniform look to most data
238
    count.resize_up_to(44);
×
239

240
    const auto m = data.dict();
×
241
    m->str("group", names.at(i));
×
242
    m->i64("offset", 0);
×
243
    m->i64_vec("y", count);
×
244
  }
245

246
  return data.to_string();
×
247
}
248

249
std::string
250
build_base_content(const fastq_stats_vec& reads, const string_vec& names)
×
251
{
252
  json_list content;
×
253

254
  for (size_t i = 0; i < reads.size(); ++i) {
×
255
    const auto& stats = *reads.at(i);
×
256

257
    auto total_bases = stats.nucleotides_pos();
×
258

259
    for (const auto nucleotide : ACGTN::values) {
×
260
      const auto bases = stats.nucleotides_pos(nucleotide);
×
261

262
      const auto dict = content.dict();
×
263
      dict->str("read", names.at(i));
×
264
      dict->i64("offset", 1);
×
265
      dict->str("group", std::string(1, nucleotide));
×
266

267
      // Ensure that values get written, to prevent the plot being omitted
268
      dict->f64_vec("y", require_values(bases / total_bases));
×
269
    }
270

271
    {
×
272
      const auto gc_content = stats.nucleotides_gc_pos();
×
273
      auto dict = content.dict();
×
274
      dict->str("read", names.at(i));
×
275
      dict->i64("offset", 1);
×
276
      dict->str("group", "GC");
×
277

278
      // Ensure that values get written, to prevent the plot being omitted
279
      dict->f64_vec("y", require_values(gc_content / total_bases));
×
280
    }
281
  }
282

283
  return content.to_string();
×
284
}
285

286
////////////////////////////////////////////////////////////////////////////////
287
// Main sections
288

289
void
290
write_html_sampling_note(const userconfig& config,
×
291
                         const std::string& label,
292
                         const fastq_statistics& stats,
293
                         std::ostream& output)
294
{
295
  if (config.report_sample_rate < 1.0) {
×
296
    html_sampling_note()
×
297
      .set_label(label)
×
298
      .set_reads(format_rough_number((stats.number_of_sampled_reads())))
×
299
      .set_pct(format_percentage(stats.number_of_sampled_reads(),
×
300
                                 stats.number_of_input_reads()))
×
301
      .write(output);
×
302
  }
303
}
304

305
void
306
write_html_summary_section(const userconfig& config,
×
307
                           const statistics& stats,
308
                           std::ostream& output)
309
{
310
  html_head().set_name(NAME).set_version(VERSION).write(output);
×
311

312
  html_body_start().write(output);
×
313

314
  // Basic information about the executable / call
315
  {
×
316
    html_summary()
×
317
      .set_date_and_time(userconfig::start_time)
×
318
      .set_version(VERSION)
×
319
      .set_command(shell_escape_command(config.args))
×
320
      .set_runtime(runtime_to_str(config.runtime()))
×
321
      .write(output);
×
322
  }
323

324
  fastq_statistics output_1;
×
325
  fastq_statistics output_2;
×
326
  fastq_statistics merged;
×
327
  fastq_statistics singleton;
×
328
  fastq_statistics discarded;
×
329

330
  for (const auto& it : stats.trimming) {
×
331
    output_1 += *it->read_1;
×
332
    output_2 += *it->read_2;
×
333
    merged += *it->merged;
×
334
    singleton += *it->singleton;
×
335
    discarded += *it->discarded;
×
336
  }
337

338
  if (config.paired_ended_mode) {
×
339
    // Summary statistics for input files
340
    {
×
341
      fastq_statistics totals;
×
342
      totals += *stats.input_1;
×
343
      totals += *stats.input_2;
×
344

345
      io_summary_writer summary("Input", io_summary_writer::io::input);
×
346

347
      summary.set_href("summary-input");
×
348

349
      if (config.paired_ended_mode) {
×
350
        summary.add_column("Summary", totals);
×
351
        summary.add_column("File 1", *stats.input_1);
×
352
        summary.add_column("File 2", *stats.input_2);
×
353
      }
354

355
      summary.write(output);
×
356

357
      write_html_sampling_note(config, "input", totals, output);
×
358
    }
359

360
    // Summary statistics for output files
361
    if (config.run_type != ar_command::report_only) {
×
362
      fastq_statistics totals;
×
363
      totals += output_1;
×
364
      totals += output_2;
×
365
      totals += merged;
×
366
      totals += singleton;
×
367
      // discarded reads not counted in the output
368
      // totals += discarded;
369

370
      io_summary_writer summary("Output", io_summary_writer::io::output);
×
371

372
      summary.set_href("summary-output");
×
373
      summary.add_column("Passed*", totals);
×
374

375
      if (config.paired_ended_mode) {
×
376
        summary.add_column("File 1", output_1);
×
377
        summary.add_column("File 2", output_2);
×
378

379
        if (config.is_read_merging_enabled()) {
×
380
          summary.add_column("Merged", merged);
×
381
        }
382

383
        if (config.is_any_filtering_enabled()) {
×
384
          summary.add_column("Singleton", singleton);
×
385
        }
386
      }
387

388
      if (config.is_any_filtering_enabled()) {
×
389
        summary.add_column("Discarded*", discarded);
×
390
      }
391

392
      summary.write(output);
×
393

394
      write_html_sampling_note(config, "output", totals, output);
×
395

396
      // Note regarding passed / discarded reads
397
      html_output_footnote()
×
398
        .set_symbol("*")
×
399
        .set_text("The <b>Passed</b> column includes all read types except for "
×
400
                  "<b>Discarded</b> reads.")
401
        .write(output);
×
402
    }
403
  } else if (config.run_type == ar_command::report_only) {
×
404
    io_summary_writer summary("Input summary", io_summary_writer::io::input);
×
405

406
    summary.set_href("summary-input");
×
407
    summary.add_column("Input", *stats.input_1);
×
408
    summary.write(output);
×
409

410
    write_html_sampling_note(config, "input", *stats.input_1, output);
×
411
  } else {
×
412
    io_summary_writer summary("Input/Output summary",
×
413
                              io_summary_writer::io::input);
×
414

415
    summary.set_href("summary-input-output");
×
416
    summary.add_column("Input", *stats.input_1);
×
417
    summary.add_column("Output", output_1);
×
418
    if (config.is_any_filtering_enabled()) {
×
419
      summary.add_column("Discarded*", discarded);
×
420
    }
421
    summary.write(output);
×
422

423
    fastq_statistics totals;
×
424
    totals += *stats.input_1;
×
425
    totals += output_1;
×
426

427
    write_html_sampling_note(config, "input/output", totals, output);
×
428

429
    if (config.is_any_filtering_enabled()) {
×
430
      // Note regarding discarded reads in output
431
      html_output_footnote()
×
432
        .set_symbol("*")
×
433
        .set_text("<b>Discarded</b> reads are not included in the "
×
434
                  "<b>Output</b> column.")
435
        .write(output);
×
436
    }
437
  }
438
}
439

440
//! Trimming statistics
441
struct trimming_stats
442
{
443
  size_t id;
444
  //! Processing stage relative to adapter trimming (pre, X, post)
445
  std::string stage;
446
  //! Row label 1 (step)
447
  std::string label_1;
448
  //! Row label 1 (sub-step)
449
  std::string label_2;
450
  //! Whether or not this step is enabled by command-line options
451
  bool enabled;
452
  //! Number of reads/bases trimmed/filtered
453
  reads_and_bases count;
454
};
455

456
void
457
write_html_trimming_stats(std::ostream& output,
×
458
                          const std::vector<trimming_stats>& stats,
459
                          const reads_and_bases& totals)
460
{
461
  size_t n_processing_steps = 0;
×
462
  size_t n_processing_steps_on = 0;
×
463
  size_t n_filtering_steps = 0;
×
464
  size_t n_filtering_steps_on = 0;
×
465

466
  size_t last_id = -1;
×
467
  size_t last_enabled = -1;
×
468
  for (const auto& it : stats) {
×
469
    if (it.id != last_id) {
×
470
      if (it.stage == "Processing") {
×
471
        n_processing_steps++;
×
472
      } else if (it.stage == "Filtering") {
×
473
        n_filtering_steps++;
×
474
      }
475

476
      last_id = it.id;
×
477
    }
478

479
    if (it.enabled && it.id != last_enabled) {
×
480
      if (it.stage == "Processing") {
×
481
        n_processing_steps_on++;
×
482
      } else if (it.stage == "Filtering") {
×
483
        n_filtering_steps_on++;
×
484
      }
485

486
      last_enabled = it.id;
×
487
    }
488
  }
489

490
  html_summary_trimming_head().write(output);
×
491

492
  std::string previous_stage;
×
493
  std::string previous_label_1;
×
494

495
  for (const auto& it : stats) {
×
496
    if (it.enabled) {
×
497
      const auto label_1 = it.label_1 == previous_label_1 ? "" : it.label_1;
×
498
      const auto stage = it.stage == previous_stage ? "" : it.stage;
×
499

500
      previous_stage = it.stage;
×
501
      previous_label_1 = it.label_1;
×
502

503
      html_summary_trimming_row()
×
504
        .set_stage(stage)
×
505
        .set_label_1(label_1)
×
506
        .set_label_2(it.label_2)
×
507
        .set_reads(format_rough_number(it.count.reads()))
×
508
        .set_pct_reads(format_percentage(it.count.reads(), totals.reads()))
×
509
        .set_bases(format_rough_number(it.count.bases()))
×
510
        .set_pct_bases(format_percentage(it.count.bases(), totals.bases()))
×
511
        .set_avg_bases(format_average_bases(it.count))
×
512
        .write(output);
×
513
    }
514
  }
515

516
  html_summary_trimming_tail()
×
517
    .set_n_enabled_filt(std::to_string(n_filtering_steps_on))
×
518
    .set_n_total_filt(std::to_string(n_filtering_steps))
×
519
    .set_n_enabled_proc(std::to_string(n_processing_steps_on))
×
520
    .set_n_total_proc(std::to_string(n_processing_steps))
×
521
    .write(output);
×
522
}
523

524
//! Filtering statistics
525
struct filtering_stats
526
{
527
  //! Filtering step
528
  std::string label;
529
  //! Whether or not this step is enabled by command-line options
530
  bool enabled;
531
  //! Number of reads/bases trimmed/filtered
532
  reads_and_bases count;
533
};
534

535
/**
 * Returns the number of input reads recorded in `ptr`, along with the number
 * of bases calculated as the product of the length distribution.
 */
reads_and_bases
summarize_input(const fastq_stats_ptr& ptr)
{
  const auto n_bases = ptr->length_dist().product();
  // The cast below requires that the value is non-negative
  AR_REQUIRE(n_bases >= 0);

  const auto n_reads = ptr->number_of_input_reads();

  return reads_and_bases{ n_reads, static_cast<uint64_t>(n_bases) };
}
544

545
void
546
build_polyx_trimming_rows(std::vector<trimming_stats>& out,
×
547
                          const std::string& polyx_nucleotides,
548
                          const indexed_count<ACGT>& reads,
549
                          const indexed_count<ACGT>& bases,
550
                          const size_t id)
551
{
552
  for (const auto nucleotide : ACGT::values) {
×
553
    out.push_back(
×
554
      { id,
555
        "Processing",
556
        "Poly-X tails",
557
        std::string(1, nucleotide),
558
        polyx_nucleotides.find(nucleotide) != std::string::npos,
×
559
        reads_and_bases(reads.get(nucleotide), bases.get(nucleotide)) });
×
560
  }
561

562
  out.push_back({ id,
×
563
                  "Processing",
564
                  "Poly-X tails",
565
                  "*",
566
                  polyx_nucleotides.size() > 1,
×
567
                  reads_and_bases(reads.sum(), bases.sum()) });
×
568
}
569

570
void
571
write_html_processing_section(const userconfig& config,
×
572
                              const statistics& stats,
573
                              std::ostream& output)
574
{
575
  trimming_statistics totals;
×
576
  for (const auto& it : stats.trimming) {
×
577
    totals += *it;
×
578
  }
579

580
  uint64_t adapter_reads = 0;
×
581
  uint64_t adapter_bases = 0;
×
582

583
  for (size_t i = 0; i < config.samples.adapters().size(); ++i) {
×
584
    adapter_reads += totals.adapter_trimmed_reads.get(i);
×
585
    adapter_bases += totals.adapter_trimmed_bases.get(i);
×
586
  }
587

588
  const auto total_input =
×
589
    summarize_input(stats.input_1) + summarize_input(stats.input_2);
×
590

591
  reads_and_bases total_output;
×
592
  for (const auto& it : stats.trimming) {
×
593
    total_output += summarize_input(it->read_1);
×
594
    total_output += summarize_input(it->read_2);
×
595
    total_output += summarize_input(it->singleton);
×
596
    total_output += summarize_input(it->merged);
×
597
  }
598

599
  // Trimming steps prior to adapter trimming
600
  size_t step_id = 0;
×
601
  std::vector<trimming_stats> trimming = {
×
602
    { step_id++, "Input", "Raw reads", "-", true, total_input },
×
603
    { step_id++,
×
604
      "Processing",
605
      "Terminal bases",
606
      "-",
607
      config.is_terminal_base_pre_trimming_enabled(),
×
608
      totals.terminal_pre_trimmed },
609
  };
610

611
  build_polyx_trimming_rows(trimming,
×
612
                            config.pre_trim_poly_x,
×
613
                            totals.poly_x_pre_trimmed_reads,
614
                            totals.poly_x_pre_trimmed_bases,
615
                            step_id++);
616

617
  trimming.push_back({ step_id++,
×
618
                       "Processing",
619
                       "Adapters",
620
                       "-",
621
                       config.is_adapter_trimming_enabled(),
×
622
                       reads_and_bases(adapter_reads, adapter_bases) });
623

624
  trimming.push_back({ step_id++,
×
625
                       "Processing",
626
                       "Merging",
627
                       "-",
628
                       config.is_read_merging_enabled(),
×
629
                       totals.reads_merged });
630

631
  trimming.push_back({ step_id++,
×
632
                       "Processing",
633
                       "Terminal bases",
634
                       "-",
635
                       config.is_terminal_base_post_trimming_enabled(),
×
636
                       totals.terminal_post_trimmed });
637

638
  build_polyx_trimming_rows(trimming,
×
639
                            config.post_trim_poly_x,
×
640
                            totals.poly_x_post_trimmed_reads,
641
                            totals.poly_x_post_trimmed_bases,
642
                            step_id++);
643

644
  trimming.push_back({ step_id++,
×
645
                       "Processing",
646
                       "Low quality bases",
647
                       "-",
648
                       config.is_low_quality_trimming_enabled(),
×
649
                       totals.low_quality_trimmed });
650

651
  trimming.push_back({ step_id++,
×
652
                       "Filtering",
653
                       "Short reads",
654
                       "-",
655
                       config.is_short_read_filtering_enabled(),
×
656
                       totals.filtered_min_length });
657

658
  trimming.push_back({ step_id++,
×
659
                       "Filtering",
660
                       "Long reads",
661
                       "-",
662
                       config.is_long_read_filtering_enabled(),
×
663
                       totals.filtered_max_length });
664
  trimming.push_back({ step_id++,
×
665
                       "Filtering",
666
                       "Ambiguous bases",
667
                       "-",
668
                       config.is_ambiguous_base_filtering_enabled(),
×
669
                       totals.filtered_ambiguous });
670
  trimming.push_back({ step_id++,
×
671
                       "Filtering",
672
                       "Mean quality",
673
                       "-",
674
                       config.is_mean_quality_filtering_enabled(),
×
675
                       totals.filtered_mean_quality });
676
  trimming.push_back({ step_id++,
×
677
                       "Filtering",
678
                       "Low complexity reads",
679
                       "-",
680
                       config.is_low_complexity_filtering_enabled(),
×
681
                       totals.filtered_low_complexity });
682

683
  trimming.push_back(
×
684
    { step_id++, "Output", "Filtered reads", "-", true, total_output });
×
685

686
  write_html_trimming_stats(output, trimming, total_input);
×
687
}
688

689
void
690
write_html_section_title(const std::string& title, std::ostream& output)
×
691
{
692
  html_h2_tag().set_title(title).set_href(to_lower(title)).write(output);
×
693
}
694

695
void
696
write_html_io_section(const userconfig& config,
×
697
                      std::ostream& output,
698
                      const std::string& title,
699
                      fastq_stats_vec statistics,
700
                      string_vec names,
701
                      const fastq_stats_ptr& merged = fastq_stats_ptr())
702
{
703
  AR_REQUIRE(statistics.size() == names.size());
×
704

705
  write_html_section_title(title, output);
×
706

707
  const char* dynamic_width =
×
708
    config.paired_ended_mode || merged ? FACET_WIDTH_2 : FACET_WIDTH_1;
×
709

710
  html_plot_title()
×
711
    .set_href(to_lower(title) + "-position-qualities")
×
712
    .set_title("Position quality distribution")
×
713
    .write(output);
×
714
  html_facet_line_plot()
×
715
    .set_x_axis(config.is_read_merging_enabled() && merged ? "null"
×
716
                                                           : "Position"_json)
717
    .set_y_axis("Phred score"_json)
×
718
    .set_width(dynamic_width)
×
719
    .set_values(build_base_qualities(statistics, names))
×
720
    .write(output);
×
721

722
  if (config.is_read_merging_enabled() && merged) {
×
723
    html_facet_line_plot()
×
724
      .set_x_axis("Position"_json)
×
725
      .set_y_axis("Phred score"_json)
×
726
      .set_width(FIGURE_WIDTH)
×
727
      .set_values(build_base_qualities({ merged }, { "Merged" }))
×
728
      .write(output);
×
729
  }
730

731
  html_plot_title()
×
732
    .set_href(to_lower(title) + "-nucleotide-content")
×
733
    .set_title("Nucleotide content")
×
734
    .write(output);
×
735
  html_facet_line_plot()
×
736
    .set_x_axis(config.is_read_merging_enabled() && merged ? "null"
×
737
                                                           : "Position"_json)
738
    .set_y_axis("Frequency"_json)
×
739
    .set_width(dynamic_width)
×
740
    .set_values(build_base_content(statistics, names))
×
741
    .write(output);
×
742

743
  if (config.is_read_merging_enabled() && merged) {
×
744
    html_facet_line_plot()
×
745
      .set_x_axis("Position"_json)
×
746
      .set_y_axis("Frequency"_json)
×
747
      .set_width(FIGURE_WIDTH)
×
748
      .set_values(build_base_content({ merged }, { "Merged" }))
×
749
      .write(output);
×
750

751
    // Subsequent plots should include merged reads
752
    names.push_back("Merged");
×
753
    statistics.push_back(merged);
×
754
  }
755

756
  html_plot_title()
×
757
    .set_href(to_lower(title) + "-quality-scores")
×
758
    .set_title("Quality score distribution")
×
759
    .write(output);
×
760
  html_frequency_plot()
×
761
    .set_x_axis("Phred score"_json)
×
762
    .set_y_axis("Frequency"_json)
×
763
    .set_width(FIGURE_WIDTH)
×
764
    .set_values(build_quality_distribution(statistics, names))
×
765
    .write(output);
×
766

767
  {
×
768
    json_list data;
×
769

770
    for (size_t i = 0; i < statistics.size(); ++i) {
×
771
      const auto m = data.dict();
×
772
      m->str("group", names.at(i));
×
773
      m->i64("offset", 0);
×
774
      m->f64_vec("y", statistics.at(i)->gc_content());
×
775
    }
776

777
    html_plot_title()
×
778
      .set_href(to_lower(title) + "-gc-content")
×
779
      .set_title("GC Content")
×
780
      .write(output);
×
781
    html_frequency_plot()
×
782
      .set_x_axis("%GC"_json)
×
783
      .set_y_axis("Frequency"_json)
×
784
      .set_width(FIGURE_WIDTH)
×
785
      .set_values(data.to_string())
×
786
      .write(output);
×
787
  }
788
}
789

790
void
791
write_html_input_section(const userconfig& config,
×
792
                         const statistics& stats,
793
                         std::ostream& output)
794
{
795
  fastq_stats_vec stats_vec = { stats.input_1 };
×
796
  string_vec names = { "File 1" };
×
797

798
  if (config.paired_ended_mode) {
×
799
    stats_vec.push_back(stats.input_2);
×
800
    names.emplace_back("File 2");
×
801
  }
802

803
  write_html_io_section(
×
804
    config, output, "Input", std::move(stats_vec), std::move(names));
×
805
}
806

807
void
808
write_html_analyses_section(const userconfig& config,
×
809
                            const statistics& stats,
810
                            std::ostream& output)
811

812
{
813
  write_html_section_title("Analyses", output);
×
814

815
  // Insert size distribution
816
  if (config.paired_ended_mode) {
×
817
    counts insert_sizes;
×
818
    for (const auto& it : stats.trimming) {
×
819
      insert_sizes += it->insert_sizes;
×
820
    }
821

822
    json_list samples;
×
823
    const auto sample = samples.dict();
×
824
    sample->str("group", "insert_sizes");
×
825
    sample->i64("offset", 0);
×
826
    sample->i64_vec("y", insert_sizes);
×
827

828
    // FIXME: Specify "identified reads" when in demultiplexing mode and
829
    // correct format_percentage to merged / n_identified.
830
    std::ostringstream ss;
×
831
    ss << "Insert sizes inferred for "
×
832
       << format_percentage(insert_sizes.sum(),
×
833
                            stats.input_1->number_of_input_reads())
×
834
       << "% of reads";
×
835

836
    html_plot_title()
×
837
      .set_href("analyses-insert-sizes")
×
838
      .set_title("Insert-size distribution")
×
839
      .write(output);
×
840
    html_plot_sub_title().set_sub_title(ss.str()).write(output);
×
841
    html_frequency_plot()
×
842
      .set_x_axis("Insert size"_json)
×
843
      .set_y_axis("Frequency"_json)
×
844
      .set_legend("null")
×
845
      .set_width(FIGURE_WIDTH)
×
846
      .set_values(samples.to_string())
×
847
      .write(output);
×
848

849
    if (config.run_type == ar_command::report_only) {
×
850
      html_output_note()
×
851
        .set_text(
×
852
          "Insert size distribution inferred using adapter-free alignments.")
853
        .write(output);
×
854
    }
855
  }
856

857
  // Consensus adapter sequence inference
858
  if (config.paired_ended_mode && config.run_type == ar_command::report_only) {
×
859
    AR_REQUIRE(stats.adapter_id);
×
860

861
    const auto adapter_1 = stats.adapter_id->adapter1.summarize();
×
862
    const auto adapter_2 = stats.adapter_id->adapter2.summarize();
×
863

864
    // Consensus adapter sequences
865
    {
×
866
      const auto reference_adapters =
×
867
        config.samples.adapters().to_read_orientation().front();
×
868
      std::string reference_adapter_1{ reference_adapters.first };
×
869
      std::string reference_adapter_2{ reference_adapters.second };
×
870

871
      html_consensus_adapter_head()
×
872
        .set_overlapping_pairs(
×
873
          format_rough_number(stats.adapter_id->aligned_pairs))
×
874
        .set_pairs_with_adapters(
×
875
          format_rough_number(stats.adapter_id->pairs_with_adapters))
×
876
        .write(output);
×
877

878
      html_consensus_adapter_table()
×
879
        .set_name_1("--adapter1")
×
880
        .set_reference_1(reference_adapter_1)
×
881
        .set_alignment_1(adapter_1.compare_with(reference_adapter_1))
×
882
        .set_consensus_1(adapter_1.adapter().sequence())
×
883
        .set_qualities_1(adapter_1.adapter().qualities())
×
884
        .set_name_2("--adapter2")
×
885
        .set_reference_2(reference_adapter_2)
×
886
        .set_alignment_2(adapter_2.compare_with(reference_adapter_2))
×
887
        .set_consensus_2(adapter_2.adapter().sequence())
×
888
        .set_qualities_2(adapter_2.adapter().qualities())
×
889
        .write(output);
×
890
    }
891

892
    // Top N most common 5' kmers in adapter fragments
893
    {
×
894
      const auto& top_kmers_1 = adapter_1.top_kmers();
×
895
      const auto& top_kmers_2 = adapter_2.top_kmers();
×
896

897
      html_consensus_adapter_kmer_head()
×
898
        .set_n_kmers(std::to_string(consensus_adapter_stats::top_n_kmers))
×
899
        .set_kmer_length(std::to_string(consensus_adapter_stats::kmer_length))
×
900
        .write(output);
×
901

902
      const auto kmers = std::max(top_kmers_1.size(), top_kmers_2.size());
×
903
      for (size_t i = 0; i < kmers; ++i) {
×
904
        html_consensus_adapter_kmer_row row;
×
905
        row.set_index(std::to_string(i + 1));
×
906

907
        if (top_kmers_1.size() > i) {
×
908
          const auto& kmer = top_kmers_1.at(i);
×
909

910
          row.set_kmer_1(kmer.first)
×
911
            .set_count_1(format_rough_number(kmer.second))
×
912
            .set_pct_1(format_percentage(kmer.second, adapter_1.total_kmers()));
×
913
        } else {
914
          row.set_kmer_1("").set_count_1("").set_pct_1("");
×
915
        }
916

917
        if (top_kmers_2.size() > i) {
×
918
          const auto& kmer = top_kmers_2.at(i);
×
919

920
          row.set_kmer_2(kmer.first)
×
921
            .set_count_2(format_rough_number(kmer.second))
×
922
            .set_pct_2(format_percentage(kmer.second, adapter_2.total_kmers()));
×
923
        } else {
924
          row.set_kmer_2("").set_count_2("").set_pct_2("");
×
925
        }
926

927
        row.write(output);
×
928
      }
929

930
      html_consensus_adapter_kmer_tail().write(output);
×
931
    }
932
  }
933
}
934

935
std::pair<std::string, std::string>
936
join_barcodes(const sample& s)
×
937
{
938
  string_vec mate_1;
×
939
  string_vec mate_2;
×
940

941
  for (const auto& barcode : s) {
×
942
    mate_1.emplace_back(barcode.barcode_1);
×
943
    mate_2.emplace_back(barcode.barcode_2);
×
944
  }
945

946
  return {
×
947
    join_text(mate_1, "<br/>"),
×
948
    join_text(mate_2, "<br/>"),
×
949
  };
950
}
951

952
void
953
write_html_demultiplexing_section(const userconfig& config,
×
954
                                  const statistics& stats,
955
                                  std::ostream& output)
956

957
{
958
  write_html_section_title("Demultiplexing", output);
×
959

960
  json_list data;
×
961

962
  const size_t input_reads = stats.input_1->number_of_input_reads() +
×
963
                             stats.input_2->number_of_input_reads();
×
964

965
  for (size_t i = 0; i < config.samples.size(); ++i) {
×
966
    auto m = data.dict();
×
967
    m->str("x", config.samples.at(i).name());
×
968

969
    if (input_reads) {
×
970
      m->f64("y", (100.0 * stats.demultiplexing->samples.at(i)) / input_reads);
×
971
    } else {
972
      m->null("y");
×
973
    }
974
  }
975

976
  html_plot_title()
×
977
    .set_href("demux-samples")
×
978
    .set_title("Samples identified")
×
979
    .write(output);
×
980
  html_bar_plot()
×
981
    .set_x_axis("Samples"_json)
×
982
    .set_y_axis("Percent"_json)
×
983
    .set_width(FIGURE_WIDTH)
×
984
    .set_values(data.to_string())
×
985
    .write(output);
×
986

987
  html_demultiplexing_head().write(output);
×
988

989
  {
×
990
    const size_t unidentified = stats.demultiplexing->unidentified;
×
991

992
    fastq_statistics total;
×
993
    total += *stats.demultiplexing->unidentified_stats_1;
×
994
    total += *stats.demultiplexing->unidentified_stats_2;
×
995

996
    const auto output_reads = total.length_dist().sum();
×
997
    const auto output_bp = total.nucleotides_pos().sum();
×
998

999
    html_demultiplexing_row()
×
1000
      .set_n("")
×
1001
      .set_barcode_1("")
×
1002
      .set_barcode_2("")
×
1003
      .set_name("<b>Unidentified</b>")
×
1004
      .set_pct(format_percentage(unidentified, input_reads, 2))
×
1005
      .set_reads(format_rough_number(output_reads))
×
1006
      .set_bp(format_rough_number(output_bp))
×
1007
      .set_length(mean_of_bp_counts(total.length_dist()))
×
1008
      .set_gc(format_percentage(total.nucleotides_gc_pos().sum(), output_bp))
×
1009
      .write(output);
×
1010
  }
1011

1012
  size_t sample_idx = 0;
×
1013
  for (const auto& sample : config.samples) {
×
1014
    const auto& sample_stats = *stats.trimming.at(sample_idx);
×
1015

1016
    fastq_statistics total;
×
1017

1018
    total += *sample_stats.read_1;
×
1019
    total += *sample_stats.read_2;
×
1020
    total += *sample_stats.merged;
×
1021
    total += *sample_stats.singleton;
×
1022
    // Not included in overview:
1023
    // total += *sample.discarded;
1024

1025
    const auto output_reads = total.length_dist().sum();
×
1026
    const auto output_bp = total.nucleotides_pos().sum();
×
1027
    const auto barcodes = join_barcodes(sample);
×
1028

1029
    html_demultiplexing_row()
×
1030
      .set_n(std::to_string(sample_idx + 1))
×
1031
      .set_barcode_1(barcodes.first)
×
1032
      .set_barcode_2(barcodes.second)
×
1033
      .set_name(sample.name())
×
1034
      .set_pct(format_percentage(
×
1035
        stats.demultiplexing->samples.at(sample_idx), input_reads, 2))
×
1036
      .set_reads(format_rough_number(output_reads))
×
1037
      .set_bp(format_rough_number(output_bp))
×
1038
      .set_length(mean_of_bp_counts(total.length_dist()))
×
1039
      .set_gc(format_percentage(total.nucleotides_gc_pos().sum(), output_bp))
×
1040
      .write(output);
×
1041
  }
1042

1043
  html_demultiplexing_tail().write(output);
×
1044
}
1045

1046
void
1047
write_html_output_section(const userconfig& config,
×
1048
                          const statistics& stats,
1049
                          std::ostream& output)
1050

1051
{
1052
  fastq_stats_vec stats_vec;
×
1053
  string_vec names;
×
1054

1055
  auto merged = std::make_shared<fastq_statistics>();
×
1056

1057
  {
×
1058
    auto output_1 = std::make_shared<fastq_statistics>();
×
1059
    auto output_2 = std::make_shared<fastq_statistics>();
×
1060
    auto singleton = std::make_shared<fastq_statistics>();
×
1061
    auto discarded = std::make_shared<fastq_statistics>();
×
1062

1063
    for (const auto& it : stats.trimming) {
×
1064
      *output_1 += *it->read_1;
×
1065
      *output_2 += *it->read_2;
×
1066
      *merged += *it->merged;
×
1067
      *singleton += *it->singleton;
×
1068
      *discarded += *it->discarded;
×
1069
    }
1070

1071
    stats_vec.push_back(output_1);
×
1072
    names.emplace_back("Output 1");
×
1073

1074
    if (config.paired_ended_mode) {
×
1075
      stats_vec.push_back(output_2);
×
1076
      names.emplace_back("Output 2");
×
1077

1078
      if (config.is_any_filtering_enabled()) {
×
1079
        stats_vec.push_back(singleton);
×
1080
        names.emplace_back("Singleton");
×
1081
      }
1082
    }
1083

1084
    if (config.is_any_filtering_enabled()) {
×
1085
      stats_vec.push_back(discarded);
×
1086
      names.emplace_back("Discarded");
×
1087
    }
1088
  }
1089

1090
  write_html_io_section(
×
1091
    config, output, "Output", std::move(stats_vec), std::move(names), merged);
×
1092
}
1093

1094
} // namespace
1095

1096
////////////////////////////////////////////////////////////////////////////////
1097

1098
bool
1099
write_html_report(const userconfig& config,
×
1100
                  const statistics& stats,
1101
                  const std::string& filename)
1102
{
1103
  if (filename == DEV_NULL) {
×
1104
    // User disabled the report
1105
    return true;
1106
  }
1107

1108
  std::ostringstream output;
×
1109

1110
  write_html_summary_section(config, stats, output);
×
1111

1112
  if (config.run_type != ar_command::demultiplex_only &&
×
1113
      config.run_type != ar_command::report_only) {
1114
    write_html_processing_section(config, stats, output);
×
1115
  }
1116

1117
  write_html_input_section(config, stats, output);
×
1118

1119
  if (config.paired_ended_mode || config.run_type == ar_command::report_only) {
×
1120
    write_html_analyses_section(config, stats, output);
×
1121
  }
1122

1123
  if (config.is_demultiplexing_enabled()) {
×
1124
    write_html_demultiplexing_section(config, stats, output);
×
1125
  }
1126

1127
  if (config.run_type != ar_command::report_only) {
×
1128
    write_html_output_section(config, stats, output);
×
1129
  }
1130

1131
  html_body_end().write(output);
×
1132

1133
  try {
×
1134
    managed_writer writer{ filename };
×
1135
    writer.write(output.str());
×
1136
    writer.close();
×
1137
  } catch (const std::ios_base::failure& error) {
×
1138
    log::error() << "Error writing JSON report to '" << filename << "':\n"
×
1139
                 << indent_lines(error.what());
×
1140
    return false;
×
1141
  }
×
1142

1143
  return true;
×
1144
}
1145

1146
} // namespace adapterremoval
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc