• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cokelaer / sequana / 6851111417

13 Nov 2023 02:09PM UTC coverage: 73.547%. Remained the same
6851111417

push

github

cokelaer
Refactorised the coverage tools (sequana_coverage)

160 of 296 new or added lines in 27 files covered. (54.05%)

496 existing lines in 25 files now uncovered.

13245 of 18009 relevant lines covered (73.55%)

2.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.88
/sequana/scripts/coverage.py
1
#
2
#  This file is part of Sequana software
3
#
4
#  Copyright (c) 2016-2022 - Sequana Development Team
5
#
6
#  Distributed under the terms of the 3-clause BSD license.
7
#  The full license is in the LICENSE file, distributed with this software.
8
#
9
#  website: https://github.com/sequana/sequana
10
#  documentation: http://sequana.readthedocs.io
11
#
12
##############################################################################
13
""".. rubric:: Standalone application dedicated to coverage"""
3✔
14
import argparse
3✔
15
import gc as garbage
3✔
16
import glob
3✔
17
import os
3✔
18
import subprocess
3✔
19
import sys
3✔
20

21
import rich_click as click
3✔
22

23
click.rich_click.USE_MARKDOWN = True
3✔
24
click.rich_click.SHOW_METAVARS_COLUMN = False
3✔
25
click.rich_click.APPEND_METAVARS_HELP = True
3✔
26
click.rich_click.STYLE_ERRORS_SUGGESTION = "magenta italic"
3✔
27
click.rich_click.SHOW_ARGUMENTS = True
3✔
28

29

30
import colorlog
3✔
31
from easydev import AttrDict, shellcmd
3✔
32

33
from sequana import sequana_data
3✔
34
from sequana import version as sequana_version
3✔
35
from sequana.bedtools import ChromosomeCov, SequanaCoverage
3✔
36
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
3✔
37
from sequana.utils import config
3✔
38
from sequana.scripts.common import teardown
3✔
39

40
logger = colorlog.getLogger(__name__)
3✔
41

42

43
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
3✔
44

45

46
click.rich_click.OPTION_GROUPS = {
3✔
47
    "sequana_coverage": [
48
        {
49
            "name": "Required",
50
            "options": ["--input-file"],
51
        },
52
        {
53
            "name": "Detection Algorithm Options",
54
            "options": [
55
                "--circular",
56
                "--mixture-models",
57
                "--window-median",
58
                "--window-gc",
59
                "--low-threshold",
60
                "--high-threshold",
61
                "--threshold",
62
                "--cnv-clustering",
63
                "--clustering-parameter",
64
            ],
65
        },
66
        {
67
            "name": "Modifiers",
68
            "options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
69
        },
70
        {
71
            "name": "Download utilities",
72
            "options": ["--download-reference", "--download-genbank", "--database"],
73
        },
74
        {
75
            "name": "Output files",
76
            "options": ["--no-multiqc", "--output-directory"],
77
        },
78
        {
79
            "name": "Behaviour",
80
            "options": ["--version", "--level", "--debug-level", "--help"],
81
        },
82
    ],
83
}
84

85

86
def download_reference(ctx, param, value):
3✔
87
    if value:
3✔
88
        database = ctx.params.get("database", "ENA")
×
89
        click.echo(f"Downloading reference {value} from {database}...", nl=False)
×
90
        from bioservices.apps import download_fasta as df
×
91

92
        df.download_fasta(value, method="ENA")
×
93
        click.echo("done")
×
94
        sys.exit(0)
×
95
    return value
3✔
96

97

98
def download_genbank(ctx, param, value):
3✔
99
    if value:
3✔
100
        click.echo(f"Downloading genbank {value}...", nl=False)
×
101
        from sequana.snpeff import download_fasta_and_genbank
×
102

103
        download_fasta_and_genbank(value, value, genbank=True, fasta=False)
×
104
        click.echo("done")
×
105
        sys.exit(0)
×
106
    return value
3✔
107

108

109
@click.command(context_settings=CONTEXT_SETTINGS)
3✔
110
@click.option(
3✔
111
    "--database",
112
    "database",
113
    default="ENA",
114
    type=click.Choice(["ENA", "EUtils"]),
115
    is_eager=True,
116
    # expose_value=False,
117
    help="Download the reference from one of these database (default ENA)",
118
)
119
@click.option(
3✔
120
    "--download-reference",
121
    "download_reference",
122
    default=None,
123
    type=click.STRING,
124
    callback=download_reference,
125
    is_eager=True,
126
    # expose_value=False,
127
    help="Provide accession number to download a reference from NCBI/ENA",
128
)
129
@click.option(
3✔
130
    "--download-genbank",
131
    "download_genbank",
132
    default=None,
133
    type=click.STRING,
134
    callback=download_genbank,
135
    is_eager=True,
136
    help="Provide accession number to download annotation from NCBI/ENA",
137
)
138
@click.option(
3✔
139
    "-i",
140
    "--input-file",
141
    "input",
142
    type=click.Path(),
143
    required=True,
144
    help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
145
)
146
@click.option(
3✔
147
    "-a",
148
    "--annotation-file",
149
    "annotation",
150
    type=click.STRING,
151
    default=None,
152
    help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
153
)
154
@click.version_option(sequana_version)
3✔
155
@click.option(
3✔
156
    "-o",
157
    "--circular",
158
    is_flag=True,
159
    show_default=True,
160
    # help="""If the DNA of the organism is circular (typically
161
    #        viruses or bacteria), set to True"""
162
)
163
@click.option(
3✔
164
    "-c",
165
    "--chromosome",
166
    "chromosome",
167
    type=click.STRING,
168
    default="",
169
    help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
170
)
171
@click.option(
3✔
172
    "-k",
173
    "--mixture-models",
174
    "k",
175
    type=click.INT,
176
    help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
177
    default=2,
178
)
179
@click.option(
3✔
180
    "--debug-level",
181
    "logging_level",
182
    default="INFO",
183
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
184
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
185
)
186
@click.option(
3✔
187
    "--level",
188
    "logging_level",
189
    default="INFO",
190
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
191
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
192
)
193
@click.option(
3✔
194
    "-g",
195
    "--window-gc",
196
    "w_gc",
197
    type=click.INT,
198
    default=101,
199
    help="Length of the running window to compute the GC content",
200
)
201
@click.option(
3✔
202
    "--no-multiqc",
203
    "skip_multiqc",
204
    is_flag=True,
205
    show_default=True,
206
    help="Do not create any multiqc HTML page.",
207
)
208
#FIXME currently broken. requires complete refactoring of coverage module
209
#@click.option(
210
#    "skip_html",
211
#    is_flag=True,
212
#    show_default=False,
213
#    help="Do not create any HTML reports. Save ROIs and statistics only.",
214
#)
215
@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")
3✔
216
@click.option(
3✔
217
    "-r",
218
    "--reference-file",
219
    "reference",
220
    type=click.STRING,
221
    default=None,
222
    help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
223
)
224
@click.option(
3✔
225
    "-w",
226
    "--window-median",
227
    "w_median",
228
    type=click.IntRange(200),
229
    help="Length of the running median window (default 20,001, recommended for bacteria).  For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
230
    default=20001,
231
)
232
@click.option(
3✔
233
    "-L",
234
    "--low-threshold",
235
    "low_threshold",
236
    default=-4,
237
    type=click.FLOAT,
238
    help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
239
)
240
@click.option(
3✔
241
    "-H",
242
    "--high-threshold",
243
    "high_threshold",
244
    default=4,
245
    type=click.FLOAT,
246
    help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
247
)
248
@click.option(
3✔
249
    "-T",
250
    "--threshold",
251
    "threshold",
252
    default=4,
253
    type=click.FLOAT,
254
    help="set lower and higher thresholds of the confidence interval. ",
255
)
256
@click.option(
3✔
257
    "-C",
258
    "--clustering-parameter",
259
    "double_threshold",
260
    default=0.5,
261
    type=click.FLOAT,
262
    help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
263
)
264
@click.option(
3✔
265
    "-s",
266
    "--chunk-size",
267
    "chunksize",
268
    type=click.IntRange(1000000),  # 1e6
269
    default=10000000,  # 1e7
270
    help="Length of the chunk to be used for the analysis. ",
271
)
272
@click.option(
3✔
273
    "-B",
274
    "--binning",
275
    "binning",
276
    type=click.IntRange(2),
277
    default=None,
278
    help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
279
)
280
@click.option(
3✔
281
    "--cnv-clustering",
282
    "cnv_clustering",
283
    default=-1,
284
    type=click.INT,
285
    help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
286
)
287
def main(**kwargs):
3✔
288
    """Welcome to SEQUANA -- Coverage standalone
289

290
        ----
291

292
       Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
293
       file. In addition, the running median used in conjunction with double thresholds
294
       extract regions of interests (ROI) for low or high coverage. A reference may be
295
       provided to plot the coverage versus GC content. An annotation file may be provided
296
       to annotate the found ROIs.
297

298
       The input file should be one of the following:
299

300
       - a BED file that is a tabulated file at least 3 columns.
301
         The first column being the reference, the second is the position
302
         and the third column contains the coverage itself.
303
       - or a BAM file that is converted automatically
304
         into a BED file using the following command:
305

306
            bedtools genomecov -d -ibam FILE.bam
307

308
       Note that this is different from another command that could perform the conversion:
309

310
           samtools depth -aa input.bam > output.bed
311

312
       If the reference is provided, an additional plot showing the coverage versus
313
       GC content is also shown.
314

315
       Here are some examples
316

317
           sequana_coverage --input-file file.bed --window-median 1001
318
           sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>
319

320
       An other interesting option is to provide a BED file with 4 columns. The
321
       fourth column being another coverage data created with a filter. One can
322
       create such a file only from the BAM file using samtools as follows given
323
       the original unfiltered BAM file named input.bam:
324

325
           samtools view -q 35  -o data.filtered.bam input.bam
326
           samtools depth input.bam data.filtered.bam  -aa > test.bed
327
           sequana_coverage --input test.bed --show-html
328

329
       Note that the first file is the filtered one, and the second file is the
330
       unfiltered one.
331

332
       Note for multi chromosome and genbank features: for now, you will need to call
333
       sequana_coverage for each chromosome individually since we accept only one
334
       genbank as input parameter:
335

336
           sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1
337

338
       Large genomes:
339
       --------------
340

341
       If your input data is large and does not fit into memory, use the --binning BIN
342
       options to average data into bin of BIN values.
343

344
       CNV cases:
345
       --------------
346

347
       By default, sequana_coverage identify events as small as 1 bin. For the CNV
348
       detection case, you may want to cluster events. the --cnv-merging DELTA option
349
       merges consecutives events whose distance is smaller that DELTA
350

351

352
    ----
353

354
    AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
355
    Documentation: http://sequana.readthedocs.io
356
    Issues: http://github.com/sequana/sequana
357
    """
358
    options = AttrDict(**kwargs)
3✔
359

360
    logger.setLevel(options.logging_level)
3✔
361

362
    if options.annotation:
3✔
363
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation
3✔
364

365
    logger.info(f"Reading {options.input}. This may take time ")
3✔
366

367
    # Convert BAM to BED
368
    if options.input.endswith(".bam"):
3✔
369
        bedfile = options.input.replace(".bam", ".bed")
×
370
        logger.info("Converting BAM into BED file")
×
371
        shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")
×
372
    elif options.input.endswith(".bed"):
3✔
373
        bedfile = options.input
3✔
374
    else:
375
        raise ValueError("Input file must be a BAM or BED file")
×
376

377
    # Set the thresholds
378
    if options.low_threshold is None:
3✔
379
        options.low_threshold = -options.threshold
×
380

381
    if options.high_threshold is None:
3✔
382
        options.high_threshold = options.threshold
×
383

384
    # Now we can create the instance of GenomeCoverage
385
    if "," in options.chromosome:
3✔
386
        chrom_list = options.chromosome.split(",")
×
387
    elif options.chromosome:
3✔
388
        chrom_list = [options.chromosome]
×
389
    else:
390
        chrom_list = []
3✔
391

392
    # initialisation (reading BED to get positions of chromosomes and chromosome names
393
    gc = SequanaCoverage(
3✔
394
        bedfile,
395
        options.annotation,
396
        options.low_threshold,
397
        options.high_threshold,
398
        options.double_threshold,
399
        options.double_threshold,
400
        chunksize=options.chunksize,
401
        chromosome_list=chrom_list,
402
        reference_file=options.reference,
403
        gc_window_size=options.w_gc,
404
    )
405

406
    # some information fo end-users,
407
    logger.info("There are %s chromosomes/contigs." % len(gc))
3✔
408
    for this in gc.chrom_names:
3✔
409
        end = gc.positions[this]["end"]
3✔
410
        start = gc.positions[this]["start"]
3✔
411
        data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
3✔
412
        logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))
3✔
413

414
    # here we read chromosome by chromosome to save memory and analyse them.
415
    N = len(gc)
3✔
416
    for i, chrom in enumerate(gc.chrom_names[::-1]):
3✔
417
        logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")
3✔
418
        # since we read just one contig/chromosome, the chr_list contains
419
        # only one contig, so we access to it with index 0
420

421
        # Performs the computation and reporting for a given chromosome
422
        # This call performs the analysis, and creates the HTML page
423
        # if HTML is creates, it fills the gc._html_list variable
424
        chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)
3✔
425
        run_analysis(chrom_data, options)
3✔
426

427
        del chrom_data  # free memory for sure
3✔
428
        garbage.collect()
3✔
429

430
        # logging level seems to be reset to warning somewhere
431
        logger.setLevel(options.logging_level)
3✔
432

433
    # ugly but works for now. extra output_directory must be removed
434
    #html_list = list(glob.glob(f"{options.output_directory}/*/*.cov.html"))
435
    #html_list = [x.replace(f"{options.output_directory}/", "") for x in html_list]
436

437
    CoverageModule(gc)
3✔
438

439
    if options.skip_multiqc is False:
3✔
440
        logger.info("Creating multiqc report")
3✔
441
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
3✔
442
        cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "
3✔
443

444
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
3✔
445
        proc.wait()
3✔
446

447
    teardown(options.output_directory)
3✔
448

449

450
def run_analysis(chrom, options):
3✔
451
    logger.info("Computing some metrics")
3✔
452

453
    if options.w_median > len(chrom) / 4:
3✔
UNCOV
454
        NW = int(len(chrom) / 4)
×
UNCOV
455
        if NW % 2 == 0:
×
UNCOV
456
            NW += 1
×
457
        logger.warning(
×
458
            "median window length is too long. \n"
459
            "    Setting the window length automatically to a fifth of\n"
460
            "    the chromosome length ({})".format(NW)
461
        )
462
    else:
463
        NW = options.w_median
3✔
464

465
    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
466
    config.output_dir = options.output_directory
3✔
467
    config.sample_name = os.path.basename(options.input).split(".")[0]
3✔
468

469
    directory = options.output_directory
3✔
470
    os.makedirs(directory, exist_ok=True)
3✔
471
    directory = f"{options.output_directory}/{chrom.chrom_name}"
3✔
472
    os.makedirs(directory, exist_ok=True)
3✔
473
    #########################################################################
474

475
    # compute the running median, zscore and ROIs for each chunk summarizing the
476
    # results in a ChromosomeCovMultiChunk instane
477
    logger.info("Using running median (w=%s)" % NW)
3✔
478
    logger.info("Number of mixture models %s " % options.k)
3✔
479
    results = chrom.run(
3✔
480
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
481
    )
482

483
    if chrom.DOC < 8:
3✔
UNCOV
484
        logger.warning(
×
485
            "The depth of coverage is below 8. sequana_coverage is"
486
            " not optimised for such depth. You may want to "
487
            " increase the threshold to avoid too many false detections"
488
        )
489
    logger.info(chrom.__str__())
3✔
490

491
    # save summary and metrics
492
    summary = results.get_summary(caller="sequana_coverage")
3✔
493
    summary.to_json(f"{directory}/sequana_summary_coverage.json")
3✔
494

495
    mu = summary.data.get("fit_mu", 0)
3✔
496
    sigma = summary.data.get("fit_sigma", 0)
3✔
497
    pi = summary.data.get("pi", 0)
3✔
498

499
    logger.info(
3✔
500
        "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
501
        % (round(mu, 3), round(sigma, 3), round(pi, 3))
502
        )
503

504
    # some information about the ROIs found
505
    logger.info(
3✔
506
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
507
            chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
508
        )
509
    )
510
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
3✔
511
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
3✔
512
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
3✔
513
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))
3✔
514

515
    # Create directory and save ROIs
516
    ROIs.df.to_csv(f"{directory}/rois.csv")
3✔
517

518

519
    logger.info(f"Creating report in {options.output_directory}. Please wait")
3✔
520

521
    if chrom._mode == "chunks":
3✔
UNCOV
522
        logger.warning("This chromosome is large. Plots in the HTML reports are skipped")
×
523

524
    ChromosomeCoverageModule(
3✔
525
        chrom,
526
        datatable=CoverageModule.init_roi_datatable(ROIs),
527
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
528
        command=" ".join(["sequana_coverage"] + sys.argv[1:]), 
529
    )
530

531

532
if __name__ == "__main__":  # pragma: no cover
533
    main()
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc