• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cokelaer / sequana / 7093643877

04 Dec 2023 11:00PM UTC coverage: 73.729% (+0.3%) from 73.466%
7093643877

push

github

cokelaer
rename plot legend

0 of 1 new or added line in 1 file covered. (0.0%)

271 existing lines in 19 files now uncovered.

13398 of 18172 relevant lines covered (73.73%)

2.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.52
/sequana/scripts/coverage.py
1
#
2
#  This file is part of Sequana software
3
#
4
#  Copyright (c) 2016-2022 - Sequana Development Team
5
#
6
#  Distributed under the terms of the 3-clause BSD license.
7
#  The full license is in the LICENSE file, distributed with this software.
8
#
9
#  website: https://github.com/sequana/sequana
10
#  documentation: http://sequana.readthedocs.io
11
#
12
##############################################################################
13
""".. rubric:: Standalone application dedicated to coverage"""
3✔
14
import argparse
3✔
15
import gc as garbage
3✔
16
import glob
3✔
17
import os
3✔
18
import subprocess
3✔
19
import sys
3✔
20

21
import rich_click as click
3✔
22

23
from .utils import CONTEXT_SETTINGS
3✔
24

25

26
import colorlog
3✔
27
from easydev import AttrDict, shellcmd
3✔
28

29
from sequana import sequana_data
3✔
30
from sequana import version as sequana_version
3✔
31
from sequana.bedtools import ChromosomeCov, SequanaCoverage
3✔
32
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
3✔
33
from sequana.utils import config
3✔
34
from sequana.scripts.common import teardown
3✔
35

36
logger = colorlog.getLogger(__name__)
3✔
37

38

39
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
3✔
40

41

42
click.rich_click.OPTION_GROUPS = {
3✔
43
    "sequana_coverage": [
44
        {
45
            "name": "Required",
46
            "options": ["--input-file"],
47
        },
48
        {
49
            "name": "Detection Algorithm Options",
50
            "options": [
51
                "--circular",
52
                "--mixture-models",
53
                "--window-median",
54
                "--window-gc",
55
                "--low-threshold",
56
                "--high-threshold",
57
                "--threshold",
58
                "--cnv-clustering",
59
                "--clustering-parameter",
60
            ],
61
        },
62
        {
63
            "name": "Modifiers",
64
            "options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
65
        },
66
        {
67
            "name": "Download utilities",
68
            "options": ["--download-reference", "--download-genbank", "--database"],
69
        },
70
        {
71
            "name": "Output files",
72
            "options": ["--no-multiqc", "--output-directory"],
73
        },
74
        {
75
            "name": "Behaviour",
76
            "options": ["--version", "--level", "--debug-level", "--help"],
77
        },
78
    ],
79
}
80

81

82
def download_reference(ctx, param, value):
3✔
83
    if value:
3✔
UNCOV
84
        database = ctx.params.get("database", "ENA")
×
UNCOV
85
        click.echo(f"Downloading reference {value} from {database}...", nl=False)
×
UNCOV
86
        from bioservices.apps import download_fasta as df
×
87

88
        df.download_fasta(value, method="ENA")
×
89
        click.echo("done")
×
90
        sys.exit(0)
×
91
    return value
3✔
92

93

94
def download_genbank(ctx, param, value):
3✔
95
    if value:
3✔
UNCOV
96
        click.echo(f"Downloading genbank {value}...", nl=False)
×
UNCOV
97
        from sequana.snpeff import download_fasta_and_genbank
×
98

UNCOV
99
        download_fasta_and_genbank(value, value, genbank=True, fasta=False)
×
100
        click.echo("done")
×
101
        sys.exit(0)
×
102
    return value
3✔
103

104

105
@click.command(context_settings=CONTEXT_SETTINGS)
3✔
106
@click.option(
3✔
107
    "--database",
108
    "database",
109
    default="ENA",
110
    type=click.Choice(["ENA", "EUtils"]),
111
    is_eager=True,
112
    # expose_value=False,
113
    help="Download the reference from one of these database (default ENA)",
114
)
115
@click.option(
3✔
116
    "--download-reference",
117
    "download_reference",
118
    default=None,
119
    type=click.STRING,
120
    callback=download_reference,
121
    is_eager=True,
122
    # expose_value=False,
123
    help="Provide accession number to download a reference from NCBI/ENA",
124
)
125
@click.option(
3✔
126
    "--download-genbank",
127
    "download_genbank",
128
    default=None,
129
    type=click.STRING,
130
    callback=download_genbank,
131
    is_eager=True,
132
    help="Provide accession number to download annotation from NCBI/ENA",
133
)
134
@click.option(
3✔
135
    "-i",
136
    "--input-file",
137
    "input",
138
    type=click.Path(),
139
    required=True,
140
    help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
141
)
142
@click.option(
3✔
143
    "-a",
144
    "--annotation-file",
145
    "annotation",
146
    type=click.STRING,
147
    default=None,
148
    help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
149
)
150
@click.version_option(sequana_version)
3✔
151
@click.option(
3✔
152
    "-o",
153
    "--circular",
154
    is_flag=True,
155
    show_default=True,
156
    # help="""If the DNA of the organism is circular (typically
157
    #        viruses or bacteria), set to True"""
158
)
159
@click.option(
3✔
160
    "-c",
161
    "--chromosome",
162
    "chromosome",
163
    type=click.STRING,
164
    default="",
165
    help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
166
)
167
@click.option(
3✔
168
    "-k",
169
    "--mixture-models",
170
    "k",
171
    type=click.INT,
172
    help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
173
    default=2,
174
)
175
@click.option(
3✔
176
    "--debug-level",
177
    "logging_level",
178
    default="INFO",
179
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
180
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
181
)
182
@click.option(
3✔
183
    "--level",
184
    "logging_level",
185
    default="INFO",
186
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
187
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
188
)
189
@click.option(
3✔
190
    "-g",
191
    "--window-gc",
192
    "w_gc",
193
    type=click.INT,
194
    default=101,
195
    help="Length of the running window to compute the GC content",
196
)
197
@click.option(
3✔
198
    "--no-multiqc",
199
    "skip_multiqc",
200
    is_flag=True,
201
    show_default=True,
202
    help="Do not create any multiqc HTML page.",
203
)
204
@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")
3✔
205
@click.option(
3✔
206
    "-r",
207
    "--reference-file",
208
    "reference",
209
    type=click.STRING,
210
    default=None,
211
    help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
212
)
213
@click.option(
3✔
214
    "-w",
215
    "--window-median",
216
    "w_median",
217
    type=click.IntRange(200),
218
    help="Length of the running median window (default 20,001, recommended for bacteria).  For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
219
    default=20001,
220
)
221
@click.option(
3✔
222
    "-L",
223
    "--low-threshold",
224
    "low_threshold",
225
    default=-4,
226
    type=click.FLOAT,
227
    help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
228
)
229
@click.option(
3✔
230
    "-H",
231
    "--high-threshold",
232
    "high_threshold",
233
    default=4,
234
    type=click.FLOAT,
235
    help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
236
)
237
@click.option(
3✔
238
    "-T",
239
    "--threshold",
240
    "threshold",
241
    default=4,
242
    type=click.FLOAT,
243
    help="set lower and higher thresholds of the confidence interval. ",
244
)
245
@click.option(
3✔
246
    "-C",
247
    "--clustering-parameter",
248
    "double_threshold",
249
    default=0.5,
250
    type=click.FLOAT,
251
    help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
252
)
253
@click.option(
3✔
254
    "-s",
255
    "--chunk-size",
256
    "chunksize",
257
    type=click.IntRange(1000000),  # 1e6
258
    default=10000000,  # 1e7
259
    help="Length of the chunk to be used for the analysis. ",
260
)
261
@click.option(
3✔
262
    "-B",
263
    "--binning",
264
    "binning",
265
    type=click.IntRange(2),
266
    default=None,
267
    help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
268
)
269
@click.option(
3✔
270
    "--cnv-clustering",
271
    "cnv_clustering",
272
    default=-1,
273
    type=click.INT,
274
    help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
275
)
276
def main(**kwargs):
3✔
277
    """Welcome to SEQUANA -- Coverage standalone
278

279
        ----
280

281
       Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
282
       file. In addition, the running median used in conjunction with double thresholds
283
       extract regions of interests (ROI) for low or high coverage. A reference may be
284
       provided to plot the coverage versus GC content. An annotation file may be provided
285
       to annotate the found ROIs.
286

287
       The input file should be one of the following:
288

289
       - a BED file that is a tabulated file at least 3 columns.
290
         The first column being the reference, the second is the position
291
         and the third column contains the coverage itself.
292
       - or a BAM file that is converted automatically
293
         into a BED file using the following command:
294

295
            bedtools genomecov -d -ibam FILE.bam
296

297
       Note that this is different from another command that could perform the conversion:
298

299
           samtools depth -aa input.bam > output.bed
300

301
       If the reference is provided, an additional plot showing the coverage versus
302
       GC content is also shown.
303

304
       Here are some examples
305

306
           sequana_coverage --input-file file.bed --window-median 1001
307
           sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>
308

309
       An other interesting option is to provide a BED file with 4 columns. The
310
       fourth column being another coverage data created with a filter. One can
311
       create such a file only from the BAM file using samtools as follows given
312
       the original unfiltered BAM file named input.bam:
313

314
           samtools view -q 35  -o data.filtered.bam input.bam
315
           samtools depth input.bam data.filtered.bam  -aa > test.bed
316
           sequana_coverage --input test.bed --show-html
317

318
       Note that the first file is the filtered one, and the second file is the
319
       unfiltered one.
320

321
       Note for multi chromosome and genbank features: for now, you will need to call
322
       sequana_coverage for each chromosome individually since we accept only one
323
       genbank as input parameter:
324

325
           sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1
326

327
       Large genomes:
328
       --------------
329

330
       If your input data is large and does not fit into memory, use the --binning BIN
331
       options to average data into bin of BIN values.
332

333
       CNV cases:
334
       --------------
335

336
       By default, sequana_coverage identify events as small as 1 bin. For the CNV
337
       detection case, you may want to cluster events. the --cnv-merging DELTA option
338
       merges consecutives events whose distance is smaller that DELTA
339

340

341
    ----
342

343
    AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
344
    Documentation: http://sequana.readthedocs.io
345
    Issues: http://github.com/sequana/sequana
346
    """
347
    options = AttrDict(**kwargs)
3✔
348

349
    logger.setLevel(options.logging_level)
3✔
350

351
    if options.annotation:
3✔
352
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation
3✔
353

354
    logger.info(f"Reading {options.input}. This may take time ")
3✔
355

356
    # Convert BAM to BED
357
    if options.input.endswith(".bam"):
3✔
UNCOV
358
        bedfile = options.input.replace(".bam", ".bed")
×
UNCOV
359
        logger.info("Converting BAM into BED file")
×
UNCOV
360
        shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")
×
361
    elif options.input.endswith(".bed"):
3✔
362
        bedfile = options.input
3✔
363
    else:
UNCOV
364
        raise ValueError("Input file must be a BAM or BED file")
×
365

366
    # Set the thresholds
367
    if options.low_threshold is None:
3✔
UNCOV
368
        options.low_threshold = -options.threshold
×
369

370
    if options.high_threshold is None:
3✔
371
        options.high_threshold = options.threshold
×
372

373
    # Now we can create the instance of GenomeCoverage
374
    if "," in options.chromosome:
3✔
375
        chrom_list = options.chromosome.split(",")
×
376
    elif options.chromosome:
3✔
UNCOV
377
        chrom_list = [options.chromosome]
×
378
    else:
379
        chrom_list = []
3✔
380

381
    # initialisation (reading BED to get positions of chromosomes and chromosome names
382
    gc = SequanaCoverage(
3✔
383
        bedfile,
384
        options.annotation,
385
        options.low_threshold,
386
        options.high_threshold,
387
        options.double_threshold,
388
        options.double_threshold,
389
        chunksize=options.chunksize,
390
        chromosome_list=chrom_list,
391
        reference_file=options.reference,
392
        gc_window_size=options.w_gc,
393
    )
394

395
    # some information fo end-users,
396
    logger.info("There are %s chromosomes/contigs." % len(gc))
3✔
397
    for this in gc.chrom_names:
3✔
398
        end = gc.positions[this]["end"]
3✔
399
        start = gc.positions[this]["start"]
3✔
400
        data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
3✔
401
        logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))
3✔
402

403
    # here we read chromosome by chromosome to save memory and analyse them.
404
    N = len(gc)
3✔
405
    for i, chrom in enumerate(gc.chrom_names[::-1]):
3✔
406
        logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")
3✔
407
        # since we read just one contig/chromosome, the chr_list contains
408
        # only one contig, so we access to it with index 0
409

410
        # Performs the computation and reporting for a given chromosome
411
        # This call performs the analysis, and creates the HTML page
412
        # if HTML is creates, it fills the gc._html_list variable
413
        chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)
3✔
414
        run_analysis(chrom_data, options)
3✔
415

416
        del chrom_data  # free memory for sure
3✔
417
        garbage.collect()
3✔
418

419
        # logging level seems to be reset to warning somewhere
420
        logger.setLevel(options.logging_level)
3✔
421

422

423
    CoverageModule(gc)
3✔
424

425
    if options.skip_multiqc is False:
3✔
426
        logger.info("Creating multiqc report")
3✔
427
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
3✔
428
        cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "
3✔
429

430
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
3✔
431
        proc.wait()
3✔
432

433
    teardown(options.output_directory)
3✔
434

435

436
def run_analysis(chrom, options):
3✔
437
    logger.info("Computing some metrics")
3✔
438

439
    if options.w_median > len(chrom) / 4:
3✔
UNCOV
440
        NW = int(len(chrom) / 4)
×
UNCOV
441
        if NW % 2 == 0:
×
UNCOV
442
            NW += 1
×
UNCOV
443
        logger.warning(
×
444
            "median window length is too long. \n"
445
            "    Setting the window length automatically to a fifth of\n"
446
            "    the chromosome length ({})".format(NW)
447
        )
448
    else:
449
        NW = options.w_median
3✔
450

451
    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
452
    config.output_dir = options.output_directory
3✔
453
    config.sample_name = os.path.basename(options.input).split(".")[0]
3✔
454

455
    directory = options.output_directory
3✔
456
    os.makedirs(directory, exist_ok=True)
3✔
457
    directory = f"{options.output_directory}/{chrom.chrom_name}"
3✔
458
    os.makedirs(directory, exist_ok=True)
3✔
459
    #########################################################################
460

461
    # compute the running median, zscore and ROIs for each chunk summarizing the
462
    # results in a ChromosomeCovMultiChunk instane
463
    logger.info("Using running median (w=%s)" % NW)
3✔
464
    logger.info("Number of mixture models %s " % options.k)
3✔
465
    results = chrom.run(
3✔
466
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
467
    )
468
    chrom.plot_coverage(f"{directory}/coverage.png")
3✔
469

470
    if chrom.DOC < 8:
3✔
UNCOV
471
        logger.warning(
×
472
            "The depth of coverage is below 8. sequana_coverage is"
473
            " not optimised for such depth. You may want to "
474
            " increase the threshold to avoid too many false detections"
475
        )
476
    logger.info(chrom.__str__())
3✔
477

478
    # save summary and metrics
479
    summary = results.get_summary(caller="sequana_coverage")
3✔
480
    summary.to_json(f"{directory}/sequana_summary_coverage.json")
3✔
481

482
    mu = summary.data.get("fit_mu", 0)
3✔
483
    sigma = summary.data.get("fit_sigma", 0)
3✔
484
    pi = summary.data.get("pi", 0)
3✔
485

486
    logger.info(
3✔
487
        "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
488
        % (round(mu, 3), round(sigma, 3), round(pi, 3))
489
        )
490

491
    # some information about the ROIs found
492
    logger.info(
3✔
493
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
494
            chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
495
        )
496
    )
497
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
3✔
498
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
3✔
499
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
3✔
500
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))
3✔
501

502
    # Create directory and save ROIs
503
    ROIs.df.to_csv(f"{directory}/rois.csv")
3✔
504

505

506
    logger.info(f"Creating report in {options.output_directory}. Please wait")
3✔
507

508
    if chrom._mode == "chunks":
3✔
UNCOV
509
        logger.warning("This chromosome is large. Plots in the HTML reports are skipped")
×
510

511
    ChromosomeCoverageModule(
3✔
512
        chrom,
513
        datatable=CoverageModule.init_roi_datatable(ROIs),
514
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
515
        command=" ".join(["sequana_coverage"] + sys.argv[1:]), 
516
    )
517

518

519
if __name__ == "__main__":  # pragma: no cover
520
    main()
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc