• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cokelaer / sequana / 7805667516

06 Feb 2024 08:25PM UTC coverage: 75.585%. Remained the same
7805667516

push

github

cokelaer
Fix deprecated warning

2 of 2 new or added lines in 1 file covered. (100.0%)

12 existing lines in 1 file now uncovered.

13829 of 18296 relevant lines covered (75.58%)

2.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.52
/sequana/scripts/coverage.py
1
#
2
#  This file is part of Sequana software
3
#
4
#  Copyright (c) 2016-2022 - Sequana Development Team
5
#
6
#  Distributed under the terms of the 3-clause BSD license.
7
#  The full license is in the LICENSE file, distributed with this software.
8
#
9
#  website: https://github.com/sequana/sequana
10
#  documentation: http://sequana.readthedocs.io
11
#
12
##############################################################################
13
""".. rubric:: Standalone application dedicated to coverage"""
3✔
14
import argparse
3✔
15
import gc as garbage
3✔
16
import glob
3✔
17
import os
3✔
18
import subprocess
3✔
19
import sys
3✔
20

21
import colorlog
3✔
22
import rich_click as click
3✔
23
from easydev import AttrDict, shellcmd
3✔
24

25
from sequana import sequana_data
3✔
26
from sequana import version as sequana_version
3✔
27
from sequana.bedtools import ChromosomeCov, SequanaCoverage
3✔
28
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
3✔
29
from sequana.scripts.common import teardown
3✔
30
from sequana.utils import config
3✔
31

32
from .utils import CONTEXT_SETTINGS
3✔
33

34
logger = colorlog.getLogger(__name__)
3✔
35

36

37
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
3✔
38

39

40
click.rich_click.OPTION_GROUPS = {
3✔
41
    "sequana_coverage": [
42
        {
43
            "name": "Required",
44
            "options": ["--input-file"],
45
        },
46
        {
47
            "name": "Detection Algorithm Options",
48
            "options": [
49
                "--circular",
50
                "--mixture-models",
51
                "--window-median",
52
                "--window-gc",
53
                "--low-threshold",
54
                "--high-threshold",
55
                "--threshold",
56
                "--cnv-clustering",
57
                "--clustering-parameter",
58
            ],
59
        },
60
        {
61
            "name": "Modifiers",
62
            "options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
63
        },
64
        {
65
            "name": "Download utilities",
66
            "options": ["--download-reference", "--download-genbank", "--database"],
67
        },
68
        {
69
            "name": "Output files",
70
            "options": ["--no-multiqc", "--output-directory"],
71
        },
72
        {
73
            "name": "Behaviour",
74
            "options": ["--version", "--level", "--debug-level", "--help"],
75
        },
76
    ],
77
}
78

79

80
def download_reference(ctx, param, value):
3✔
81
    if value:
3✔
82
        database = ctx.params.get("database", "ENA")
×
83
        click.echo(f"Downloading reference {value} from {database}...", nl=False)
×
84
        from bioservices.apps import download_fasta as df
×
85

86
        df.download_fasta(value, method="ENA")
×
87
        click.echo("done")
×
88
        sys.exit(0)
×
89
    return value
3✔
90

91

92
def download_genbank(ctx, param, value):
3✔
93
    if value:
3✔
94
        click.echo(f"Downloading genbank {value}...", nl=False)
×
95
        from sequana.snpeff import download_fasta_and_genbank
×
96

97
        download_fasta_and_genbank(value, value, genbank=True, fasta=False)
×
98
        click.echo("done")
×
99
        sys.exit(0)
×
100
    return value
3✔
101

102

103
@click.command(context_settings=CONTEXT_SETTINGS)
3✔
104
@click.option(
3✔
105
    "--database",
106
    "database",
107
    default="ENA",
108
    show_default=True,
109
    type=click.Choice(["ENA", "EUtils"]),
110
    is_eager=True,
111
    # expose_value=False,
112
    help="Download the reference from one of these database (default ENA)",
113
)
114
@click.option(
3✔
115
    "--download-reference",
116
    "download_reference",
117
    default=None,
118
    type=click.STRING,
119
    callback=download_reference,
120
    is_eager=True,
121
    # expose_value=False,
122
    help="Provide accession number to download a reference from NCBI/ENA",
123
)
124
@click.option(
3✔
125
    "--download-genbank",
126
    "download_genbank",
127
    default=None,
128
    type=click.STRING,
129
    callback=download_genbank,
130
    is_eager=True,
131
    help="Provide accession number to download annotation from NCBI/ENA",
132
)
133
@click.option(
3✔
134
    "-i",
135
    "--input-file",
136
    "input",
137
    type=click.Path(),
138
    required=True,
139
    help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
140
)
141
@click.option(
3✔
142
    "-a",
143
    "--annotation-file",
144
    "annotation",
145
    type=click.STRING,
146
    default=None,
147
    help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
148
)
149
@click.version_option(sequana_version)
3✔
150
@click.option(
3✔
151
    "-o",
152
    "--circular",
153
    is_flag=True,
154
    help="""If the DNA of the organism is circular (typically
155
            viruses or bacteria), set to True""",
156
)
157
@click.option(
3✔
158
    "-c",
159
    "--chromosome",
160
    "chromosome",
161
    type=click.STRING,
162
    default="",
163
    show_default=True,
164
    help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
165
)
166
@click.option(
3✔
167
    "-k",
168
    "--mixture-models",
169
    "k",
170
    type=click.INT,
171
    help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
172
    default=2,
173
    show_default=True,
174
)
175
@click.option(
3✔
176
    "--debug-level",
177
    "logging_level",
178
    default="INFO",
179
    show_default=True,
180
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
181
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
182
)
183
@click.option(
3✔
184
    "--level",
185
    "logging_level",
186
    default="INFO",
187
    show_default=True,
188
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
189
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
190
)
191
@click.option(
3✔
192
    "-g",
193
    "--window-gc",
194
    "w_gc",
195
    type=click.INT,
196
    default=101,
197
    show_default=True,
198
    help="Length of the running window to compute the GC content",
199
)
200
@click.option(
3✔
201
    "--no-multiqc",
202
    "skip_multiqc",
203
    is_flag=True,
204
    show_default=True,
205
    help="Do not create any multiqc HTML page.",
206
)
207
@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")
3✔
208
@click.option(
3✔
209
    "-r",
210
    "--reference-file",
211
    "reference",
212
    type=click.STRING,
213
    default=None,
214
    help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
215
)
216
@click.option(
3✔
217
    "-w",
218
    "--window-median",
219
    "w_median",
220
    type=click.IntRange(200),
221
    help="Length of the running median window (default 20,001, recommended for bacteria).  For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
222
    default=20001,
223
    show_default=True,
224
)
225
@click.option(
3✔
226
    "-L",
227
    "--low-threshold",
228
    "low_threshold",
229
    default=-4,
230
    show_default=True,
231
    type=click.FLOAT,
232
    help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
233
)
234
@click.option(
3✔
235
    "-H",
236
    "--high-threshold",
237
    "high_threshold",
238
    default=4,
239
    show_default=True,
240
    type=click.FLOAT,
241
    help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
242
)
243
@click.option(
3✔
244
    "-T",
245
    "--threshold",
246
    "threshold",
247
    default=4,
248
    type=click.FLOAT,
249
    show_default=True,
250
    help="set lower and higher thresholds of the confidence interval. ",
251
)
252
@click.option(
3✔
253
    "-C",
254
    "--clustering-parameter",
255
    "double_threshold",
256
    default=0.5,
257
    show_default=True,
258
    type=click.FLOAT,
259
    help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
260
)
261
@click.option(
3✔
262
    "-s",
263
    "--chunk-size",
264
    "chunksize",
265
    type=click.IntRange(1000000),  # 1e6
266
    default=10000000,  # 1e7
267
    show_default=True,
268
    help="Length of the chunk to be used for the analysis. ",
269
)
270
@click.option(
3✔
271
    "-B",
272
    "--binning",
273
    "binning",
274
    type=click.IntRange(2),
275
    default=None,
276
    help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
277
)
278
@click.option(
3✔
279
    "--cnv-clustering",
280
    "cnv_clustering",
281
    default=-1,
282
    type=click.INT,
283
    help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
284
)
285
def main(**kwargs):
3✔
286
    """Welcome to SEQUANA -- Coverage standalone
287

288
        ----
289

290
       Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
291
       file. In addition, the running median used in conjunction with double thresholds
292
       extract regions of interests (ROI) for low or high coverage. A reference may be
293
       provided to plot the coverage versus GC content. An annotation file may be provided
294
       to annotate the found ROIs.
295

296
       The input file should be one of the following:
297

298
       - a BED file that is a tabulated file at least 3 columns.
299
         The first column being the reference, the second is the position
300
         and the third column contains the coverage itself.
301
       - or a BAM file that is converted automatically
302
         into a BED file using the following command:
303

304
            bedtools genomecov -d -ibam FILE.bam
305

306
       Note that this is different from another command that could perform the conversion:
307

308
           samtools depth -aa input.bam > output.bed
309

310
       If the reference is provided, an additional plot showing the coverage versus
311
       GC content is also shown.
312

313
       Here are some examples
314

315
           sequana_coverage --input-file file.bed --window-median 1001
316
           sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>
317

318
       An other interesting option is to provide a BED file with 4 columns. The
319
       fourth column being another coverage data created with a filter. One can
320
       create such a file only from the BAM file using samtools as follows given
321
       the original unfiltered BAM file named input.bam:
322

323
           samtools view -q 35  -o data.filtered.bam input.bam
324
           samtools depth input.bam data.filtered.bam  -aa > test.bed
325
           sequana_coverage --input test.bed --show-html
326

327
       Note that the first file is the filtered one, and the second file is the
328
       unfiltered one.
329

330
       Note for multi chromosome and genbank features: for now, you will need to call
331
       sequana_coverage for each chromosome individually since we accept only one
332
       genbank as input parameter:
333

334
           sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1
335

336
       Large genomes:
337
       --------------
338

339
       If your input data is large and does not fit into memory, use the --binning BIN
340
       options to average data into bin of BIN values.
341

342
       CNV cases:
343
       --------------
344

345
       By default, sequana_coverage identify events as small as 1 bin. For the CNV
346
       detection case, you may want to cluster events. the --cnv-merging DELTA option
347
       merges consecutives events whose distance is smaller that DELTA
348

349

350
    ----
351

352
    AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
353
    Documentation: http://sequana.readthedocs.io
354
    Issues: http://github.com/sequana/sequana
355
    """
356
    options = AttrDict(**kwargs)
3✔
357

358
    logger.setLevel(options.logging_level)
3✔
359

360
    if options.annotation:
3✔
361
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation
3✔
362

363
    logger.info(f"Reading {options.input}. This may take time ")
3✔
364

365
    # Convert BAM to BED
366
    if options.input.endswith(".bam"):
3✔
UNCOV
367
        bedfile = options.input.replace(".bam", ".bed")
×
UNCOV
368
        logger.info("Converting BAM into BED file")
×
369
        shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")
×
370
    elif options.input.endswith(".bed"):
3✔
371
        bedfile = options.input
3✔
372
    else:
373
        raise ValueError("Input file must be a BAM or BED file")
×
374

375
    # Set the thresholds
376
    if options.low_threshold is None:
3✔
UNCOV
377
        options.low_threshold = -options.threshold
×
378

379
    if options.high_threshold is None:
3✔
UNCOV
380
        options.high_threshold = options.threshold
×
381

382
    # Now we can create the instance of GenomeCoverage
383
    if "," in options.chromosome:
3✔
UNCOV
384
        chrom_list = options.chromosome.split(",")
×
385
    elif options.chromosome:
3✔
UNCOV
386
        chrom_list = [options.chromosome]
×
387
    else:
388
        chrom_list = []
3✔
389

390
    # initialisation (reading BED to get positions of chromosomes and chromosome names
391
    gc = SequanaCoverage(
3✔
392
        bedfile,
393
        options.annotation,
394
        options.low_threshold,
395
        options.high_threshold,
396
        options.double_threshold,
397
        options.double_threshold,
398
        chunksize=options.chunksize,
399
        chromosome_list=chrom_list,
400
        reference_file=options.reference,
401
        gc_window_size=options.w_gc,
402
        # force=True
403
    )
404

405
    # some information fo end-users,
406
    logger.info("There are %s chromosomes/contigs." % len(gc))
3✔
407
    for this in gc.chrom_names:
3✔
408
        end = gc.positions[this]["end"]
3✔
409
        start = gc.positions[this]["start"]
3✔
410
        data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
3✔
411
        logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))
3✔
412

413
    # here we read chromosome by chromosome to save memory and analyse them.
414
    N = len(gc)
3✔
415
    for i, chrom in enumerate(gc.chrom_names[::-1]):
3✔
416
        logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")
3✔
417
        # since we read just one contig/chromosome, the chr_list contains
418
        # only one contig, so we access to it with index 0
419

420
        # Performs the computation and reporting for a given chromosome
421
        # This call performs the analysis, and creates the HTML page
422
        # if HTML is created, it fills the gc._html_list variable
423
        chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)
3✔
424
        run_analysis(chrom_data, options)
3✔
425

426
        del chrom_data  # free memory for sure
3✔
427
        garbage.collect()
3✔
428

429
        # logging level seems to be reset to warning somewhere
430
        logger.setLevel(options.logging_level)
3✔
431

432
    CoverageModule(gc)
3✔
433

434
    if options.skip_multiqc is False:
3✔
435
        logger.info("Creating multiqc report")
3✔
436
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
3✔
437
        cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "
3✔
438

439
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
3✔
440
        proc.wait()
3✔
441

442
    teardown(options.output_directory)
3✔
443

444

445
def run_analysis(chrom, options):
3✔
446
    logger.info("Computing some metrics")
3✔
447

448
    if options.w_median > len(chrom) / 4:
3✔
UNCOV
449
        NW = int(len(chrom) / 4)
×
UNCOV
450
        if NW % 2 == 0:
×
UNCOV
451
            NW += 1
×
UNCOV
452
        logger.warning(
×
453
            "median window length is too long. \n"
454
            "    Setting the window length automatically to a fifth of\n"
455
            "    the chromosome length ({})".format(NW)
456
        )
457
    else:
458
        NW = options.w_median
3✔
459

460
    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
461
    config.output_dir = options.output_directory
3✔
462
    config.sample_name = os.path.basename(options.input).split(".")[0]
3✔
463

464
    directory = options.output_directory
3✔
465
    os.makedirs(directory, exist_ok=True)
3✔
466
    directory = f"{options.output_directory}/{chrom.chrom_name}"
3✔
467
    os.makedirs(directory, exist_ok=True)
3✔
468
    #########################################################################
469

470
    # compute the running median, zscore and ROIs for each chunk summarizing the
471
    # results in a ChromosomeCovMultiChunk instane
472
    logger.info("Using running median (w=%s)" % NW)
3✔
473
    logger.info("Number of mixture models %s " % options.k)
3✔
474
    results = chrom.run(
3✔
475
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
476
    )
477
    chrom.plot_coverage(f"{directory}/coverage.png")
3✔
478

479
    if chrom.DOC < 8:
3✔
UNCOV
480
        logger.warning(
×
481
            "The depth of coverage is below 8. sequana_coverage is"
482
            " not optimised for such depth. You may want to "
483
            " increase the threshold to avoid too many false detections"
484
        )
485
    logger.info(chrom.__str__())
3✔
486

487
    # save summary and metrics
488
    summary = results.get_summary(caller="sequana_coverage")
3✔
489
    summary.to_json(f"{directory}/sequana_summary_coverage.json")
3✔
490

491
    mu = summary.data.get("fit_mu", 0)
3✔
492
    sigma = summary.data.get("fit_sigma", 0)
3✔
493
    pi = summary.data.get("pi", 0)
3✔
494

495
    logger.info(
3✔
496
        "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
497
        % (round(mu, 3), round(sigma, 3), round(pi, 3))
498
    )
499

500
    # some information about the ROIs found
501
    logger.info(
3✔
502
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
503
            chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
504
        )
505
    )
506
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
3✔
507
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
3✔
508
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
3✔
509
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))
3✔
510

511
    # Create directory and save ROIs
512
    ROIs.df.to_csv(f"{directory}/rois.csv")
3✔
513

514
    logger.info(f"Creating report in {options.output_directory}. Please wait")
3✔
515

516
    if chrom._mode == "chunks":
3✔
UNCOV
517
        logger.warning("This chromosome is large. Plots in the HTML reports are skipped")
×
518

519
    ChromosomeCoverageModule(
3✔
520
        chrom,
521
        datatable=CoverageModule.init_roi_datatable(ROIs),
522
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
523
        command=" ".join(["sequana_coverage"] + sys.argv[1:]),
524
    )
525

526

527
if __name__ == "__main__":  # pragma: no cover
528
    main()
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc