• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cokelaer / sequana / 7117302237

06 Dec 2023 04:23PM UTC coverage: 75.482% (+1.8%) from 73.729%
7117302237

push

github

cokelaer
Update version

13709 of 18162 relevant lines covered (75.48%)

2.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.52
/sequana/scripts/coverage.py
1
#
2
#  This file is part of Sequana software
3
#
4
#  Copyright (c) 2016-2022 - Sequana Development Team
5
#
6
#  Distributed under the terms of the 3-clause BSD license.
7
#  The full license is in the LICENSE file, distributed with this software.
8
#
9
#  website: https://github.com/sequana/sequana
10
#  documentation: http://sequana.readthedocs.io
11
#
12
##############################################################################
13
""".. rubric:: Standalone application dedicated to coverage"""
3✔
14
import argparse
3✔
15
import gc as garbage
3✔
16
import glob
3✔
17
import os
3✔
18
import subprocess
3✔
19
import sys
3✔
20

21
import colorlog
3✔
22
import rich_click as click
3✔
23
from easydev import AttrDict, shellcmd
3✔
24

25
from sequana import sequana_data
3✔
26
from sequana import version as sequana_version
3✔
27
from sequana.bedtools import ChromosomeCov, SequanaCoverage
3✔
28
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
3✔
29
from sequana.scripts.common import teardown
3✔
30
from sequana.utils import config
3✔
31

32
from .utils import CONTEXT_SETTINGS
3✔
33

34
logger = colorlog.getLogger(__name__)
3✔
35

36

37
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
3✔
38

39

40
click.rich_click.OPTION_GROUPS = {
3✔
41
    "sequana_coverage": [
42
        {
43
            "name": "Required",
44
            "options": ["--input-file"],
45
        },
46
        {
47
            "name": "Detection Algorithm Options",
48
            "options": [
49
                "--circular",
50
                "--mixture-models",
51
                "--window-median",
52
                "--window-gc",
53
                "--low-threshold",
54
                "--high-threshold",
55
                "--threshold",
56
                "--cnv-clustering",
57
                "--clustering-parameter",
58
            ],
59
        },
60
        {
61
            "name": "Modifiers",
62
            "options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
63
        },
64
        {
65
            "name": "Download utilities",
66
            "options": ["--download-reference", "--download-genbank", "--database"],
67
        },
68
        {
69
            "name": "Output files",
70
            "options": ["--no-multiqc", "--output-directory"],
71
        },
72
        {
73
            "name": "Behaviour",
74
            "options": ["--version", "--level", "--debug-level", "--help"],
75
        },
76
    ],
77
}
78

79

80
def download_reference(ctx, param, value):
3✔
81
    if value:
3✔
82
        database = ctx.params.get("database", "ENA")
×
83
        click.echo(f"Downloading reference {value} from {database}...", nl=False)
×
84
        from bioservices.apps import download_fasta as df
×
85

86
        df.download_fasta(value, method="ENA")
×
87
        click.echo("done")
×
88
        sys.exit(0)
×
89
    return value
3✔
90

91

92
def download_genbank(ctx, param, value):
3✔
93
    if value:
3✔
94
        click.echo(f"Downloading genbank {value}...", nl=False)
×
95
        from sequana.snpeff import download_fasta_and_genbank
×
96

97
        download_fasta_and_genbank(value, value, genbank=True, fasta=False)
×
98
        click.echo("done")
×
99
        sys.exit(0)
×
100
    return value
3✔
101

102

103
@click.command(context_settings=CONTEXT_SETTINGS)
3✔
104
@click.option(
3✔
105
    "--database",
106
    "database",
107
    default="ENA",
108
    type=click.Choice(["ENA", "EUtils"]),
109
    is_eager=True,
110
    # expose_value=False,
111
    help="Download the reference from one of these database (default ENA)",
112
)
113
@click.option(
3✔
114
    "--download-reference",
115
    "download_reference",
116
    default=None,
117
    type=click.STRING,
118
    callback=download_reference,
119
    is_eager=True,
120
    # expose_value=False,
121
    help="Provide accession number to download a reference from NCBI/ENA",
122
)
123
@click.option(
3✔
124
    "--download-genbank",
125
    "download_genbank",
126
    default=None,
127
    type=click.STRING,
128
    callback=download_genbank,
129
    is_eager=True,
130
    help="Provide accession number to download annotation from NCBI/ENA",
131
)
132
@click.option(
3✔
133
    "-i",
134
    "--input-file",
135
    "input",
136
    type=click.Path(),
137
    required=True,
138
    help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
139
)
140
@click.option(
3✔
141
    "-a",
142
    "--annotation-file",
143
    "annotation",
144
    type=click.STRING,
145
    default=None,
146
    help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
147
)
148
@click.version_option(sequana_version)
3✔
149
@click.option(
3✔
150
    "-o",
151
    "--circular",
152
    is_flag=True,
153
    show_default=True,
154
    # help="""If the DNA of the organism is circular (typically
155
    #        viruses or bacteria), set to True"""
156
)
157
@click.option(
3✔
158
    "-c",
159
    "--chromosome",
160
    "chromosome",
161
    type=click.STRING,
162
    default="",
163
    help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
164
)
165
@click.option(
3✔
166
    "-k",
167
    "--mixture-models",
168
    "k",
169
    type=click.INT,
170
    help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
171
    default=2,
172
)
173
@click.option(
3✔
174
    "--debug-level",
175
    "logging_level",
176
    default="INFO",
177
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
178
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
179
)
180
@click.option(
3✔
181
    "--level",
182
    "logging_level",
183
    default="INFO",
184
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
185
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
186
)
187
@click.option(
3✔
188
    "-g",
189
    "--window-gc",
190
    "w_gc",
191
    type=click.INT,
192
    default=101,
193
    help="Length of the running window to compute the GC content",
194
)
195
@click.option(
3✔
196
    "--no-multiqc",
197
    "skip_multiqc",
198
    is_flag=True,
199
    show_default=True,
200
    help="Do not create any multiqc HTML page.",
201
)
202
@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")
3✔
203
@click.option(
3✔
204
    "-r",
205
    "--reference-file",
206
    "reference",
207
    type=click.STRING,
208
    default=None,
209
    help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
210
)
211
@click.option(
3✔
212
    "-w",
213
    "--window-median",
214
    "w_median",
215
    type=click.IntRange(200),
216
    help="Length of the running median window (default 20,001, recommended for bacteria).  For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
217
    default=20001,
218
)
219
@click.option(
3✔
220
    "-L",
221
    "--low-threshold",
222
    "low_threshold",
223
    default=-4,
224
    type=click.FLOAT,
225
    help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
226
)
227
@click.option(
3✔
228
    "-H",
229
    "--high-threshold",
230
    "high_threshold",
231
    default=4,
232
    type=click.FLOAT,
233
    help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
234
)
235
@click.option(
3✔
236
    "-T",
237
    "--threshold",
238
    "threshold",
239
    default=4,
240
    type=click.FLOAT,
241
    help="set lower and higher thresholds of the confidence interval. ",
242
)
243
@click.option(
3✔
244
    "-C",
245
    "--clustering-parameter",
246
    "double_threshold",
247
    default=0.5,
248
    type=click.FLOAT,
249
    help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
250
)
251
@click.option(
3✔
252
    "-s",
253
    "--chunk-size",
254
    "chunksize",
255
    type=click.IntRange(1000000),  # 1e6
256
    default=10000000,  # 1e7
257
    help="Length of the chunk to be used for the analysis. ",
258
)
259
@click.option(
3✔
260
    "-B",
261
    "--binning",
262
    "binning",
263
    type=click.IntRange(2),
264
    default=None,
265
    help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
266
)
267
@click.option(
3✔
268
    "--cnv-clustering",
269
    "cnv_clustering",
270
    default=-1,
271
    type=click.INT,
272
    help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
273
)
274
def main(**kwargs):
3✔
275
    """Welcome to SEQUANA -- Coverage standalone
276

277
        ----
278

279
       Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
280
       file. In addition, the running median used in conjunction with double thresholds
281
       extract regions of interests (ROI) for low or high coverage. A reference may be
282
       provided to plot the coverage versus GC content. An annotation file may be provided
283
       to annotate the found ROIs.
284

285
       The input file should be one of the following:
286

287
       - a BED file that is a tabulated file at least 3 columns.
288
         The first column being the reference, the second is the position
289
         and the third column contains the coverage itself.
290
       - or a BAM file that is converted automatically
291
         into a BED file using the following command:
292

293
            bedtools genomecov -d -ibam FILE.bam
294

295
       Note that this is different from another command that could perform the conversion:
296

297
           samtools depth -aa input.bam > output.bed
298

299
       If the reference is provided, an additional plot showing the coverage versus
300
       GC content is also shown.
301

302
       Here are some examples
303

304
           sequana_coverage --input-file file.bed --window-median 1001
305
           sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>
306

307
       An other interesting option is to provide a BED file with 4 columns. The
308
       fourth column being another coverage data created with a filter. One can
309
       create such a file only from the BAM file using samtools as follows given
310
       the original unfiltered BAM file named input.bam:
311

312
           samtools view -q 35  -o data.filtered.bam input.bam
313
           samtools depth input.bam data.filtered.bam  -aa > test.bed
314
           sequana_coverage --input test.bed --show-html
315

316
       Note that the first file is the filtered one, and the second file is the
317
       unfiltered one.
318

319
       Note for multi chromosome and genbank features: for now, you will need to call
320
       sequana_coverage for each chromosome individually since we accept only one
321
       genbank as input parameter:
322

323
           sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1
324

325
       Large genomes:
326
       --------------
327

328
       If your input data is large and does not fit into memory, use the --binning BIN
329
       options to average data into bin of BIN values.
330

331
       CNV cases:
332
       --------------
333

334
       By default, sequana_coverage identify events as small as 1 bin. For the CNV
335
       detection case, you may want to cluster events. the --cnv-merging DELTA option
336
       merges consecutives events whose distance is smaller that DELTA
337

338

339
    ----
340

341
    AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
342
    Documentation: http://sequana.readthedocs.io
343
    Issues: http://github.com/sequana/sequana
344
    """
345
    options = AttrDict(**kwargs)
3✔
346

347
    logger.setLevel(options.logging_level)
3✔
348

349
    if options.annotation:
3✔
350
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation
3✔
351

352
    logger.info(f"Reading {options.input}. This may take time ")
3✔
353

354
    # Convert BAM to BED
355
    if options.input.endswith(".bam"):
3✔
356
        bedfile = options.input.replace(".bam", ".bed")
×
357
        logger.info("Converting BAM into BED file")
×
358
        shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")
×
359
    elif options.input.endswith(".bed"):
3✔
360
        bedfile = options.input
3✔
361
    else:
362
        raise ValueError("Input file must be a BAM or BED file")
×
363

364
    # Set the thresholds
365
    if options.low_threshold is None:
3✔
366
        options.low_threshold = -options.threshold
×
367

368
    if options.high_threshold is None:
3✔
369
        options.high_threshold = options.threshold
×
370

371
    # Now we can create the instance of GenomeCoverage
372
    if "," in options.chromosome:
3✔
373
        chrom_list = options.chromosome.split(",")
×
374
    elif options.chromosome:
3✔
375
        chrom_list = [options.chromosome]
×
376
    else:
377
        chrom_list = []
3✔
378

379
    # initialisation (reading BED to get positions of chromosomes and chromosome names
380
    gc = SequanaCoverage(
3✔
381
        bedfile,
382
        options.annotation,
383
        options.low_threshold,
384
        options.high_threshold,
385
        options.double_threshold,
386
        options.double_threshold,
387
        chunksize=options.chunksize,
388
        chromosome_list=chrom_list,
389
        reference_file=options.reference,
390
        gc_window_size=options.w_gc,
391
    )
392

393
    # some information fo end-users,
394
    logger.info("There are %s chromosomes/contigs." % len(gc))
3✔
395
    for this in gc.chrom_names:
3✔
396
        end = gc.positions[this]["end"]
3✔
397
        start = gc.positions[this]["start"]
3✔
398
        data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
3✔
399
        logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))
3✔
400

401
    # here we read chromosome by chromosome to save memory and analyse them.
402
    N = len(gc)
3✔
403
    for i, chrom in enumerate(gc.chrom_names[::-1]):
3✔
404
        logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")
3✔
405
        # since we read just one contig/chromosome, the chr_list contains
406
        # only one contig, so we access to it with index 0
407

408
        # Performs the computation and reporting for a given chromosome
409
        # This call performs the analysis, and creates the HTML page
410
        # if HTML is creates, it fills the gc._html_list variable
411
        chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)
3✔
412
        run_analysis(chrom_data, options)
3✔
413

414
        del chrom_data  # free memory for sure
3✔
415
        garbage.collect()
3✔
416

417
        # logging level seems to be reset to warning somewhere
418
        logger.setLevel(options.logging_level)
3✔
419

420
    CoverageModule(gc)
3✔
421

422
    if options.skip_multiqc is False:
3✔
423
        logger.info("Creating multiqc report")
3✔
424
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
3✔
425
        cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "
3✔
426

427
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
3✔
428
        proc.wait()
3✔
429

430
    teardown(options.output_directory)
3✔
431

432

433
def run_analysis(chrom, options):
3✔
434
    logger.info("Computing some metrics")
3✔
435

436
    if options.w_median > len(chrom) / 4:
3✔
437
        NW = int(len(chrom) / 4)
×
438
        if NW % 2 == 0:
×
439
            NW += 1
×
440
        logger.warning(
×
441
            "median window length is too long. \n"
442
            "    Setting the window length automatically to a fifth of\n"
443
            "    the chromosome length ({})".format(NW)
444
        )
445
    else:
446
        NW = options.w_median
3✔
447

448
    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
449
    config.output_dir = options.output_directory
3✔
450
    config.sample_name = os.path.basename(options.input).split(".")[0]
3✔
451

452
    directory = options.output_directory
3✔
453
    os.makedirs(directory, exist_ok=True)
3✔
454
    directory = f"{options.output_directory}/{chrom.chrom_name}"
3✔
455
    os.makedirs(directory, exist_ok=True)
3✔
456
    #########################################################################
457

458
    # compute the running median, zscore and ROIs for each chunk summarizing the
459
    # results in a ChromosomeCovMultiChunk instane
460
    logger.info("Using running median (w=%s)" % NW)
3✔
461
    logger.info("Number of mixture models %s " % options.k)
3✔
462
    results = chrom.run(
3✔
463
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
464
    )
465
    chrom.plot_coverage(f"{directory}/coverage.png")
3✔
466

467
    if chrom.DOC < 8:
3✔
468
        logger.warning(
×
469
            "The depth of coverage is below 8. sequana_coverage is"
470
            " not optimised for such depth. You may want to "
471
            " increase the threshold to avoid too many false detections"
472
        )
473
    logger.info(chrom.__str__())
3✔
474

475
    # save summary and metrics
476
    summary = results.get_summary(caller="sequana_coverage")
3✔
477
    summary.to_json(f"{directory}/sequana_summary_coverage.json")
3✔
478

479
    mu = summary.data.get("fit_mu", 0)
3✔
480
    sigma = summary.data.get("fit_sigma", 0)
3✔
481
    pi = summary.data.get("pi", 0)
3✔
482

483
    logger.info(
3✔
484
        "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
485
        % (round(mu, 3), round(sigma, 3), round(pi, 3))
486
    )
487

488
    # some information about the ROIs found
489
    logger.info(
3✔
490
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
491
            chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
492
        )
493
    )
494
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
3✔
495
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
3✔
496
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
3✔
497
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))
3✔
498

499
    # Create directory and save ROIs
500
    ROIs.df.to_csv(f"{directory}/rois.csv")
3✔
501

502
    logger.info(f"Creating report in {options.output_directory}. Please wait")
3✔
503

504
    if chrom._mode == "chunks":
3✔
505
        logger.warning("This chromosome is large. Plots in the HTML reports are skipped")
×
506

507
    ChromosomeCoverageModule(
3✔
508
        chrom,
509
        datatable=CoverageModule.init_roi_datatable(ROIs),
510
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
511
        command=" ".join(["sequana_coverage"] + sys.argv[1:]),
512
    )
513

514

515
if __name__ == "__main__":  # pragma: no cover
516
    main()
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc