• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cokelaer / sequana / 6630101806

24 Oct 2023 04:59PM UTC coverage: 74.055% (-0.004%) from 74.059%
6630101806

push

github

cokelaer
Memory efficient fixup for sequana coverage

30 of 30 new or added lines in 2 files covered. (100.0%)

13250 of 17892 relevant lines covered (74.06%)

2.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.56
/sequana/scripts/coverage.py
1
#
2
#  This file is part of Sequana software
3
#
4
#  Copyright (c) 2016-2022 - Sequana Development Team
5
#
6
#  Distributed under the terms of the 3-clause BSD license.
7
#  The full license is in the LICENSE file, distributed with this software.
8
#
9
#  website: https://github.com/sequana/sequana
10
#  documentation: http://sequana.readthedocs.io
11
#
12
##############################################################################
13
""".. rubric:: Standalone application dedicated to coverage"""
3✔
14
import argparse
3✔
15
import os
3✔
16
import sys
3✔
17

18
import colorlog
3✔
19
from easydev import mkdirs, shellcmd
3✔
20
from easydev.console import purple
3✔
21

22
from sequana import sequana_data
3✔
23
from sequana.bedtools import GenomeCov, ChromosomeCov
3✔
24
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
3✔
25
from sequana.utils import config
3✔
26

27
logger = colorlog.getLogger(__name__)
3✔
28

29

30
# http://stackoverflow.com/questions/18462610/argumentparser-epilog-and-description-formatting-in-conjunction-with-argumentdef
31
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
3✔
32
    pass
3✔
33

34

35
class Min(argparse.Action):
3✔
36
    def __init__(self, min=None, *args, **kwargs):
3✔
37
        self.min = min
3✔
38
        kwargs["metavar"] = "[minimum=%d]" % (self.min)
3✔
39
        super(Min, self).__init__(*args, **kwargs)
3✔
40

41
    def __call__(self, parser, namespace, value, option_string=None):
3✔
42
        if not (self.min <= value):
×
43
            msg = "invalid choice: %r (choose value above %d)" % (value, self.min)
×
44
            raise argparse.ArgumentError(self, msg)
×
45
        setattr(namespace, self.dest, value)
×
46

47

48
class Options(argparse.ArgumentParser):
3✔
49
    def __init__(self, prog="sequana_coverage"):
3✔
50
        usage = purple(
3✔
51
            """\nWelcome to SEQUANA -- Coverage standalone
52

53
    Extract and plot coverage of one or more chromosomes/contigs in a BED or BAM
54
    file. In addition, running median used in conjunction with double thresholds
55
    extract regions of interests (low or high coverage). A reference may be
56
    provided to plot the coverage versus GC content.
57

58
    The input file should be one of the following:
59

60
    - a BED file that is a tabulated file at least 3 columns.
61
      The first column being the reference, the second is the position
62
      and the third column contains the coverage itself.
63
    - or a BAM file that is converted automatically
64
      into a BED file using the following command:
65

66
        samtools depth -aa input.bam > output.bed
67

68
    If the reference is provided, an additional plot showing the coverage versus
69
    GC content is also shown.
70

71
    Here are some examples
72

73
        sequana_coverage --input file.bed --window-median 1001
74
        sequana_coverage --input file.bam --window-median 1001 -r <REFERENCE.fa>
75

76
    An other interesting option is to provide a BED file with 4 columns. The
77
    fourth column being another coverage data created with a filter. One can
78
    create such a file only from the BAM file using samtools as follows given
79
    the original unfiltered BAM file named input.bam:
80

81
        samtools view -q 35  -o data.filtered.bam input.bam
82
        samtools depth input.bam data.filtered.bam  -aa > test.bed
83
        sequana_coverage --input test.bed --show-html
84

85
    Note that the first file is the filtered one, and the second file is the
86
    unfiltered one.
87

88
    Note for multi chromosome and genbank features: for now, you will need to call
89
    sequana_coverage for each chromosome individually since we accept only one
90
    genbank as input parameter:
91

92
        sequana_coverage --input file.bed --genbank chrom1.gbk -c 1
93

94
    Large genomes:
95
    --------------
96

97
    If your input data is large and does not fit into memory, use the --binning BIN
98
    options to average data into bin of BIN values.
99

100
    CNV cases:
101
    --------------
102

103
    By default, sequana_coverage identify events as small as 1 bin. For the CNV
104
    detection case, you may want to cluster events. the --cnv-merging DELTA option
105
    merges consecutives events whose distance is smaller that DELTA
106

107

108
        """
109
        )
110

111
        epilog = purple(
3✔
112
            """
113
----
114

115
AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
116
Documentation: http://sequana.readthedocs.io
117
Issues: http://github.com/sequana/sequana
118
        """
119
        )
120

121
        description = """DESCRIPTION:
3✔
122
        """
123

124
        super(Options, self).__init__(
3✔
125
            usage=usage, prog=prog, description=description, epilog=epilog, formatter_class=CustomFormatter
126
        )
127

128
        # options to fill the config file
129
        group = self.add_argument_group("Required argument")
3✔
130
        group.add_argument(
3✔
131
            "-i",
132
            "--input",
133
            dest="input",
134
            type=str,
135
            help=(
136
                "Input file in BED or BAM format. If a BAM file is "
137
                "provided, it will be converted locally to a BED file "
138
                "using genomecov, which must be installed."
139
            ),
140
        )
141

142
        group = self.add_argument_group("Optional biological arguments")
3✔
143
        group.add_argument(
3✔
144
            "-c",
145
            "--chromosome",
146
            dest="chromosome",
147
            type=int,
148
            default=-1,
149
            help="Chromosome number (if only one chromosome found, the single"
150
            " chromosome is chosen automatically). Otherwise all "
151
            "chromosomes are analysed. You may want to analyse only one"
152
            " in which case, use this parameter (e.g., -c 1). "
153
            "!!START AT INDEX 0 !!",
154
        )
155
        group.add_argument(
3✔
156
            "-o",
157
            "--circular",
158
            dest="circular",
159
            default=False,
160
            action="store_true",
161
            help="""If the DNA of the organism is circular (typically
162
            viruses or bacteria), set to True""",
163
        )
164

165
        group = self.add_argument_group("General")
3✔
166
        group.add_argument(
3✔
167
            "--output-directory",
168
            dest="output_directory",
169
            default="report",
170
            help="name of the output (report) directory.",
171
        )
172
        group.add_argument("-q", "--quiet", dest="verbose", default=True, action="store_false")
3✔
173
        group.add_argument(
3✔
174
            "--no-html",
175
            dest="skip_html",
176
            default=False,
177
            action="store_true",
178
            help="""Do not create any HTML reports. Save ROIs and statistics only.""",
179
        )
180
        group.add_argument(
3✔
181
            "--no-multiqc",
182
            dest="skip_multiqc",
183
            default=False,
184
            action="store_true",
185
            help="""Do not create any multiqc HTML page.""",
186
        )
187
        group.add_argument(
3✔
188
            "--debug-level", dest="logging_level", default="INFO", help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR"
189
        )
190
        group.add_argument(
3✔
191
            "--level", dest="logging_level", default="INFO", help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR"
192
        )
193
        group = self.add_argument_group("Annotation")
3✔
194
        group.add_argument(
3✔
195
            "-a", "--annotation", dest="annotation", type=str, default=None, help="a valid gff3 or genbank annotation"
196
        )
197

198
        # Group related to GC content
199
        group = self.add_argument_group("GC content related")
3✔
200
        group.add_argument(
3✔
201
            "-r",
202
            "--reference",
203
            dest="reference",
204
            type=str,
205
            default=None,
206
            help="""If available, you can provide a reference (ENA/NCBI). It
207
                 must have the same length as the one used to create the
208
                 BAM or BED file. If provided, it is used to create the
209
                 coverage versus GC content image""",
210
        )
211
        group.add_argument(
3✔
212
            "-g",
213
            "--window-gc",
214
            dest="w_gc",
215
            type=int,
216
            default=201,
217
            help="""Length of the running window to compute the GC content""",
218
        )
219
        group.add_argument(
3✔
220
            "-n", "--nlevels", dest="levels", type=int, default=3, help="""Number of levels in the contour"""
221
        )
222

223
        # group running median
224
        group = self.add_argument_group("Running Median and clustering related")
3✔
225
        group.add_argument(
3✔
226
            "-w",
227
            "--window-median",
228
            dest="w_median",
229
            type=int,
230
            help="""Length of the running median window (default 20001,
231
                recommended for bacteria).  For short genome (below 100000
232
                bases), we set this parameter to one fifth of the genome
233
                length.""",
234
            default=20001,
235
        )
236

237
        group.add_argument(
3✔
238
            "-k",
239
            "--mixture-models",
240
            dest="k",
241
            type=int,
242
            help="""Number of mixture models to use (default 2, although if sequencing
243
        depth is below 8, k is set to 1 automatically). To ignore that behaviour
244
        set k to the required value""",
245
            default=2,
246
        )
247

248
        group.add_argument(
3✔
249
            "-L",
250
            "--low-threshold",
251
            dest="low_threshold",
252
            default=None,
253
            type=float,
254
            help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
255
        )
256
        group.add_argument(
3✔
257
            "-H",
258
            "--high-threshold",
259
            dest="high_threshold",
260
            default=None,
261
            type=float,
262
            help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
263
        )
264
        group.add_argument(
3✔
265
            "-T",
266
            "--threshold",
267
            dest="threshold",
268
            default=4,
269
            type=float,
270
            help="""set lower and higher thresholds of the confidence interval.""",
271
        )
272
        group.add_argument(
3✔
273
            "-C",
274
            "--clustering-parameter",
275
            dest="double_threshold",
276
            default=0.5,
277
            type=float,
278
            help="""set lower and higher double threshold parameter (in [0,1]).
279
Do not use value close to zero. Ideally, around 0.5. lower value will tend to
280
cluster more than higher value""",
281
        )
282

283
        group = self.add_argument_group("Large data related - CNV detection")
3✔
284
        group.add_argument(
3✔
285
            "-s",
286
            "--chunk-size",
287
            dest="chunksize",
288
            type=int,
289
            default=10000000,
290
            min=1000000,
291
            action=Min,
292
            help="""Length of the chunk to be used for the analysis. """,
293
        )
294
        group.add_argument(
3✔
295
            "-B",
296
            "--binning",
297
            dest="binning",
298
            type=int,
299
            default=None,
300
            min=2,
301
            action=Min,
302
            help="""merge consecutive (non overlapping) data points, taking the
303
mean. This is useful for large genome (e.g. human). This allows a faster
304
computation, especially for CNV detection were only large windows are of
305
interest. For instance, using a binning of 50 or 100 allows the human genome to
306
be analysed.""",
307
        )
308
        group.add_argument(
3✔
309
            "--cnv-clustering",
310
            dest="cnv_clustering",
311
            default=-1,
312
            type=int,
313
            help="""Two consecutive ROIs are merged when their distance in bases
314
is below this parameter. If set to -1, not used. """,
315
        )
316

317
        # group facilities
318
        group = self.add_argument_group("Download reference")
3✔
319
        group.add_argument("--download-reference", dest="download_reference", default=None, type=str)
3✔
320
        group.add_argument("--download-genbank", dest="download_genbank", default=None, type=str)
3✔
321
        group.add_argument(
3✔
322
            "--database",
323
            dest="database",
324
            default="ENA",
325
            type=str,
326
            choices=["ENA", "EUtils"],
327
            help="Download the reference from one of these database (default ENA)",
328
        )
329

330

331
def main(args=None):
3✔
332
    if args is None:
3✔
333
        args = sys.argv[:]
×
334

335
    user_options = Options(prog="sequana")
3✔
336

337
    # If --help or no options provided, show the help
338
    if len(args) == 1:
3✔
339
        user_options.parse_args(["prog", "--help"])
×
340
    else:
341
        options = user_options.parse_args(args[1:])
3✔
342

343
    logger.setLevel(options.logging_level)
3✔
344

345
    if options.download_reference:
3✔
346
        logger.info("Downloading reference %s from %s\n" % (options.download_reference, options.database))
3✔
347

348
        from bioservices.apps import download_fasta as df
3✔
349

350
        df.download_fasta(options.download_reference, method=options.database)
3✔
351
        if options.download_genbank is None:
3✔
352
            return
3✔
353

354
    if options.download_genbank:
3✔
355
        logger.info("Downloading genbank %s from %s\n" % (options.download_genbank, options.database))
3✔
356
        from sequana.snpeff import download_fasta_and_genbank
3✔
357

358
        download_fasta_and_genbank(options.download_genbank, options.download_genbank, genbank=True, fasta=False)
3✔
359
        return
3✔
360

361
    if options.annotation:
3✔
362
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation
3✔
363

364
    logger.info(f"Reading {options.input}. This may take time ")
3✔
365

366
    # Convert BAM to BED
367
    if options.input.endswith(".bam"):
3✔
368
        bedfile = options.input.replace(".bam", ".bed")
×
369
        logger.info("Converting BAM into BED file")
×
370
        shellcmd(f"bedtools genomecov -d -ibam {option.input} > {bedfile}")
×
371
    elif options.input.endswith(".bed"):
3✔
372
        bedfile = options.input
3✔
373
    else:
374
        raise ValueError("Input file must be a BAM or BED file")
×
375

376
    # Set the thresholds
377
    if options.low_threshold is None:
3✔
378
        options.low_threshold = -options.threshold
3✔
379

380
    if options.high_threshold is None:
3✔
381
        options.high_threshold = options.threshold
3✔
382

383
    # Now we can create the instance of GenomeCoverage
384
    if options.chromosome == -1:
3✔
385
        chrom_list = []
3✔
386
    else:
387
        chrom_list = [options.chromosome]
×
388

389
    #
390
    gc = GenomeCov(
3✔
391
        bedfile,
392
        options.annotation,
393
        options.low_threshold,
394
        options.high_threshold,
395
        options.double_threshold,
396
        options.double_threshold,
397
        chunksize=options.chunksize,
398
        chromosome_list=chrom_list,
399
    )
400

401
    # if we have the reference, let us use it
402
    if options.reference:
3✔
403
        logger.info("Computing GC content")
3✔
404
        gc.compute_gc_content(options.reference, options.w_gc, options.circular)
3✔
405

406
    # Now we scan the chromosomes,
407
    if len(gc.chrom_names) == 1:
3✔
408
        logger.warning("There is only one chromosome. Selected automatically.")
3✔
409
        run_analysis(gc.chr_list[0], options)
3✔
410
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
×
411
        msg = "invalid chromosome index; must be in [1;{}]".format(len(gc.chrom_names))
×
412
        logger.error(msg)
×
413
        sys.exit(1)
×
414
    else:
415
        if options.chromosome == -1:
×
416
            chromosomes = gc.chrom_names  # take all chromosomes
×
417
        else:
418
            # For user, we start at position 1 but in python, we start at zero
419
            chromosomes = [gc.chrom_names[options.chromosome - 1]]
×
420

421
        logger.info("There are %s chromosomes/contigs." % len(gc))
×
422
        for this in gc.chrom_names:
×
423
            end = gc.positions[this]["end"]
×
424
            start = gc.positions[this]["start"]
×
425
            data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
×
426
            logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))
×
427

428
        # here we read chromosome by chromosome to save memory.
429
        for i, chrom in enumerate(chromosomes):
×
430
            logger.info("==================== analysing chrom/contig %s/%s (%s)" % (i + 1, len(gc), gc.chrom_names[i]))
×
431
            # since we read just one contig/chromosome, the chr_list contains
432
            # only one contig, so we access to it with index 0
433

434
            chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)
×
435

436
            run_analysis(chrom_data, options)
×
437
            # logging level seems to be reset to warning somewhere
438
            logger.setLevel(options.logging_level)
×
439

440
    if options.skip_multiqc is False:
3✔
441
        logger.info("Creating multiqc report")
3✔
442
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
3✔
443
        cmd = "multiqc . -m sequana_coverage -f -c {} ".format(pathtocfg)
3✔
444
        import subprocess
3✔
445

446
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
3✔
447
        proc.wait()
3✔
448
        #    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
449
        # out, err = proc.communicate()
450
        # with open("multiqc.log", "w") as fout:
451
        #    fout.write(err.decode())
452
    logger.info("Done")
3✔
453

454

455
def run_analysis(chrom, options):
3✔
456
    logger.info("Computing some metrics")
3✔
457
    if chrom.DOC < 8:
3✔
458
        logger.warning(
×
459
            "The depth of coverage is below 8. sequana_coverage is"
460
            " not optimised for such depth. You may want to "
461
            " increase the threshold to avoid too many false detections"
462
        )
463
    logger.info(chrom.__str__())
3✔
464

465
    if options.w_median > len(chrom.df) / 4:
3✔
466
        NW = int(len(chrom.df) / 4)
×
467
        if NW % 2 == 0:
×
468
            NW += 1
×
469
        logger.warning(
×
470
            "median window length is too long. \n"
471
            "    Setting the window length automatically to a fifth of\n"
472
            "    the chromosome length ({})".format(NW)
473
        )
474
    else:
475
        NW = options.w_median
3✔
476

477
    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
478
    config.output_dir = options.output_directory
3✔
479
    config.sample_name = os.path.basename(options.input).split(".")[0]
3✔
480
    #########################################################################
481

482
    # compute the running median, zscore and ROIs for each chunk summarizing the
483
    # results in a ChromosomeCovMultiChunk instane
484
    logger.info("Using running median (w=%s)" % NW)
3✔
485
    logger.info("Number of mixture models %s " % options.k)
3✔
486
    results = chrom.run(
3✔
487
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
488
    )
489

490
    # Print some info related to the fitted mixture models
491
    try:
3✔
492
        mu = results.data[0][0].as_dict()["data"]["fit_mu"]
3✔
493
        sigma = results.data[0][0].as_dict()["data"]["fit_sigma"]
3✔
494
        pi = results.data[0][0].as_dict()["data"]["fit_pi"]
3✔
495
        logger.info(
3✔
496
            "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
497
            % (round(mu, 3), round(sigma, 3), round(pi, 3))
498
        )
499
    except:
×
500
        pass
×
501

502
    # some information about the ROIs found
503
    high = chrom.thresholds.high2
3✔
504
    low = chrom.thresholds.low2
3✔
505
    logger.info(
3✔
506
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
507
            chrom.thresholds.low, chrom.thresholds.high, low, high
508
        )
509
    )
510
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instane
3✔
511
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
3✔
512
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
3✔
513
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))
3✔
514

515
    # Create directory and save ROIs
516
    directory = options.output_directory
3✔
517

518
    directory = "{}/{}".format(options.output_directory, chrom.chrom_name)
3✔
519
    mkdirs(directory)
3✔
520
    ROIs.df.to_csv("{}/rois.csv".format(directory))
3✔
521

522
    # save summary and metrics
523
    logger.info("Computing extra metrics")
3✔
524
    summary = results.get_summary(caller="sequana_coverage")
3✔
525

526
    summary.to_json("{}/sequana_summary_coverage.json".format(directory))
3✔
527
    logger.info("Evenness: {}".format(summary.data["evenness"]))
3✔
528
    logger.info("Centralness (3 sigma): {}".format(summary.data["C3"]))
3✔
529
    logger.info("Centralness (4 sigma): {}".format(summary.data["C4"]))
3✔
530

531
    if options.skip_html:
3✔
532
        return
3✔
533

534
    chrom.plot_coverage("{}/coverage.png".format(directory))
3✔
535
    logger.info("Creating report in %s. Please wait" % options.output_directory)
3✔
536

537
    if chrom._mode == "chunks":
3✔
538
        logger.warning(("This chromosome is large. " "Plots in the HTML reports are skipped"))
×
539

540
    datatable = CoverageModule.init_roi_datatable(ROIs)
3✔
541

542
    # sample name not important for the standalone
543
    config.sample_name = "subreports"
3✔
544

545
    ChromosomeCoverageModule(
3✔
546
        chrom,
547
        datatable,
548
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
549
        command=" ".join(["sequana_coverage"] + sys.argv[1:]),
550
    )
551
    # directory=options.output_directory)
552

553

554
if __name__ == "__main__":  # pragma: no cover
555
    main()
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc