6630101806

Committed 24 Oct 2023 04:59PM UTC coverage: 74.055% (-0.004%) from 74.059%

Build # 6630101806

Build Type

push

github

Committed by

cokelaer

Commit Message

Memory efficient fixup for sequana coverage

Run Details

30 of 30 new or added lines in 2 files covered. (100.0%)

13250 of 17892 relevant lines covered (74.06%)

2.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.56

/sequana/scripts/coverage.py

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
""".. rubric:: Standalone application dedicated to coverage"""
import argparse
import os
import sys

import colorlog
from easydev import mkdirs, shellcmd
from easydev.console import purple

from sequana import sequana_data
from sequana.bedtools import GenomeCov, ChromosomeCov
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
from sequana.utils import config

logger = colorlog.getLogger(__name__)


# http://stackoverflow.com/questions/18462610/argumentparser-epilog-and-description-formatting-in-conjunction-with-argumentdef
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
    pass


class Min(argparse.Action):
    def __init__(self, min=None, *args, **kwargs):
        self.min = min
        kwargs["metavar"] = "[minimum=%d]" % (self.min)
        super(Min, self).__init__(*args, **kwargs)

    def __call__(self, parser, namespace, value, option_string=None):
        if not (self.min <= value):
            msg = "invalid choice: %r (choose value above %d)" % (value, self.min)
            raise argparse.ArgumentError(self, msg)
        setattr(namespace, self.dest, value)


class Options(argparse.ArgumentParser):
    def __init__(self, prog="sequana_coverage"):
        usage = purple(
            """\nWelcome to SEQUANA -- Coverage standalone

    Extract and plot coverage of one or more chromosomes/contigs in a BED or BAM
    file. In addition, running median used in conjunction with double thresholds
    extract regions of interests (low or high coverage). A reference may be
    provided to plot the coverage versus GC content.

    The input file should be one of the following:

    - a BED file that is a tabulated file at least 3 columns.
      The first column being the reference, the second is the position
      and the third column contains the coverage itself.
    - or a BAM file that is converted automatically
      into a BED file using the following command:

        samtools depth -aa input.bam > output.bed

    If the reference is provided, an additional plot showing the coverage versus
    GC content is also shown.

    Here are some examples

        sequana_coverage --input file.bed --window-median 1001
        sequana_coverage --input file.bam --window-median 1001 -r <REFERENCE.fa>

    An other interesting option is to provide a BED file with 4 columns. The
    fourth column being another coverage data created with a filter. One can
    create such a file only from the BAM file using samtools as follows given
    the original unfiltered BAM file named input.bam:

        samtools view -q 35  -o data.filtered.bam input.bam
        samtools depth input.bam data.filtered.bam  -aa > test.bed
        sequana_coverage --input test.bed --show-html

    Note that the first file is the filtered one, and the second file is the
    unfiltered one.

    Note for multi chromosome and genbank features: for now, you will need to call
    sequana_coverage for each chromosome individually since we accept only one
    genbank as input parameter:

        sequana_coverage --input file.bed --genbank chrom1.gbk -c 1

    Large genomes:
    --------------

    If your input data is large and does not fit into memory, use the --binning BIN
    options to average data into bin of BIN values.

    CNV cases:
    --------------

    By default, sequana_coverage identify events as small as 1 bin. For the CNV
    detection case, you may want to cluster events. the --cnv-merging DELTA option
    merges consecutives events whose distance is smaller that DELTA


        """
        )

        epilog = purple(
            """
----

AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """
        )

        description = """DESCRIPTION:
        """

        super(Options, self).__init__(
            usage=usage, prog=prog, description=description, epilog=epilog, formatter_class=CustomFormatter
        )

        # options to fill the config file
        group = self.add_argument_group("Required argument")
        group.add_argument(
            "-i",
            "--input",
            dest="input",
            type=str,
            help=(
                "Input file in BED or BAM format. If a BAM file is "
                "provided, it will be converted locally to a BED file "
                "using genomecov, which must be installed."
            ),
        )

        group = self.add_argument_group("Optional biological arguments")
        group.add_argument(
            "-c",
            "--chromosome",
            dest="chromosome",
            type=int,
            default=-1,
            help="Chromosome number (if only one chromosome found, the single"
            " chromosome is chosen automatically). Otherwise all "
            "chromosomes are analysed. You may want to analyse only one"
            " in which case, use this parameter (e.g., -c 1). "
            "!!START AT INDEX 0 !!",
        )
        group.add_argument(
            "-o",
            "--circular",
            dest="circular",
            default=False,
            action="store_true",
            help="""If the DNA of the organism is circular (typically
            viruses or bacteria), set to True""",
        )

        group = self.add_argument_group("General")
        group.add_argument(
            "--output-directory",
            dest="output_directory",
            default="report",
            help="name of the output (report) directory.",
        )
        group.add_argument("-q", "--quiet", dest="verbose", default=True, action="store_false")
        group.add_argument(
            "--no-html",
            dest="skip_html",
            default=False,
            action="store_true",
            help="""Do not create any HTML reports. Save ROIs and statistics only.""",
        )
        group.add_argument(
            "--no-multiqc",
            dest="skip_multiqc",
            default=False,
            action="store_true",
            help="""Do not create any multiqc HTML page.""",
        )
        group.add_argument(
            "--debug-level", dest="logging_level", default="INFO", help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR"
        )
        group.add_argument(
            "--level", dest="logging_level", default="INFO", help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR"
        )
        group = self.add_argument_group("Annotation")
        group.add_argument(
            "-a", "--annotation", dest="annotation", type=str, default=None, help="a valid gff3 or genbank annotation"
        )

        # Group related to GC content
        group = self.add_argument_group("GC content related")
        group.add_argument(
            "-r",
            "--reference",
            dest="reference",
            type=str,
            default=None,
            help="""If available, you can provide a reference (ENA/NCBI). It
                 must have the same length as the one used to create the
                 BAM or BED file. If provided, it is used to create the
                 coverage versus GC content image""",
        )
        group.add_argument(
            "-g",
            "--window-gc",
            dest="w_gc",
            type=int,
            default=201,
            help="""Length of the running window to compute the GC content""",
        )
        group.add_argument(
            "-n", "--nlevels", dest="levels", type=int, default=3, help="""Number of levels in the contour"""
        )

        # group running median
        group = self.add_argument_group("Running Median and clustering related")
        group.add_argument(
            "-w",
            "--window-median",
            dest="w_median",
            type=int,
            help="""Length of the running median window (default 20001,
                recommended for bacteria).  For short genome (below 100000
                bases), we set this parameter to one fifth of the genome
                length.""",
            default=20001,
        )

        group.add_argument(
            "-k",
            "--mixture-models",
            dest="k",
            type=int,
            help="""Number of mixture models to use (default 2, although if sequencing
        depth is below 8, k is set to 1 automatically). To ignore that behaviour
        set k to the required value""",
            default=2,
        )

        group.add_argument(
            "-L",
            "--low-threshold",
            dest="low_threshold",
            default=None,
            type=float,
            help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
        )
        group.add_argument(
            "-H",
            "--high-threshold",
            dest="high_threshold",
            default=None,
            type=float,
            help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
        )
        group.add_argument(
            "-T",
            "--threshold",
            dest="threshold",
            default=4,
            type=float,
            help="""set lower and higher thresholds of the confidence interval.""",
        )
        group.add_argument(
            "-C",
            "--clustering-parameter",
            dest="double_threshold",
            default=0.5,
            type=float,
            help="""set lower and higher double threshold parameter (in [0,1]).
Do not use value close to zero. Ideally, around 0.5. lower value will tend to
cluster more than higher value""",
        )

        group = self.add_argument_group("Large data related - CNV detection")
        group.add_argument(
            "-s",
            "--chunk-size",
            dest="chunksize",
            type=int,
            default=10000000,
            min=1000000,
            action=Min,
            help="""Length of the chunk to be used for the analysis. """,
        )
        group.add_argument(
            "-B",
            "--binning",
            dest="binning",
            type=int,
            default=None,
            min=2,
            action=Min,
            help="""merge consecutive (non overlapping) data points, taking the
mean. This is useful for large genome (e.g. human). This allows a faster
computation, especially for CNV detection were only large windows are of
interest. For instance, using a binning of 50 or 100 allows the human genome to
be analysed.""",
        )
        group.add_argument(
            "--cnv-clustering",
            dest="cnv_clustering",
            default=-1,
            type=int,
            help="""Two consecutive ROIs are merged when their distance in bases
is below this parameter. If set to -1, not used. """,
        )

        # group facilities
        group = self.add_argument_group("Download reference")
        group.add_argument("--download-reference", dest="download_reference", default=None, type=str)
        group.add_argument("--download-genbank", dest="download_genbank", default=None, type=str)
        group.add_argument(
            "--database",
            dest="database",
            default="ENA",
            type=str,
            choices=["ENA", "EUtils"],
            help="Download the reference from one of these database (default ENA)",
        )


def main(args=None):
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.setLevel(options.logging_level)

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" % (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df

        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" % (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank

        download_fasta_and_genbank(options.download_genbank, options.download_genbank, genbank=True, fasta=False)
        return

    if options.annotation:
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation

    logger.info(f"Reading {options.input}. This may take time ")

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd(f"bedtools genomecov -d -ibam {option.input} > {bedfile}")
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]

    #
    gc = GenomeCov(
        bedfile,
        options.annotation,
        options.low_threshold,
        options.high_threshold,
        options.double_threshold,
        options.double_threshold,
        chunksize=options.chunksize,
        chromosome_list=chrom_list,
    )

    # if we have the reference, let us use it
    if options.reference:
        logger.info("Computing GC content")
        gc.compute_gc_content(options.reference, options.w_gc, options.circular)

    # Now we scan the chromosomes,
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        msg = "invalid chromosome index; must be in [1;{}]".format(len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names  # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            end = gc.positions[this]["end"]
            start = gc.positions[this]["start"]
            data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
            logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        for i, chrom in enumerate(chromosomes):
            logger.info("==================== analysing chrom/contig %s/%s (%s)" % (i + 1, len(gc), gc.chrom_names[i]))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access to it with index 0

            chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)

            run_analysis(chrom_data, options)
            # logging level seems to be reset to warning somewhere
            logger.setLevel(options.logging_level)

    if options.skip_multiqc is False:
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = "multiqc . -m sequana_coverage -f -c {} ".format(pathtocfg)
        import subprocess

        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()
        #    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # out, err = proc.communicate()
        # with open("multiqc.log", "w") as fout:
        #    fout.write(err.decode())
    logger.info("Done")


def run_analysis(chrom, options):
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning(
            "The depth of coverage is below 8. sequana_coverage is"
            " not optimised for such depth. You may want to "
            " increase the threshold to avoid too many false detections"
        )
    logger.info(chrom.__str__())

    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        logger.warning(
            "median window length is too long. \n"
            "    Setting the window length automatically to a fifth of\n"
            "    the chromosome length ({})".format(NW)
        )
    else:
        NW = options.w_median

    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split(".")[0]
    #########################################################################

    # compute the running median, zscore and ROIs for each chunk summarizing the
    # results in a ChromosomeCovMultiChunk instane
    logger.info("Using running median (w=%s)" % NW)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
    )

    # Print some info related to the fitted mixture models
    try:
        mu = results.data[0][0].as_dict()["data"]["fit_mu"]
        sigma = results.data[0][0].as_dict()["data"]["fit_sigma"]
        pi = results.data[0][0].as_dict()["data"]["fit_pi"]
        logger.info(
            "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
            % (round(mu, 3), round(sigma, 3), round(pi, 3))
        )
    except:
        pass

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info(
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
            chrom.thresholds.low, chrom.thresholds.high, low, high
        )
    )
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instane
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    directory = options.output_directory

    directory = "{}/{}".format(options.output_directory, chrom.chrom_name)
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary(caller="sequana_coverage")

    summary.to_json("{}/sequana_summary_coverage.json".format(directory))
    logger.info("Evenness: {}".format(summary.data["evenness"]))
    logger.info("Centralness (3 sigma): {}".format(summary.data["C3"]))
    logger.info("Centralness (4 sigma): {}".format(summary.data["C4"]))

    if options.skip_html:
        return

    chrom.plot_coverage("{}/coverage.png".format(directory))
    logger.info("Creating report in %s. Please wait" % options.output_directory)

    if chrom._mode == "chunks":
        logger.warning(("This chromosome is large. " "Plots in the HTML reports are skipped"))

    datatable = CoverageModule.init_roi_datatable(ROIs)

    # sample name not important for the standalone
    config.sample_name = "subreports"

    ChromosomeCoverageModule(
        chrom,
        datatable,
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
        command=" ".join(["sequana_coverage"] + sys.argv[1:]),
    )
    # directory=options.output_directory)


if __name__ == "__main__":  # pragma: no cover
    main()

1	#
2	# This file is part of Sequana software
3	#
4	# Copyright (c) 2016-2022 - Sequana Development Team
5	#
6	# Distributed under the terms of the 3-clause BSD license.
7	# The full license is in the LICENSE file, distributed with this software.
8	#
9	# website: https://github.com/sequana/sequana
10	# documentation: http://sequana.readthedocs.io
11	#
12	##############################################################################
13	""".. rubric:: Standalone application dedicated to coverage"""	3✔
14	import argparse	3✔
15	import os	3✔
16	import sys	3✔
17
18	import colorlog	3✔
19	from easydev import mkdirs, shellcmd	3✔
20	from easydev.console import purple	3✔
21
22	from sequana import sequana_data	3✔
23	from sequana.bedtools import GenomeCov, ChromosomeCov	3✔
24	from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule	3✔
25	from sequana.utils import config	3✔
26
27	logger = colorlog.getLogger(__name__)	3✔
28
29
30	# http://stackoverflow.com/questions/18462610/argumentparser-epilog-and-description-formatting-in-conjunction-with-argumentdef
31	class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):	3✔
32	pass	3✔
33
34
35	class Min(argparse.Action):	3✔
36	def __init__(self, min=None, args, *kwargs):	3✔
37	self.min = min	3✔
38	kwargs["metavar"] = "[minimum=%d]" % (self.min)	3✔
39	super(Min, self).__init__(args, *kwargs)	3✔
40
41	def __call__(self, parser, namespace, value, option_string=None):	3✔
42	if not (self.min <= value):	×
43	msg = "invalid choice: %r (choose value above %d)" % (value, self.min)	×
44	raise argparse.ArgumentError(self, msg)	×
45	setattr(namespace, self.dest, value)	×
46
47
48	class Options(argparse.ArgumentParser):	3✔
49	def __init__(self, prog="sequana_coverage"):	3✔
50	usage = purple(	3✔
51	"""\nWelcome to SEQUANA -- Coverage standalone
52
53	Extract and plot coverage of one or more chromosomes/contigs in a BED or BAM
54	file. In addition, running median used in conjunction with double thresholds
55	extract regions of interests (low or high coverage). A reference may be
56	provided to plot the coverage versus GC content.
57
58	The input file should be one of the following:
59
60	- a BED file that is a tabulated file at least 3 columns.
61	The first column being the reference, the second is the position
62	and the third column contains the coverage itself.
63	- or a BAM file that is converted automatically
64	into a BED file using the following command:
65
66	samtools depth -aa input.bam > output.bed
67
68	If the reference is provided, an additional plot showing the coverage versus
69	GC content is also shown.
70
71	Here are some examples
72
73	sequana_coverage --input file.bed --window-median 1001
74	sequana_coverage --input file.bam --window-median 1001 -r <REFERENCE.fa>
75
76	An other interesting option is to provide a BED file with 4 columns. The
77	fourth column being another coverage data created with a filter. One can
78	create such a file only from the BAM file using samtools as follows given
79	the original unfiltered BAM file named input.bam:
80
81	samtools view -q 35 -o data.filtered.bam input.bam
82	samtools depth input.bam data.filtered.bam -aa > test.bed
83	sequana_coverage --input test.bed --show-html
84
85	Note that the first file is the filtered one, and the second file is the
86	unfiltered one.
87
88	Note for multi chromosome and genbank features: for now, you will need to call
89	sequana_coverage for each chromosome individually since we accept only one
90	genbank as input parameter:
91
92	sequana_coverage --input file.bed --genbank chrom1.gbk -c 1
93
94	Large genomes:
95	--------------
96
97	If your input data is large and does not fit into memory, use the --binning BIN
98	options to average data into bin of BIN values.
99
100	CNV cases:
101	--------------
102
103	By default, sequana_coverage identify events as small as 1 bin. For the CNV
104	detection case, you may want to cluster events. the --cnv-merging DELTA option
105	merges consecutives events whose distance is smaller that DELTA
106
107
108	"""
109	)
110
111	epilog = purple(	3✔
112	"""
113	----
114
115	AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
116	Documentation: http://sequana.readthedocs.io
117	Issues: http://github.com/sequana/sequana
118	"""
119	)
120
121	description = """DESCRIPTION:	3✔
122	"""
123
124	super(Options, self).__init__(	3✔
125	usage=usage, prog=prog, description=description, epilog=epilog, formatter_class=CustomFormatter
126	)
127
128	# options to fill the config file
129	group = self.add_argument_group("Required argument")	3✔
130	group.add_argument(	3✔
131	"-i",
132	"--input",
133	dest="input",
134	type=str,
135	help=(
136	"Input file in BED or BAM format. If a BAM file is "
137	"provided, it will be converted locally to a BED file "
138	"using genomecov, which must be installed."
139	),
140	)
141
142	group = self.add_argument_group("Optional biological arguments")	3✔
143	group.add_argument(	3✔
144	"-c",
145	"--chromosome",
146	dest="chromosome",
147	type=int,
148	default=-1,
149	help="Chromosome number (if only one chromosome found, the single"
150	" chromosome is chosen automatically). Otherwise all "
151	"chromosomes are analysed. You may want to analyse only one"
152	" in which case, use this parameter (e.g., -c 1). "
153	"!!START AT INDEX 0 !!",
154	)
155	group.add_argument(	3✔
156	"-o",
157	"--circular",
158	dest="circular",
159	default=False,
160	action="store_true",
161	help="""If the DNA of the organism is circular (typically
162	viruses or bacteria), set to True""",
163	)
164
165	group = self.add_argument_group("General")	3✔
166	group.add_argument(	3✔
167	"--output-directory",
168	dest="output_directory",
169	default="report",
170	help="name of the output (report) directory.",
171	)
172	group.add_argument("-q", "--quiet", dest="verbose", default=True, action="store_false")	3✔
173	group.add_argument(	3✔
174	"--no-html",
175	dest="skip_html",
176	default=False,
177	action="store_true",
178	help="""Do not create any HTML reports. Save ROIs and statistics only.""",
179	)
180	group.add_argument(	3✔
181	"--no-multiqc",
182	dest="skip_multiqc",
183	default=False,
184	action="store_true",
185	help="""Do not create any multiqc HTML page.""",
186	)
187	group.add_argument(	3✔
188	"--debug-level", dest="logging_level", default="INFO", help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR"
189	)
190	group.add_argument(	3✔
191	"--level", dest="logging_level", default="INFO", help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR"
192	)
193	group = self.add_argument_group("Annotation")	3✔
194	group.add_argument(	3✔
195	"-a", "--annotation", dest="annotation", type=str, default=None, help="a valid gff3 or genbank annotation"
196	)
197
198	# Group related to GC content
199	group = self.add_argument_group("GC content related")	3✔
200	group.add_argument(	3✔
201	"-r",
202	"--reference",
203	dest="reference",
204	type=str,
205	default=None,
206	help="""If available, you can provide a reference (ENA/NCBI). It
207	must have the same length as the one used to create the
208	BAM or BED file. If provided, it is used to create the
209	coverage versus GC content image""",
210	)
211	group.add_argument(	3✔
212	"-g",
213	"--window-gc",
214	dest="w_gc",
215	type=int,
216	default=201,
217	help="""Length of the running window to compute the GC content""",
218	)
219	group.add_argument(	3✔
220	"-n", "--nlevels", dest="levels", type=int, default=3, help="""Number of levels in the contour"""
221	)
222
223	# group running median
224	group = self.add_argument_group("Running Median and clustering related")	3✔
225	group.add_argument(	3✔
226	"-w",
227	"--window-median",
228	dest="w_median",
229	type=int,
230	help="""Length of the running median window (default 20001,
231	recommended for bacteria). For short genome (below 100000
232	bases), we set this parameter to one fifth of the genome
233	length.""",
234	default=20001,
235	)
236
237	group.add_argument(	3✔
238	"-k",
239	"--mixture-models",
240	dest="k",
241	type=int,
242	help="""Number of mixture models to use (default 2, although if sequencing
243	depth is below 8, k is set to 1 automatically). To ignore that behaviour
244	set k to the required value""",
245	default=2,
246	)
247
248	group.add_argument(	3✔
249	"-L",
250	"--low-threshold",
251	dest="low_threshold",
252	default=None,
253	type=float,
254	help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
255	)
256	group.add_argument(	3✔
257	"-H",
258	"--high-threshold",
259	dest="high_threshold",
260	default=None,
261	type=float,
262	help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
263	)
264	group.add_argument(	3✔
265	"-T",
266	"--threshold",
267	dest="threshold",
268	default=4,
269	type=float,
270	help="""set lower and higher thresholds of the confidence interval.""",
271	)
272	group.add_argument(	3✔
273	"-C",
274	"--clustering-parameter",
275	dest="double_threshold",
276	default=0.5,
277	type=float,
278	help="""set lower and higher double threshold parameter (in [0,1]).
279	Do not use value close to zero. Ideally, around 0.5. lower value will tend to
280	cluster more than higher value""",
281	)
282
283	group = self.add_argument_group("Large data related - CNV detection")	3✔
284	group.add_argument(	3✔
285	"-s",
286	"--chunk-size",
287	dest="chunksize",
288	type=int,
289	default=10000000,
290	min=1000000,
291	action=Min,
292	help="""Length of the chunk to be used for the analysis. """,
293	)
294	group.add_argument(	3✔
295	"-B",
296	"--binning",
297	dest="binning",
298	type=int,
299	default=None,
300	min=2,
301	action=Min,
302	help="""merge consecutive (non overlapping) data points, taking the
303	mean. This is useful for large genome (e.g. human). This allows a faster
304	computation, especially for CNV detection were only large windows are of
305	interest. For instance, using a binning of 50 or 100 allows the human genome to
306	be analysed.""",
307	)
308	group.add_argument(	3✔
309	"--cnv-clustering",
310	dest="cnv_clustering",
311	default=-1,
312	type=int,
313	help="""Two consecutive ROIs are merged when their distance in bases
314	is below this parameter. If set to -1, not used. """,
315	)
316
317	# group facilities
318	group = self.add_argument_group("Download reference")	3✔
319	group.add_argument("--download-reference", dest="download_reference", default=None, type=str)	3✔
320	group.add_argument("--download-genbank", dest="download_genbank", default=None, type=str)	3✔
321	group.add_argument(	3✔
322	"--database",
323	dest="database",
324	default="ENA",
325	type=str,
326	choices=["ENA", "EUtils"],
327	help="Download the reference from one of these database (default ENA)",
328	)
329
330
331	def main(args=None):	3✔
332	if args is None:	3✔
333	args = sys.argv[:]	×
334
335	user_options = Options(prog="sequana")	3✔
336
337	# If --help or no options provided, show the help
338	if len(args) == 1:	3✔
339	user_options.parse_args(["prog", "--help"])	×
340	else:
341	options = user_options.parse_args(args[1:])	3✔
342
343	logger.setLevel(options.logging_level)	3✔
344
345	if options.download_reference:	3✔
346	logger.info("Downloading reference %s from %s\n" % (options.download_reference, options.database))	3✔
347
348	from bioservices.apps import download_fasta as df	3✔
349
350	df.download_fasta(options.download_reference, method=options.database)	3✔
351	if options.download_genbank is None:	3✔
352	return	3✔
353
354	if options.download_genbank:	3✔
355	logger.info("Downloading genbank %s from %s\n" % (options.download_genbank, options.database))	3✔
356	from sequana.snpeff import download_fasta_and_genbank	3✔
357
358	download_fasta_and_genbank(options.download_genbank, options.download_genbank, genbank=True, fasta=False)	3✔
359	return	3✔
360
361	if options.annotation:	3✔
362	assert os.path.exists(options.annotation), "%s does not exists" % options.annotation	3✔
363
364	logger.info(f"Reading {options.input}. This may take time ")	3✔
365
366	# Convert BAM to BED
367	if options.input.endswith(".bam"):	3✔
368	bedfile = options.input.replace(".bam", ".bed")	×
369	logger.info("Converting BAM into BED file")	×
370	shellcmd(f"bedtools genomecov -d -ibam {option.input} > {bedfile}")	×
371	elif options.input.endswith(".bed"):	3✔
372	bedfile = options.input	3✔
373	else:
374	raise ValueError("Input file must be a BAM or BED file")	×
375
376	# Set the thresholds
377	if options.low_threshold is None:	3✔
378	options.low_threshold = -options.threshold	3✔
379
380	if options.high_threshold is None:	3✔
381	options.high_threshold = options.threshold	3✔
382
383	# Now we can create the instance of GenomeCoverage
384	if options.chromosome == -1:	3✔
385	chrom_list = []	3✔
386	else:
387	chrom_list = [options.chromosome]	×
388
389	#
390	gc = GenomeCov(	3✔
391	bedfile,
392	options.annotation,
393	options.low_threshold,
394	options.high_threshold,
395	options.double_threshold,
396	options.double_threshold,
397	chunksize=options.chunksize,
398	chromosome_list=chrom_list,
399	)
400
401	# if we have the reference, let us use it
402	if options.reference:	3✔
403	logger.info("Computing GC content")	3✔
404	gc.compute_gc_content(options.reference, options.w_gc, options.circular)	3✔
405
406	# Now we scan the chromosomes,
407	if len(gc.chrom_names) == 1:	3✔
408	logger.warning("There is only one chromosome. Selected automatically.")	3✔
409	run_analysis(gc.chr_list[0], options)	3✔
410	elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):	×
411	msg = "invalid chromosome index; must be in [1;{}]".format(len(gc.chrom_names))	×
412	logger.error(msg)	×
413	sys.exit(1)	×
414	else:
415	if options.chromosome == -1:	×
416	chromosomes = gc.chrom_names # take all chromosomes	×
417	else:
418	# For user, we start at position 1 but in python, we start at zero
419	chromosomes = [gc.chrom_names[options.chromosome - 1]]	×
420
421	logger.info("There are %s chromosomes/contigs." % len(gc))	×
422	for this in gc.chrom_names:	×
423	end = gc.positions[this]["end"]	×
424	start = gc.positions[this]["start"]	×
425	data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)	×
426	logger.info(" {} (starting pos: {}, ending pos: {}, length: {})".format(*data))	×
427
428	# here we read chromosome by chromosome to save memory.
429	for i, chrom in enumerate(chromosomes):	×
430	logger.info("==================== analysing chrom/contig %s/%s (%s)" % (i + 1, len(gc), gc.chrom_names[i]))	×
431	# since we read just one contig/chromosome, the chr_list contains
432	# only one contig, so we access to it with index 0
433
434	chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)	×
435
436	run_analysis(chrom_data, options)	×
437	# logging level seems to be reset to warning somewhere
438	logger.setLevel(options.logging_level)	×
439
440	if options.skip_multiqc is False:	3✔
441	logger.info("Creating multiqc report")	3✔
442	pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")	3✔
443	cmd = "multiqc . -m sequana_coverage -f -c {} ".format(pathtocfg)	3✔
444	import subprocess	3✔
445
446	proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)	3✔
447	proc.wait()	3✔
448	# stdout=subprocess.PIPE, stderr=subprocess.PIPE)
449	# out, err = proc.communicate()
450	# with open("multiqc.log", "w") as fout:
451	# fout.write(err.decode())
452	logger.info("Done")	3✔
453
454
455	def run_analysis(chrom, options):	3✔
456	logger.info("Computing some metrics")	3✔
457	if chrom.DOC < 8:	3✔
458	logger.warning(	×
459	"The depth of coverage is below 8. sequana_coverage is"
460	" not optimised for such depth. You may want to "
461	" increase the threshold to avoid too many false detections"
462	)
463	logger.info(chrom.__str__())	3✔
464
465	if options.w_median > len(chrom.df) / 4:	3✔
466	NW = int(len(chrom.df) / 4)	×
467	if NW % 2 == 0:	×
468	NW += 1	×
469	logger.warning(	×
470	"median window length is too long. \n"
471	" Setting the window length automatically to a fifth of\n"
472	" the chromosome length ({})".format(NW)
473	)
474	else:
475	NW = options.w_median	3✔
476
477	######################### DEFINES OUTPUT DIR AND SAMPLE NAME ###########
478	config.output_dir = options.output_directory	3✔
479	config.sample_name = os.path.basename(options.input).split(".")[0]	3✔
480	#########################################################################
481
482	# compute the running median, zscore and ROIs for each chunk summarizing the
483	# results in a ChromosomeCovMultiChunk instane
484	logger.info("Using running median (w=%s)" % NW)	3✔
485	logger.info("Number of mixture models %s " % options.k)	3✔
486	results = chrom.run(	3✔
487	NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
488	)
489
490	# Print some info related to the fitted mixture models
491	try:	3✔
492	mu = results.data[0][0].as_dict()["data"]["fit_mu"]	3✔
493	sigma = results.data[0][0].as_dict()["data"]["fit_sigma"]	3✔
494	pi = results.data[0][0].as_dict()["data"]["fit_pi"]	3✔
495	logger.info(	3✔
496	"Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
497	% (round(mu, 3), round(sigma, 3), round(pi, 3))
498	)
499	except:	×
500	pass	×
501
502	# some information about the ROIs found
503	high = chrom.thresholds.high2	3✔
504	low = chrom.thresholds.low2	3✔
505	logger.info(	3✔
506	"Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
507	chrom.thresholds.low, chrom.thresholds.high, low, high
508	)
509	)
510	ROIs = results.get_rois() # results is a ChromosomeCovMultiChunk instane	3✔
511	logger.info("Number of ROIs found: {}".format(len(ROIs.df)))	3✔
512	logger.info(" - below average: {}".format(len(ROIs.get_low_rois())))	3✔
513	logger.info(" - above average: {}".format(len(ROIs.get_high_rois())))	3✔
514
515	# Create directory and save ROIs
516	directory = options.output_directory	3✔
517
518	directory = "{}/{}".format(options.output_directory, chrom.chrom_name)	3✔
519	mkdirs(directory)	3✔
520	ROIs.df.to_csv("{}/rois.csv".format(directory))	3✔
521
522	# save summary and metrics
523	logger.info("Computing extra metrics")	3✔
524	summary = results.get_summary(caller="sequana_coverage")	3✔
525
526	summary.to_json("{}/sequana_summary_coverage.json".format(directory))	3✔
527	logger.info("Evenness: {}".format(summary.data["evenness"]))	3✔
528	logger.info("Centralness (3 sigma): {}".format(summary.data["C3"]))	3✔
529	logger.info("Centralness (4 sigma): {}".format(summary.data["C4"]))	3✔
530
531	if options.skip_html:	3✔
532	return	3✔
533
534	chrom.plot_coverage("{}/coverage.png".format(directory))	3✔
535	logger.info("Creating report in %s. Please wait" % options.output_directory)	3✔
536
537	if chrom._mode == "chunks":	3✔
538	logger.warning(("This chromosome is large. " "Plots in the HTML reports are skipped"))	×
539
540	datatable = CoverageModule.init_roi_datatable(ROIs)	3✔
541
542	# sample name not important for the standalone
543	config.sample_name = "subreports"	3✔
544
545	ChromosomeCoverageModule(	3✔
546	chrom,
547	datatable,
548	options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
549	command=" ".join(["sequana_coverage"] + sys.argv[1:]),
550	)
551	# directory=options.output_directory)
552
553
554	if __name__ == "__main__": # pragma: no cover
555	main()

cokelaer / sequana / 6630101806

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous