7805667516

Committed 06 Feb 2024 08:25PM UTC coverage: 75.585%. Remained the same

Build # 7805667516

Build Type

push

github

Committed by

cokelaer

Commit Message

Fix deprecated warning

Run Details

2 of 2 new or added lines in 1 file covered. (100.0%)

12 existing lines in 1 file now uncovered.

13829 of 18296 relevant lines covered (75.58%)

2.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.52

/sequana/scripts/coverage.py

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
""".. rubric:: Standalone application dedicated to coverage"""
import argparse
import gc as garbage
import glob
import os
import subprocess
import sys

import colorlog
import rich_click as click
from easydev import AttrDict, shellcmd

from sequana import sequana_data
from sequana import version as sequana_version
from sequana.bedtools import ChromosomeCov, SequanaCoverage
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
from sequana.scripts.common import teardown
from sequana.utils import config

from .utils import CONTEXT_SETTINGS

logger = colorlog.getLogger(__name__)


CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


click.rich_click.OPTION_GROUPS = {
    "sequana_coverage": [
        {
            "name": "Required",
            "options": ["--input-file"],
        },
        {
            "name": "Detection Algorithm Options",
            "options": [
                "--circular",
                "--mixture-models",
                "--window-median",
                "--window-gc",
                "--low-threshold",
                "--high-threshold",
                "--threshold",
                "--cnv-clustering",
                "--clustering-parameter",
            ],
        },
        {
            "name": "Modifiers",
            "options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
        },
        {
            "name": "Download utilities",
            "options": ["--download-reference", "--download-genbank", "--database"],
        },
        {
            "name": "Output files",
            "options": ["--no-multiqc", "--output-directory"],
        },
        {
            "name": "Behaviour",
            "options": ["--version", "--level", "--debug-level", "--help"],
        },
    ],
}


def download_reference(ctx, param, value):
    if value:
        database = ctx.params.get("database", "ENA")
        click.echo(f"Downloading reference {value} from {database}...", nl=False)
        from bioservices.apps import download_fasta as df

        df.download_fasta(value, method="ENA")
        click.echo("done")
        sys.exit(0)
    return value


def download_genbank(ctx, param, value):
    if value:
        click.echo(f"Downloading genbank {value}...", nl=False)
        from sequana.snpeff import download_fasta_and_genbank

        download_fasta_and_genbank(value, value, genbank=True, fasta=False)
        click.echo("done")
        sys.exit(0)
    return value


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option(
    "--database",
    "database",
    default="ENA",
    show_default=True,
    type=click.Choice(["ENA", "EUtils"]),
    is_eager=True,
    # expose_value=False,
    help="Download the reference from one of these database (default ENA)",
)
@click.option(
    "--download-reference",
    "download_reference",
    default=None,
    type=click.STRING,
    callback=download_reference,
    is_eager=True,
    # expose_value=False,
    help="Provide accession number to download a reference from NCBI/ENA",
)
@click.option(
    "--download-genbank",
    "download_genbank",
    default=None,
    type=click.STRING,
    callback=download_genbank,
    is_eager=True,
    help="Provide accession number to download annotation from NCBI/ENA",
)
@click.option(
    "-i",
    "--input-file",
    "input",
    type=click.Path(),
    required=True,
    help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
)
@click.option(
    "-a",
    "--annotation-file",
    "annotation",
    type=click.STRING,
    default=None,
    help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
)
@click.version_option(sequana_version)
@click.option(
    "-o",
    "--circular",
    is_flag=True,
    help="""If the DNA of the organism is circular (typically
            viruses or bacteria), set to True""",
)
@click.option(
    "-c",
    "--chromosome",
    "chromosome",
    type=click.STRING,
    default="",
    show_default=True,
    help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
)
@click.option(
    "-k",
    "--mixture-models",
    "k",
    type=click.INT,
    help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
    default=2,
    show_default=True,
)
@click.option(
    "--debug-level",
    "logging_level",
    default="INFO",
    show_default=True,
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
)
@click.option(
    "--level",
    "logging_level",
    default="INFO",
    show_default=True,
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
)
@click.option(
    "-g",
    "--window-gc",
    "w_gc",
    type=click.INT,
    default=101,
    show_default=True,
    help="Length of the running window to compute the GC content",
)
@click.option(
    "--no-multiqc",
    "skip_multiqc",
    is_flag=True,
    show_default=True,
    help="Do not create any multiqc HTML page.",
)
@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")
@click.option(
    "-r",
    "--reference-file",
    "reference",
    type=click.STRING,
    default=None,
    help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
)
@click.option(
    "-w",
    "--window-median",
    "w_median",
    type=click.IntRange(200),
    help="Length of the running median window (default 20,001, recommended for bacteria).  For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
    default=20001,
    show_default=True,
)
@click.option(
    "-L",
    "--low-threshold",
    "low_threshold",
    default=-4,
    show_default=True,
    type=click.FLOAT,
    help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
)
@click.option(
    "-H",
    "--high-threshold",
    "high_threshold",
    default=4,
    show_default=True,
    type=click.FLOAT,
    help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
)
@click.option(
    "-T",
    "--threshold",
    "threshold",
    default=4,
    type=click.FLOAT,
    show_default=True,
    help="set lower and higher thresholds of the confidence interval. ",
)
@click.option(
    "-C",
    "--clustering-parameter",
    "double_threshold",
    default=0.5,
    show_default=True,
    type=click.FLOAT,
    help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
)
@click.option(
    "-s",
    "--chunk-size",
    "chunksize",
    type=click.IntRange(1000000),  # 1e6
    default=10000000,  # 1e7
    show_default=True,
    help="Length of the chunk to be used for the analysis. ",
)
@click.option(
    "-B",
    "--binning",
    "binning",
    type=click.IntRange(2),
    default=None,
    help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
)
@click.option(
    "--cnv-clustering",
    "cnv_clustering",
    default=-1,
    type=click.INT,
    help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
)
def main(**kwargs):
    """Welcome to SEQUANA -- Coverage standalone

        ----

       Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
       file. In addition, the running median used in conjunction with double thresholds
       extract regions of interests (ROI) for low or high coverage. A reference may be
       provided to plot the coverage versus GC content. An annotation file may be provided
       to annotate the found ROIs.

       The input file should be one of the following:

       - a BED file that is a tabulated file at least 3 columns.
         The first column being the reference, the second is the position
         and the third column contains the coverage itself.
       - or a BAM file that is converted automatically
         into a BED file using the following command:

            bedtools genomecov -d -ibam FILE.bam

       Note that this is different from another command that could perform the conversion:

           samtools depth -aa input.bam > output.bed

       If the reference is provided, an additional plot showing the coverage versus
       GC content is also shown.

       Here are some examples

           sequana_coverage --input-file file.bed --window-median 1001
           sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>

       An other interesting option is to provide a BED file with 4 columns. The
       fourth column being another coverage data created with a filter. One can
       create such a file only from the BAM file using samtools as follows given
       the original unfiltered BAM file named input.bam:

           samtools view -q 35  -o data.filtered.bam input.bam
           samtools depth input.bam data.filtered.bam  -aa > test.bed
           sequana_coverage --input test.bed --show-html

       Note that the first file is the filtered one, and the second file is the
       unfiltered one.

       Note for multi chromosome and genbank features: for now, you will need to call
       sequana_coverage for each chromosome individually since we accept only one
       genbank as input parameter:

           sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1

       Large genomes:
       --------------

       If your input data is large and does not fit into memory, use the --binning BIN
       options to average data into bin of BIN values.

       CNV cases:
       --------------

       By default, sequana_coverage identify events as small as 1 bin. For the CNV
       detection case, you may want to cluster events. the --cnv-merging DELTA option
       merges consecutives events whose distance is smaller that DELTA


    ----

    AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
    Documentation: http://sequana.readthedocs.io
    Issues: http://github.com/sequana/sequana
    """
    options = AttrDict(**kwargs)

    logger.setLevel(options.logging_level)

    if options.annotation:
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation

    logger.info(f"Reading {options.input}. This may take time ")

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # Now we can create the instance of GenomeCoverage
    if "," in options.chromosome:
        chrom_list = options.chromosome.split(",")
    elif options.chromosome:
        chrom_list = [options.chromosome]
    else:
        chrom_list = []

    # initialisation (reading BED to get positions of chromosomes and chromosome names
    gc = SequanaCoverage(
        bedfile,
        options.annotation,
        options.low_threshold,
        options.high_threshold,
        options.double_threshold,
        options.double_threshold,
        chunksize=options.chunksize,
        chromosome_list=chrom_list,
        reference_file=options.reference,
        gc_window_size=options.w_gc,
        # force=True
    )

    # some information fo end-users,
    logger.info("There are %s chromosomes/contigs." % len(gc))
    for this in gc.chrom_names:
        end = gc.positions[this]["end"]
        start = gc.positions[this]["start"]
        data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
        logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))

    # here we read chromosome by chromosome to save memory and analyse them.
    N = len(gc)
    for i, chrom in enumerate(gc.chrom_names[::-1]):
        logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")
        # since we read just one contig/chromosome, the chr_list contains
        # only one contig, so we access to it with index 0

        # Performs the computation and reporting for a given chromosome
        # This call performs the analysis, and creates the HTML page
        # if HTML is created, it fills the gc._html_list variable
        chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)
        run_analysis(chrom_data, options)

        del chrom_data  # free memory for sure
        garbage.collect()

        # logging level seems to be reset to warning somewhere
        logger.setLevel(options.logging_level)

    CoverageModule(gc)

    if options.skip_multiqc is False:
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "

        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()

    teardown(options.output_directory)


def run_analysis(chrom, options):
    logger.info("Computing some metrics")

    if options.w_median > len(chrom) / 4:
        NW = int(len(chrom) / 4)
        if NW % 2 == 0:
            NW += 1
        logger.warning(
            "median window length is too long. \n"
            "    Setting the window length automatically to a fifth of\n"
            "    the chromosome length ({})".format(NW)
        )
    else:
        NW = options.w_median

    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split(".")[0]

    directory = options.output_directory
    os.makedirs(directory, exist_ok=True)
    directory = f"{options.output_directory}/{chrom.chrom_name}"
    os.makedirs(directory, exist_ok=True)
    #########################################################################

    # compute the running median, zscore and ROIs for each chunk summarizing the
    # results in a ChromosomeCovMultiChunk instane
    logger.info("Using running median (w=%s)" % NW)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
    )
    chrom.plot_coverage(f"{directory}/coverage.png")

    if chrom.DOC < 8:
        logger.warning(
            "The depth of coverage is below 8. sequana_coverage is"
            " not optimised for such depth. You may want to "
            " increase the threshold to avoid too many false detections"
        )
    logger.info(chrom.__str__())

    # save summary and metrics
    summary = results.get_summary(caller="sequana_coverage")
    summary.to_json(f"{directory}/sequana_summary_coverage.json")

    mu = summary.data.get("fit_mu", 0)
    sigma = summary.data.get("fit_sigma", 0)
    pi = summary.data.get("pi", 0)

    logger.info(
        "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
        % (round(mu, 3), round(sigma, 3), round(pi, 3))
    )

    # some information about the ROIs found
    logger.info(
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
            chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
        )
    )
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    ROIs.df.to_csv(f"{directory}/rois.csv")

    logger.info(f"Creating report in {options.output_directory}. Please wait")

    if chrom._mode == "chunks":
        logger.warning("This chromosome is large. Plots in the HTML reports are skipped")

    ChromosomeCoverageModule(
        chrom,
        datatable=CoverageModule.init_roi_datatable(ROIs),
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
        command=" ".join(["sequana_coverage"] + sys.argv[1:]),
    )


if __name__ == "__main__":  # pragma: no cover
    main()

1	#
2	# This file is part of Sequana software
3	#
4	# Copyright (c) 2016-2022 - Sequana Development Team
5	#
6	# Distributed under the terms of the 3-clause BSD license.
7	# The full license is in the LICENSE file, distributed with this software.
8	#
9	# website: https://github.com/sequana/sequana
10	# documentation: http://sequana.readthedocs.io
11	#
12	##############################################################################
13	""".. rubric:: Standalone application dedicated to coverage"""	3✔
14	import argparse	3✔
15	import gc as garbage	3✔
16	import glob	3✔
17	import os	3✔
18	import subprocess	3✔
19	import sys	3✔
20
21	import colorlog	3✔
22	import rich_click as click	3✔
23	from easydev import AttrDict, shellcmd	3✔
24
25	from sequana import sequana_data	3✔
26	from sequana import version as sequana_version	3✔
27	from sequana.bedtools import ChromosomeCov, SequanaCoverage	3✔
28	from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule	3✔
29	from sequana.scripts.common import teardown	3✔
30	from sequana.utils import config	3✔
31
32	from .utils import CONTEXT_SETTINGS	3✔
33
34	logger = colorlog.getLogger(__name__)	3✔
35
36
37	CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])	3✔
38
39
40	click.rich_click.OPTION_GROUPS = {	3✔
41	"sequana_coverage": [
42	{
43	"name": "Required",
44	"options": ["--input-file"],
45	},
46	{
47	"name": "Detection Algorithm Options",
48	"options": [
49	"--circular",
50	"--mixture-models",
51	"--window-median",
52	"--window-gc",
53	"--low-threshold",
54	"--high-threshold",
55	"--threshold",
56	"--cnv-clustering",
57	"--clustering-parameter",
58	],
59	},
60	{
61	"name": "Modifiers",
62	"options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
63	},
64	{
65	"name": "Download utilities",
66	"options": ["--download-reference", "--download-genbank", "--database"],
67	},
68	{
69	"name": "Output files",
70	"options": ["--no-multiqc", "--output-directory"],
71	},
72	{
73	"name": "Behaviour",
74	"options": ["--version", "--level", "--debug-level", "--help"],
75	},
76	],
77	}
78
79
80	def download_reference(ctx, param, value):	3✔
81	if value:	3✔
82	database = ctx.params.get("database", "ENA")	×
83	click.echo(f"Downloading reference {value} from {database}...", nl=False)	×
84	from bioservices.apps import download_fasta as df	×
85
86	df.download_fasta(value, method="ENA")	×
87	click.echo("done")	×
88	sys.exit(0)	×
89	return value	3✔
90
91
92	def download_genbank(ctx, param, value):	3✔
93	if value:	3✔
94	click.echo(f"Downloading genbank {value}...", nl=False)	×
95	from sequana.snpeff import download_fasta_and_genbank	×
96
97	download_fasta_and_genbank(value, value, genbank=True, fasta=False)	×
98	click.echo("done")	×
99	sys.exit(0)	×
100	return value	3✔
101
102
103	@click.command(context_settings=CONTEXT_SETTINGS)	3✔
104	@click.option(	3✔
105	"--database",
106	"database",
107	default="ENA",
108	show_default=True,
109	type=click.Choice(["ENA", "EUtils"]),
110	is_eager=True,
111	# expose_value=False,
112	help="Download the reference from one of these database (default ENA)",
113	)
114	@click.option(	3✔
115	"--download-reference",
116	"download_reference",
117	default=None,
118	type=click.STRING,
119	callback=download_reference,
120	is_eager=True,
121	# expose_value=False,
122	help="Provide accession number to download a reference from NCBI/ENA",
123	)
124	@click.option(	3✔
125	"--download-genbank",
126	"download_genbank",
127	default=None,
128	type=click.STRING,
129	callback=download_genbank,
130	is_eager=True,
131	help="Provide accession number to download annotation from NCBI/ENA",
132	)
133	@click.option(	3✔
134	"-i",
135	"--input-file",
136	"input",
137	type=click.Path(),
138	required=True,
139	help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
140	)
141	@click.option(	3✔
142	"-a",
143	"--annotation-file",
144	"annotation",
145	type=click.STRING,
146	default=None,
147	help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
148	)
149	@click.version_option(sequana_version)	3✔
150	@click.option(	3✔
151	"-o",
152	"--circular",
153	is_flag=True,
154	help="""If the DNA of the organism is circular (typically
155	viruses or bacteria), set to True""",
156	)
157	@click.option(	3✔
158	"-c",
159	"--chromosome",
160	"chromosome",
161	type=click.STRING,
162	default="",
163	show_default=True,
164	help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
165	)
166	@click.option(	3✔
167	"-k",
168	"--mixture-models",
169	"k",
170	type=click.INT,
171	help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
172	default=2,
173	show_default=True,
174	)
175	@click.option(	3✔
176	"--debug-level",
177	"logging_level",
178	default="INFO",
179	show_default=True,
180	type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
181	help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
182	)
183	@click.option(	3✔
184	"--level",
185	"logging_level",
186	default="INFO",
187	show_default=True,
188	type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
189	help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
190	)
191	@click.option(	3✔
192	"-g",
193	"--window-gc",
194	"w_gc",
195	type=click.INT,
196	default=101,
197	show_default=True,
198	help="Length of the running window to compute the GC content",
199	)
200	@click.option(	3✔
201	"--no-multiqc",
202	"skip_multiqc",
203	is_flag=True,
204	show_default=True,
205	help="Do not create any multiqc HTML page.",
206	)
207	@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")	3✔
208	@click.option(	3✔
209	"-r",
210	"--reference-file",
211	"reference",
212	type=click.STRING,
213	default=None,
214	help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
215	)
216	@click.option(	3✔
217	"-w",
218	"--window-median",
219	"w_median",
220	type=click.IntRange(200),
221	help="Length of the running median window (default 20,001, recommended for bacteria). For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
222	default=20001,
223	show_default=True,
224	)
225	@click.option(	3✔
226	"-L",
227	"--low-threshold",
228	"low_threshold",
229	default=-4,
230	show_default=True,
231	type=click.FLOAT,
232	help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
233	)
234	@click.option(	3✔
235	"-H",
236	"--high-threshold",
237	"high_threshold",
238	default=4,
239	show_default=True,
240	type=click.FLOAT,
241	help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
242	)
243	@click.option(	3✔
244	"-T",
245	"--threshold",
246	"threshold",
247	default=4,
248	type=click.FLOAT,
249	show_default=True,
250	help="set lower and higher thresholds of the confidence interval. ",
251	)
252	@click.option(	3✔
253	"-C",
254	"--clustering-parameter",
255	"double_threshold",
256	default=0.5,
257	show_default=True,
258	type=click.FLOAT,
259	help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
260	)
261	@click.option(	3✔
262	"-s",
263	"--chunk-size",
264	"chunksize",
265	type=click.IntRange(1000000), # 1e6
266	default=10000000, # 1e7
267	show_default=True,
268	help="Length of the chunk to be used for the analysis. ",
269	)
270	@click.option(	3✔
271	"-B",
272	"--binning",
273	"binning",
274	type=click.IntRange(2),
275	default=None,
276	help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
277	)
278	@click.option(	3✔
279	"--cnv-clustering",
280	"cnv_clustering",
281	default=-1,
282	type=click.INT,
283	help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
284	)
285	def main(**kwargs):	3✔
286	"""Welcome to SEQUANA -- Coverage standalone
287
288	----
289
290	Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
291	file. In addition, the running median used in conjunction with double thresholds
292	extract regions of interests (ROI) for low or high coverage. A reference may be
293	provided to plot the coverage versus GC content. An annotation file may be provided
294	to annotate the found ROIs.
295
296	The input file should be one of the following:
297
298	- a BED file that is a tabulated file at least 3 columns.
299	The first column being the reference, the second is the position
300	and the third column contains the coverage itself.
301	- or a BAM file that is converted automatically
302	into a BED file using the following command:
303
304	bedtools genomecov -d -ibam FILE.bam
305
306	Note that this is different from another command that could perform the conversion:
307
308	samtools depth -aa input.bam > output.bed
309
310	If the reference is provided, an additional plot showing the coverage versus
311	GC content is also shown.
312
313	Here are some examples
314
315	sequana_coverage --input-file file.bed --window-median 1001
316	sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>
317
318	An other interesting option is to provide a BED file with 4 columns. The
319	fourth column being another coverage data created with a filter. One can
320	create such a file only from the BAM file using samtools as follows given
321	the original unfiltered BAM file named input.bam:
322
323	samtools view -q 35 -o data.filtered.bam input.bam
324	samtools depth input.bam data.filtered.bam -aa > test.bed
325	sequana_coverage --input test.bed --show-html
326
327	Note that the first file is the filtered one, and the second file is the
328	unfiltered one.
329
330	Note for multi chromosome and genbank features: for now, you will need to call
331	sequana_coverage for each chromosome individually since we accept only one
332	genbank as input parameter:
333
334	sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1
335
336	Large genomes:
337	--------------
338
339	If your input data is large and does not fit into memory, use the --binning BIN
340	options to average data into bin of BIN values.
341
342	CNV cases:
343	--------------
344
345	By default, sequana_coverage identify events as small as 1 bin. For the CNV
346	detection case, you may want to cluster events. the --cnv-merging DELTA option
347	merges consecutives events whose distance is smaller that DELTA
348
349
350	----
351
352	AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
353	Documentation: http://sequana.readthedocs.io
354	Issues: http://github.com/sequana/sequana
355	"""
356	options = AttrDict(**kwargs)	3✔
357
358	logger.setLevel(options.logging_level)	3✔
359
360	if options.annotation:	3✔
361	assert os.path.exists(options.annotation), "%s does not exists" % options.annotation	3✔
362
363	logger.info(f"Reading {options.input}. This may take time ")	3✔
364
365	# Convert BAM to BED
366	if options.input.endswith(".bam"):	3✔
UNCOV 367	bedfile = options.input.replace(".bam", ".bed")	×
UNCOV 368	logger.info("Converting BAM into BED file")	×
369	shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")	×
370	elif options.input.endswith(".bed"):	3✔
371	bedfile = options.input	3✔
372	else:
373	raise ValueError("Input file must be a BAM or BED file")	×
374
375	# Set the thresholds
376	if options.low_threshold is None:	3✔
UNCOV 377	options.low_threshold = -options.threshold	×
378
379	if options.high_threshold is None:	3✔
UNCOV 380	options.high_threshold = options.threshold	×
381
382	# Now we can create the instance of GenomeCoverage
383	if "," in options.chromosome:	3✔
UNCOV 384	chrom_list = options.chromosome.split(",")	×
385	elif options.chromosome:	3✔
UNCOV 386	chrom_list = [options.chromosome]	×
387	else:
388	chrom_list = []	3✔
389
390	# initialisation (reading BED to get positions of chromosomes and chromosome names
391	gc = SequanaCoverage(	3✔
392	bedfile,
393	options.annotation,
394	options.low_threshold,
395	options.high_threshold,
396	options.double_threshold,
397	options.double_threshold,
398	chunksize=options.chunksize,
399	chromosome_list=chrom_list,
400	reference_file=options.reference,
401	gc_window_size=options.w_gc,
402	# force=True
403	)
404
405	# some information fo end-users,
406	logger.info("There are %s chromosomes/contigs." % len(gc))	3✔
407	for this in gc.chrom_names:	3✔
408	end = gc.positions[this]["end"]	3✔
409	start = gc.positions[this]["start"]	3✔
410	data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)	3✔
411	logger.info(" {} (starting pos: {}, ending pos: {}, length: {})".format(*data))	3✔
412
413	# here we read chromosome by chromosome to save memory and analyse them.
414	N = len(gc)	3✔
415	for i, chrom in enumerate(gc.chrom_names[::-1]):	3✔
416	logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")	3✔
417	# since we read just one contig/chromosome, the chr_list contains
418	# only one contig, so we access to it with index 0
419
420	# Performs the computation and reporting for a given chromosome
421	# This call performs the analysis, and creates the HTML page
422	# if HTML is created, it fills the gc._html_list variable
423	chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)	3✔
424	run_analysis(chrom_data, options)	3✔
425
426	del chrom_data # free memory for sure	3✔
427	garbage.collect()	3✔
428
429	# logging level seems to be reset to warning somewhere
430	logger.setLevel(options.logging_level)	3✔
431
432	CoverageModule(gc)	3✔
433
434	if options.skip_multiqc is False:	3✔
435	logger.info("Creating multiqc report")	3✔
436	pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")	3✔
437	cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "	3✔
438
439	proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)	3✔
440	proc.wait()	3✔
441
442	teardown(options.output_directory)	3✔
443
444
445	def run_analysis(chrom, options):	3✔
446	logger.info("Computing some metrics")	3✔
447
448	if options.w_median > len(chrom) / 4:	3✔
UNCOV 449	NW = int(len(chrom) / 4)	×
UNCOV 450	if NW % 2 == 0:	×
UNCOV 451	NW += 1	×
UNCOV 452	logger.warning(	×
453	"median window length is too long. \n"
454	" Setting the window length automatically to a fifth of\n"
455	" the chromosome length ({})".format(NW)
456	)
457	else:
458	NW = options.w_median	3✔
459
460	######################### DEFINES OUTPUT DIR AND SAMPLE NAME ###########
461	config.output_dir = options.output_directory	3✔
462	config.sample_name = os.path.basename(options.input).split(".")[0]	3✔
463
464	directory = options.output_directory	3✔
465	os.makedirs(directory, exist_ok=True)	3✔
466	directory = f"{options.output_directory}/{chrom.chrom_name}"	3✔
467	os.makedirs(directory, exist_ok=True)	3✔
468	#########################################################################
469
470	# compute the running median, zscore and ROIs for each chunk summarizing the
471	# results in a ChromosomeCovMultiChunk instane
472	logger.info("Using running median (w=%s)" % NW)	3✔
473	logger.info("Number of mixture models %s " % options.k)	3✔
474	results = chrom.run(	3✔
475	NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
476	)
477	chrom.plot_coverage(f"{directory}/coverage.png")	3✔
478
479	if chrom.DOC < 8:	3✔
UNCOV 480	logger.warning(	×
481	"The depth of coverage is below 8. sequana_coverage is"
482	" not optimised for such depth. You may want to "
483	" increase the threshold to avoid too many false detections"
484	)
485	logger.info(chrom.__str__())	3✔
486
487	# save summary and metrics
488	summary = results.get_summary(caller="sequana_coverage")	3✔
489	summary.to_json(f"{directory}/sequana_summary_coverage.json")	3✔
490
491	mu = summary.data.get("fit_mu", 0)	3✔
492	sigma = summary.data.get("fit_sigma", 0)	3✔
493	pi = summary.data.get("pi", 0)	3✔
494
495	logger.info(	3✔
496	"Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
497	% (round(mu, 3), round(sigma, 3), round(pi, 3))
498	)
499
500	# some information about the ROIs found
501	logger.info(	3✔
502	"Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
503	chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
504	)
505	)
506	ROIs = results.get_rois() # results is a ChromosomeCovMultiChunk instance	3✔
507	logger.info("Number of ROIs found: {}".format(len(ROIs.df)))	3✔
508	logger.info(" - below average: {}".format(len(ROIs.get_low_rois())))	3✔
509	logger.info(" - above average: {}".format(len(ROIs.get_high_rois())))	3✔
510
511	# Create directory and save ROIs
512	ROIs.df.to_csv(f"{directory}/rois.csv")	3✔
513
514	logger.info(f"Creating report in {options.output_directory}. Please wait")	3✔
515
516	if chrom._mode == "chunks":	3✔
UNCOV 517	logger.warning("This chromosome is large. Plots in the HTML reports are skipped")	×
518
519	ChromosomeCoverageModule(	3✔
520	chrom,
521	datatable=CoverageModule.init_roi_datatable(ROIs),
522	options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
523	command=" ".join(["sequana_coverage"] + sys.argv[1:]),
524	)
525
526
527	if __name__ == "__main__": # pragma: no cover
528	main()

cokelaer / sequana / 7805667516

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous