7093643877

Committed 04 Dec 2023 11:00PM UTC coverage: 73.729% (+0.3%) from 73.466%

Build # 7093643877

Build Type

push

github

Committed by

cokelaer

Commit Message

rename plot legend

Run Details

0 of 1 new or added line in 1 file covered. (0.0%)

271 existing lines in 19 files now uncovered.

13398 of 18172 relevant lines covered (73.73%)

2.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.52

/sequana/scripts/coverage.py

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
""".. rubric:: Standalone application dedicated to coverage"""
import argparse
import gc as garbage
import glob
import os
import subprocess
import sys

import rich_click as click

from .utils import CONTEXT_SETTINGS


import colorlog
from easydev import AttrDict, shellcmd

from sequana import sequana_data
from sequana import version as sequana_version
from sequana.bedtools import ChromosomeCov, SequanaCoverage
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
from sequana.utils import config
from sequana.scripts.common import teardown

logger = colorlog.getLogger(__name__)


CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


click.rich_click.OPTION_GROUPS = {
    "sequana_coverage": [
        {
            "name": "Required",
            "options": ["--input-file"],
        },
        {
            "name": "Detection Algorithm Options",
            "options": [
                "--circular",
                "--mixture-models",
                "--window-median",
                "--window-gc",
                "--low-threshold",
                "--high-threshold",
                "--threshold",
                "--cnv-clustering",
                "--clustering-parameter",
            ],
        },
        {
            "name": "Modifiers",
            "options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
        },
        {
            "name": "Download utilities",
            "options": ["--download-reference", "--download-genbank", "--database"],
        },
        {
            "name": "Output files",
            "options": ["--no-multiqc", "--output-directory"],
        },
        {
            "name": "Behaviour",
            "options": ["--version", "--level", "--debug-level", "--help"],
        },
    ],
}


def download_reference(ctx, param, value):
    if value:
        database = ctx.params.get("database", "ENA")
        click.echo(f"Downloading reference {value} from {database}...", nl=False)
        from bioservices.apps import download_fasta as df

        df.download_fasta(value, method="ENA")
        click.echo("done")
        sys.exit(0)
    return value


def download_genbank(ctx, param, value):
    if value:
        click.echo(f"Downloading genbank {value}...", nl=False)
        from sequana.snpeff import download_fasta_and_genbank

        download_fasta_and_genbank(value, value, genbank=True, fasta=False)
        click.echo("done")
        sys.exit(0)
    return value


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option(
    "--database",
    "database",
    default="ENA",
    type=click.Choice(["ENA", "EUtils"]),
    is_eager=True,
    # expose_value=False,
    help="Download the reference from one of these database (default ENA)",
)
@click.option(
    "--download-reference",
    "download_reference",
    default=None,
    type=click.STRING,
    callback=download_reference,
    is_eager=True,
    # expose_value=False,
    help="Provide accession number to download a reference from NCBI/ENA",
)
@click.option(
    "--download-genbank",
    "download_genbank",
    default=None,
    type=click.STRING,
    callback=download_genbank,
    is_eager=True,
    help="Provide accession number to download annotation from NCBI/ENA",
)
@click.option(
    "-i",
    "--input-file",
    "input",
    type=click.Path(),
    required=True,
    help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
)
@click.option(
    "-a",
    "--annotation-file",
    "annotation",
    type=click.STRING,
    default=None,
    help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
)
@click.version_option(sequana_version)
@click.option(
    "-o",
    "--circular",
    is_flag=True,
    show_default=True,
    # help="""If the DNA of the organism is circular (typically
    #        viruses or bacteria), set to True"""
)
@click.option(
    "-c",
    "--chromosome",
    "chromosome",
    type=click.STRING,
    default="",
    help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
)
@click.option(
    "-k",
    "--mixture-models",
    "k",
    type=click.INT,
    help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
    default=2,
)
@click.option(
    "--debug-level",
    "logging_level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
)
@click.option(
    "--level",
    "logging_level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
)
@click.option(
    "-g",
    "--window-gc",
    "w_gc",
    type=click.INT,
    default=101,
    help="Length of the running window to compute the GC content",
)
@click.option(
    "--no-multiqc",
    "skip_multiqc",
    is_flag=True,
    show_default=True,
    help="Do not create any multiqc HTML page.",
)
@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")
@click.option(
    "-r",
    "--reference-file",
    "reference",
    type=click.STRING,
    default=None,
    help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
)
@click.option(
    "-w",
    "--window-median",
    "w_median",
    type=click.IntRange(200),
    help="Length of the running median window (default 20,001, recommended for bacteria).  For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
    default=20001,
)
@click.option(
    "-L",
    "--low-threshold",
    "low_threshold",
    default=-4,
    type=click.FLOAT,
    help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
)
@click.option(
    "-H",
    "--high-threshold",
    "high_threshold",
    default=4,
    type=click.FLOAT,
    help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
)
@click.option(
    "-T",
    "--threshold",
    "threshold",
    default=4,
    type=click.FLOAT,
    help="set lower and higher thresholds of the confidence interval. ",
)
@click.option(
    "-C",
    "--clustering-parameter",
    "double_threshold",
    default=0.5,
    type=click.FLOAT,
    help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
)
@click.option(
    "-s",
    "--chunk-size",
    "chunksize",
    type=click.IntRange(1000000),  # 1e6
    default=10000000,  # 1e7
    help="Length of the chunk to be used for the analysis. ",
)
@click.option(
    "-B",
    "--binning",
    "binning",
    type=click.IntRange(2),
    default=None,
    help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
)
@click.option(
    "--cnv-clustering",
    "cnv_clustering",
    default=-1,
    type=click.INT,
    help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
)
def main(**kwargs):
    """Welcome to SEQUANA -- Coverage standalone

        ----

       Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
       file. In addition, the running median used in conjunction with double thresholds
       extract regions of interests (ROI) for low or high coverage. A reference may be
       provided to plot the coverage versus GC content. An annotation file may be provided
       to annotate the found ROIs.

       The input file should be one of the following:

       - a BED file that is a tabulated file at least 3 columns.
         The first column being the reference, the second is the position
         and the third column contains the coverage itself.
       - or a BAM file that is converted automatically
         into a BED file using the following command:

            bedtools genomecov -d -ibam FILE.bam

       Note that this is different from another command that could perform the conversion:

           samtools depth -aa input.bam > output.bed

       If the reference is provided, an additional plot showing the coverage versus
       GC content is also shown.

       Here are some examples

           sequana_coverage --input-file file.bed --window-median 1001
           sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>

       An other interesting option is to provide a BED file with 4 columns. The
       fourth column being another coverage data created with a filter. One can
       create such a file only from the BAM file using samtools as follows given
       the original unfiltered BAM file named input.bam:

           samtools view -q 35  -o data.filtered.bam input.bam
           samtools depth input.bam data.filtered.bam  -aa > test.bed
           sequana_coverage --input test.bed --show-html

       Note that the first file is the filtered one, and the second file is the
       unfiltered one.

       Note for multi chromosome and genbank features: for now, you will need to call
       sequana_coverage for each chromosome individually since we accept only one
       genbank as input parameter:

           sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1

       Large genomes:
       --------------

       If your input data is large and does not fit into memory, use the --binning BIN
       options to average data into bin of BIN values.

       CNV cases:
       --------------

       By default, sequana_coverage identify events as small as 1 bin. For the CNV
       detection case, you may want to cluster events. the --cnv-merging DELTA option
       merges consecutives events whose distance is smaller that DELTA


    ----

    AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
    Documentation: http://sequana.readthedocs.io
    Issues: http://github.com/sequana/sequana
    """
    options = AttrDict(**kwargs)

    logger.setLevel(options.logging_level)

    if options.annotation:
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation

    logger.info(f"Reading {options.input}. This may take time ")

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # Now we can create the instance of GenomeCoverage
    if "," in options.chromosome:
        chrom_list = options.chromosome.split(",")
    elif options.chromosome:
        chrom_list = [options.chromosome]
    else:
        chrom_list = []

    # initialisation (reading BED to get positions of chromosomes and chromosome names
    gc = SequanaCoverage(
        bedfile,
        options.annotation,
        options.low_threshold,
        options.high_threshold,
        options.double_threshold,
        options.double_threshold,
        chunksize=options.chunksize,
        chromosome_list=chrom_list,
        reference_file=options.reference,
        gc_window_size=options.w_gc,
    )

    # some information fo end-users,
    logger.info("There are %s chromosomes/contigs." % len(gc))
    for this in gc.chrom_names:
        end = gc.positions[this]["end"]
        start = gc.positions[this]["start"]
        data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
        logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))

    # here we read chromosome by chromosome to save memory and analyse them.
    N = len(gc)
    for i, chrom in enumerate(gc.chrom_names[::-1]):
        logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")
        # since we read just one contig/chromosome, the chr_list contains
        # only one contig, so we access to it with index 0

        # Performs the computation and reporting for a given chromosome
        # This call performs the analysis, and creates the HTML page
        # if HTML is creates, it fills the gc._html_list variable
        chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)
        run_analysis(chrom_data, options)

        del chrom_data  # free memory for sure
        garbage.collect()

        # logging level seems to be reset to warning somewhere
        logger.setLevel(options.logging_level)


    CoverageModule(gc)

    if options.skip_multiqc is False:
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "

        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()

    teardown(options.output_directory)


def run_analysis(chrom, options):
    logger.info("Computing some metrics")

    if options.w_median > len(chrom) / 4:
        NW = int(len(chrom) / 4)
        if NW % 2 == 0:
            NW += 1
        logger.warning(
            "median window length is too long. \n"
            "    Setting the window length automatically to a fifth of\n"
            "    the chromosome length ({})".format(NW)
        )
    else:
        NW = options.w_median

    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split(".")[0]

    directory = options.output_directory
    os.makedirs(directory, exist_ok=True)
    directory = f"{options.output_directory}/{chrom.chrom_name}"
    os.makedirs(directory, exist_ok=True)
    #########################################################################

    # compute the running median, zscore and ROIs for each chunk summarizing the
    # results in a ChromosomeCovMultiChunk instane
    logger.info("Using running median (w=%s)" % NW)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
    )
    chrom.plot_coverage(f"{directory}/coverage.png")

    if chrom.DOC < 8:
        logger.warning(
            "The depth of coverage is below 8. sequana_coverage is"
            " not optimised for such depth. You may want to "
            " increase the threshold to avoid too many false detections"
        )
    logger.info(chrom.__str__())

    # save summary and metrics
    summary = results.get_summary(caller="sequana_coverage")
    summary.to_json(f"{directory}/sequana_summary_coverage.json")

    mu = summary.data.get("fit_mu", 0)
    sigma = summary.data.get("fit_sigma", 0)
    pi = summary.data.get("pi", 0)

    logger.info(
        "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
        % (round(mu, 3), round(sigma, 3), round(pi, 3))
        )

    # some information about the ROIs found
    logger.info(
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
            chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
        )
    )
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    ROIs.df.to_csv(f"{directory}/rois.csv")


    logger.info(f"Creating report in {options.output_directory}. Please wait")

    if chrom._mode == "chunks":
        logger.warning("This chromosome is large. Plots in the HTML reports are skipped")

    ChromosomeCoverageModule(
        chrom,
        datatable=CoverageModule.init_roi_datatable(ROIs),
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
        command=" ".join(["sequana_coverage"] + sys.argv[1:]), 
    )


if __name__ == "__main__":  # pragma: no cover
    main()

1	#
2	# This file is part of Sequana software
3	#
4	# Copyright (c) 2016-2022 - Sequana Development Team
5	#
6	# Distributed under the terms of the 3-clause BSD license.
7	# The full license is in the LICENSE file, distributed with this software.
8	#
9	# website: https://github.com/sequana/sequana
10	# documentation: http://sequana.readthedocs.io
11	#
12	##############################################################################
13	""".. rubric:: Standalone application dedicated to coverage"""	3✔
14	import argparse	3✔
15	import gc as garbage	3✔
16	import glob	3✔
17	import os	3✔
18	import subprocess	3✔
19	import sys	3✔
20
21	import rich_click as click	3✔
22
23	from .utils import CONTEXT_SETTINGS	3✔
24
25
26	import colorlog	3✔
27	from easydev import AttrDict, shellcmd	3✔
28
29	from sequana import sequana_data	3✔
30	from sequana import version as sequana_version	3✔
31	from sequana.bedtools import ChromosomeCov, SequanaCoverage	3✔
32	from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule	3✔
33	from sequana.utils import config	3✔
34	from sequana.scripts.common import teardown	3✔
35
36	logger = colorlog.getLogger(__name__)	3✔
37
38
39	CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])	3✔
40
41
42	click.rich_click.OPTION_GROUPS = {	3✔
43	"sequana_coverage": [
44	{
45	"name": "Required",
46	"options": ["--input-file"],
47	},
48	{
49	"name": "Detection Algorithm Options",
50	"options": [
51	"--circular",
52	"--mixture-models",
53	"--window-median",
54	"--window-gc",
55	"--low-threshold",
56	"--high-threshold",
57	"--threshold",
58	"--cnv-clustering",
59	"--clustering-parameter",
60	],
61	},
62	{
63	"name": "Modifiers",
64	"options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
65	},
66	{
67	"name": "Download utilities",
68	"options": ["--download-reference", "--download-genbank", "--database"],
69	},
70	{
71	"name": "Output files",
72	"options": ["--no-multiqc", "--output-directory"],
73	},
74	{
75	"name": "Behaviour",
76	"options": ["--version", "--level", "--debug-level", "--help"],
77	},
78	],
79	}
80
81
82	def download_reference(ctx, param, value):	3✔
83	if value:	3✔
UNCOV 84	database = ctx.params.get("database", "ENA")	×
UNCOV 85	click.echo(f"Downloading reference {value} from {database}...", nl=False)	×
UNCOV 86	from bioservices.apps import download_fasta as df	×
87
88	df.download_fasta(value, method="ENA")	×
89	click.echo("done")	×
90	sys.exit(0)	×
91	return value	3✔
92
93
94	def download_genbank(ctx, param, value):	3✔
95	if value:	3✔
UNCOV 96	click.echo(f"Downloading genbank {value}...", nl=False)	×
UNCOV 97	from sequana.snpeff import download_fasta_and_genbank	×
98
UNCOV 99	download_fasta_and_genbank(value, value, genbank=True, fasta=False)	×
100	click.echo("done")	×
101	sys.exit(0)	×
102	return value	3✔
103
104
105	@click.command(context_settings=CONTEXT_SETTINGS)	3✔
106	@click.option(	3✔
107	"--database",
108	"database",
109	default="ENA",
110	type=click.Choice(["ENA", "EUtils"]),
111	is_eager=True,
112	# expose_value=False,
113	help="Download the reference from one of these database (default ENA)",
114	)
115	@click.option(	3✔
116	"--download-reference",
117	"download_reference",
118	default=None,
119	type=click.STRING,
120	callback=download_reference,
121	is_eager=True,
122	# expose_value=False,
123	help="Provide accession number to download a reference from NCBI/ENA",
124	)
125	@click.option(	3✔
126	"--download-genbank",
127	"download_genbank",
128	default=None,
129	type=click.STRING,
130	callback=download_genbank,
131	is_eager=True,
132	help="Provide accession number to download annotation from NCBI/ENA",
133	)
134	@click.option(	3✔
135	"-i",
136	"--input-file",
137	"input",
138	type=click.Path(),
139	required=True,
140	help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
141	)
142	@click.option(	3✔
143	"-a",
144	"--annotation-file",
145	"annotation",
146	type=click.STRING,
147	default=None,
148	help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
149	)
150	@click.version_option(sequana_version)	3✔
151	@click.option(	3✔
152	"-o",
153	"--circular",
154	is_flag=True,
155	show_default=True,
156	# help="""If the DNA of the organism is circular (typically
157	# viruses or bacteria), set to True"""
158	)
159	@click.option(	3✔
160	"-c",
161	"--chromosome",
162	"chromosome",
163	type=click.STRING,
164	default="",
165	help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
166	)
167	@click.option(	3✔
168	"-k",
169	"--mixture-models",
170	"k",
171	type=click.INT,
172	help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
173	default=2,
174	)
175	@click.option(	3✔
176	"--debug-level",
177	"logging_level",
178	default="INFO",
179	type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
180	help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
181	)
182	@click.option(	3✔
183	"--level",
184	"logging_level",
185	default="INFO",
186	type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
187	help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
188	)
189	@click.option(	3✔
190	"-g",
191	"--window-gc",
192	"w_gc",
193	type=click.INT,
194	default=101,
195	help="Length of the running window to compute the GC content",
196	)
197	@click.option(	3✔
198	"--no-multiqc",
199	"skip_multiqc",
200	is_flag=True,
201	show_default=True,
202	help="Do not create any multiqc HTML page.",
203	)
204	@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")	3✔
205	@click.option(	3✔
206	"-r",
207	"--reference-file",
208	"reference",
209	type=click.STRING,
210	default=None,
211	help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
212	)
213	@click.option(	3✔
214	"-w",
215	"--window-median",
216	"w_median",
217	type=click.IntRange(200),
218	help="Length of the running median window (default 20,001, recommended for bacteria). For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
219	default=20001,
220	)
221	@click.option(	3✔
222	"-L",
223	"--low-threshold",
224	"low_threshold",
225	default=-4,
226	type=click.FLOAT,
227	help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
228	)
229	@click.option(	3✔
230	"-H",
231	"--high-threshold",
232	"high_threshold",
233	default=4,
234	type=click.FLOAT,
235	help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
236	)
237	@click.option(	3✔
238	"-T",
239	"--threshold",
240	"threshold",
241	default=4,
242	type=click.FLOAT,
243	help="set lower and higher thresholds of the confidence interval. ",
244	)
245	@click.option(	3✔
246	"-C",
247	"--clustering-parameter",
248	"double_threshold",
249	default=0.5,
250	type=click.FLOAT,
251	help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
252	)
253	@click.option(	3✔
254	"-s",
255	"--chunk-size",
256	"chunksize",
257	type=click.IntRange(1000000), # 1e6
258	default=10000000, # 1e7
259	help="Length of the chunk to be used for the analysis. ",
260	)
261	@click.option(	3✔
262	"-B",
263	"--binning",
264	"binning",
265	type=click.IntRange(2),
266	default=None,
267	help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
268	)
269	@click.option(	3✔
270	"--cnv-clustering",
271	"cnv_clustering",
272	default=-1,
273	type=click.INT,
274	help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
275	)
276	def main(**kwargs):	3✔
277	"""Welcome to SEQUANA -- Coverage standalone
278
279	----
280
281	Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
282	file. In addition, the running median used in conjunction with double thresholds
283	extract regions of interests (ROI) for low or high coverage. A reference may be
284	provided to plot the coverage versus GC content. An annotation file may be provided
285	to annotate the found ROIs.
286
287	The input file should be one of the following:
288
289	- a BED file that is a tabulated file at least 3 columns.
290	The first column being the reference, the second is the position
291	and the third column contains the coverage itself.
292	- or a BAM file that is converted automatically
293	into a BED file using the following command:
294
295	bedtools genomecov -d -ibam FILE.bam
296
297	Note that this is different from another command that could perform the conversion:
298
299	samtools depth -aa input.bam > output.bed
300
301	If the reference is provided, an additional plot showing the coverage versus
302	GC content is also shown.
303
304	Here are some examples
305
306	sequana_coverage --input-file file.bed --window-median 1001
307	sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>
308
309	An other interesting option is to provide a BED file with 4 columns. The
310	fourth column being another coverage data created with a filter. One can
311	create such a file only from the BAM file using samtools as follows given
312	the original unfiltered BAM file named input.bam:
313
314	samtools view -q 35 -o data.filtered.bam input.bam
315	samtools depth input.bam data.filtered.bam -aa > test.bed
316	sequana_coverage --input test.bed --show-html
317
318	Note that the first file is the filtered one, and the second file is the
319	unfiltered one.
320
321	Note for multi chromosome and genbank features: for now, you will need to call
322	sequana_coverage for each chromosome individually since we accept only one
323	genbank as input parameter:
324
325	sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1
326
327	Large genomes:
328	--------------
329
330	If your input data is large and does not fit into memory, use the --binning BIN
331	options to average data into bin of BIN values.
332
333	CNV cases:
334	--------------
335
336	By default, sequana_coverage identify events as small as 1 bin. For the CNV
337	detection case, you may want to cluster events. the --cnv-merging DELTA option
338	merges consecutives events whose distance is smaller that DELTA
339
340
341	----
342
343	AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
344	Documentation: http://sequana.readthedocs.io
345	Issues: http://github.com/sequana/sequana
346	"""
347	options = AttrDict(**kwargs)	3✔
348
349	logger.setLevel(options.logging_level)	3✔
350
351	if options.annotation:	3✔
352	assert os.path.exists(options.annotation), "%s does not exists" % options.annotation	3✔
353
354	logger.info(f"Reading {options.input}. This may take time ")	3✔
355
356	# Convert BAM to BED
357	if options.input.endswith(".bam"):	3✔
UNCOV 358	bedfile = options.input.replace(".bam", ".bed")	×
UNCOV 359	logger.info("Converting BAM into BED file")	×
UNCOV 360	shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")	×
361	elif options.input.endswith(".bed"):	3✔
362	bedfile = options.input	3✔
363	else:
UNCOV 364	raise ValueError("Input file must be a BAM or BED file")	×
365
366	# Set the thresholds
367	if options.low_threshold is None:	3✔
UNCOV 368	options.low_threshold = -options.threshold	×
369
370	if options.high_threshold is None:	3✔
371	options.high_threshold = options.threshold	×
372
373	# Now we can create the instance of GenomeCoverage
374	if "," in options.chromosome:	3✔
375	chrom_list = options.chromosome.split(",")	×
376	elif options.chromosome:	3✔
UNCOV 377	chrom_list = [options.chromosome]	×
378	else:
379	chrom_list = []	3✔
380
381	# initialisation (reading BED to get positions of chromosomes and chromosome names
382	gc = SequanaCoverage(	3✔
383	bedfile,
384	options.annotation,
385	options.low_threshold,
386	options.high_threshold,
387	options.double_threshold,
388	options.double_threshold,
389	chunksize=options.chunksize,
390	chromosome_list=chrom_list,
391	reference_file=options.reference,
392	gc_window_size=options.w_gc,
393	)
394
395	# some information fo end-users,
396	logger.info("There are %s chromosomes/contigs." % len(gc))	3✔
397	for this in gc.chrom_names:	3✔
398	end = gc.positions[this]["end"]	3✔
399	start = gc.positions[this]["start"]	3✔
400	data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)	3✔
401	logger.info(" {} (starting pos: {}, ending pos: {}, length: {})".format(*data))	3✔
402
403	# here we read chromosome by chromosome to save memory and analyse them.
404	N = len(gc)	3✔
405	for i, chrom in enumerate(gc.chrom_names[::-1]):	3✔
406	logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")	3✔
407	# since we read just one contig/chromosome, the chr_list contains
408	# only one contig, so we access to it with index 0
409
410	# Performs the computation and reporting for a given chromosome
411	# This call performs the analysis, and creates the HTML page
412	# if HTML is creates, it fills the gc._html_list variable
413	chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)	3✔
414	run_analysis(chrom_data, options)	3✔
415
416	del chrom_data # free memory for sure	3✔
417	garbage.collect()	3✔
418
419	# logging level seems to be reset to warning somewhere
420	logger.setLevel(options.logging_level)	3✔
421
422
423	CoverageModule(gc)	3✔
424
425	if options.skip_multiqc is False:	3✔
426	logger.info("Creating multiqc report")	3✔
427	pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")	3✔
428	cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "	3✔
429
430	proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)	3✔
431	proc.wait()	3✔
432
433	teardown(options.output_directory)	3✔
434
435
436	def run_analysis(chrom, options):	3✔
437	logger.info("Computing some metrics")	3✔
438
439	if options.w_median > len(chrom) / 4:	3✔
UNCOV 440	NW = int(len(chrom) / 4)	×
UNCOV 441	if NW % 2 == 0:	×
UNCOV 442	NW += 1	×
UNCOV 443	logger.warning(	×
444	"median window length is too long. \n"
445	" Setting the window length automatically to a fifth of\n"
446	" the chromosome length ({})".format(NW)
447	)
448	else:
449	NW = options.w_median	3✔
450
451	######################### DEFINES OUTPUT DIR AND SAMPLE NAME ###########
452	config.output_dir = options.output_directory	3✔
453	config.sample_name = os.path.basename(options.input).split(".")[0]	3✔
454
455	directory = options.output_directory	3✔
456	os.makedirs(directory, exist_ok=True)	3✔
457	directory = f"{options.output_directory}/{chrom.chrom_name}"	3✔
458	os.makedirs(directory, exist_ok=True)	3✔
459	#########################################################################
460
461	# compute the running median, zscore and ROIs for each chunk summarizing the
462	# results in a ChromosomeCovMultiChunk instane
463	logger.info("Using running median (w=%s)" % NW)	3✔
464	logger.info("Number of mixture models %s " % options.k)	3✔
465	results = chrom.run(	3✔
466	NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
467	)
468	chrom.plot_coverage(f"{directory}/coverage.png")	3✔
469
470	if chrom.DOC < 8:	3✔
UNCOV 471	logger.warning(	×
472	"The depth of coverage is below 8. sequana_coverage is"
473	" not optimised for such depth. You may want to "
474	" increase the threshold to avoid too many false detections"
475	)
476	logger.info(chrom.__str__())	3✔
477
478	# save summary and metrics
479	summary = results.get_summary(caller="sequana_coverage")	3✔
480	summary.to_json(f"{directory}/sequana_summary_coverage.json")	3✔
481
482	mu = summary.data.get("fit_mu", 0)	3✔
483	sigma = summary.data.get("fit_sigma", 0)	3✔
484	pi = summary.data.get("pi", 0)	3✔
485
486	logger.info(	3✔
487	"Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
488	% (round(mu, 3), round(sigma, 3), round(pi, 3))
489	)
490
491	# some information about the ROIs found
492	logger.info(	3✔
493	"Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
494	chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
495	)
496	)
497	ROIs = results.get_rois() # results is a ChromosomeCovMultiChunk instance	3✔
498	logger.info("Number of ROIs found: {}".format(len(ROIs.df)))	3✔
499	logger.info(" - below average: {}".format(len(ROIs.get_low_rois())))	3✔
500	logger.info(" - above average: {}".format(len(ROIs.get_high_rois())))	3✔
501
502	# Create directory and save ROIs
503	ROIs.df.to_csv(f"{directory}/rois.csv")	3✔
504
505
506	logger.info(f"Creating report in {options.output_directory}. Please wait")	3✔
507
508	if chrom._mode == "chunks":	3✔
UNCOV 509	logger.warning("This chromosome is large. Plots in the HTML reports are skipped")	×
510
511	ChromosomeCoverageModule(	3✔
512	chrom,
513	datatable=CoverageModule.init_roi_datatable(ROIs),
514	options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
515	command=" ".join(["sequana_coverage"] + sys.argv[1:]),
516	)
517
518
519	if __name__ == "__main__": # pragma: no cover
520	main()

cokelaer / sequana / 7093643877

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous