6851111417

Committed 13 Nov 2023 02:09PM UTC coverage: 73.547%. Remained the same

Build # 6851111417

Build Type

push

github

Committed by

cokelaer

Commit Message

Refactorised the coverage tools (sequana_coverage)

Run Details

160 of 296 new or added lines in 27 files covered. (54.05%)

496 existing lines in 25 files now uncovered.

13245 of 18009 relevant lines covered (73.55%)

2.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.88

/sequana/scripts/coverage.py

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
""".. rubric:: Standalone application dedicated to coverage"""
import argparse
import gc as garbage
import glob
import os
import subprocess
import sys

import rich_click as click

click.rich_click.USE_MARKDOWN = True
click.rich_click.SHOW_METAVARS_COLUMN = False
click.rich_click.APPEND_METAVARS_HELP = True
click.rich_click.STYLE_ERRORS_SUGGESTION = "magenta italic"
click.rich_click.SHOW_ARGUMENTS = True


import colorlog
from easydev import AttrDict, shellcmd

from sequana import sequana_data
from sequana import version as sequana_version
from sequana.bedtools import ChromosomeCov, SequanaCoverage
from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule
from sequana.utils import config
from sequana.scripts.common import teardown

logger = colorlog.getLogger(__name__)


CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


click.rich_click.OPTION_GROUPS = {
    "sequana_coverage": [
        {
            "name": "Required",
            "options": ["--input-file"],
        },
        {
            "name": "Detection Algorithm Options",
            "options": [
                "--circular",
                "--mixture-models",
                "--window-median",
                "--window-gc",
                "--low-threshold",
                "--high-threshold",
                "--threshold",
                "--cnv-clustering",
                "--clustering-parameter",
            ],
        },
        {
            "name": "Modifiers",
            "options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
        },
        {
            "name": "Download utilities",
            "options": ["--download-reference", "--download-genbank", "--database"],
        },
        {
            "name": "Output files",
            "options": ["--no-multiqc", "--output-directory"],
        },
        {
            "name": "Behaviour",
            "options": ["--version", "--level", "--debug-level", "--help"],
        },
    ],
}


def download_reference(ctx, param, value):
    if value:
        database = ctx.params.get("database", "ENA")
        click.echo(f"Downloading reference {value} from {database}...", nl=False)
        from bioservices.apps import download_fasta as df

        df.download_fasta(value, method="ENA")
        click.echo("done")
        sys.exit(0)
    return value


def download_genbank(ctx, param, value):
    if value:
        click.echo(f"Downloading genbank {value}...", nl=False)
        from sequana.snpeff import download_fasta_and_genbank

        download_fasta_and_genbank(value, value, genbank=True, fasta=False)
        click.echo("done")
        sys.exit(0)
    return value


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option(
    "--database",
    "database",
    default="ENA",
    type=click.Choice(["ENA", "EUtils"]),
    is_eager=True,
    # expose_value=False,
    help="Download the reference from one of these database (default ENA)",
)
@click.option(
    "--download-reference",
    "download_reference",
    default=None,
    type=click.STRING,
    callback=download_reference,
    is_eager=True,
    # expose_value=False,
    help="Provide accession number to download a reference from NCBI/ENA",
)
@click.option(
    "--download-genbank",
    "download_genbank",
    default=None,
    type=click.STRING,
    callback=download_genbank,
    is_eager=True,
    help="Provide accession number to download annotation from NCBI/ENA",
)
@click.option(
    "-i",
    "--input-file",
    "input",
    type=click.Path(),
    required=True,
    help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
)
@click.option(
    "-a",
    "--annotation-file",
    "annotation",
    type=click.STRING,
    default=None,
    help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
)
@click.version_option(sequana_version)
@click.option(
    "-o",
    "--circular",
    is_flag=True,
    show_default=True,
    # help="""If the DNA of the organism is circular (typically
    #        viruses or bacteria), set to True"""
)
@click.option(
    "-c",
    "--chromosome",
    "chromosome",
    type=click.STRING,
    default="",
    help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
)
@click.option(
    "-k",
    "--mixture-models",
    "k",
    type=click.INT,
    help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
    default=2,
)
@click.option(
    "--debug-level",
    "logging_level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
)
@click.option(
    "--level",
    "logging_level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
    help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
)
@click.option(
    "-g",
    "--window-gc",
    "w_gc",
    type=click.INT,
    default=101,
    help="Length of the running window to compute the GC content",
)
@click.option(
    "--no-multiqc",
    "skip_multiqc",
    is_flag=True,
    show_default=True,
    help="Do not create any multiqc HTML page.",
)
#FIXME currently broken. requires complete refactoring of coverage module
#@click.option(
#    "skip_html",
#    is_flag=True,
#    show_default=False,
#    help="Do not create any HTML reports. Save ROIs and statistics only.",
#)
@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")
@click.option(
    "-r",
    "--reference-file",
    "reference",
    type=click.STRING,
    default=None,
    help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
)
@click.option(
    "-w",
    "--window-median",
    "w_median",
    type=click.IntRange(200),
    help="Length of the running median window (default 20,001, recommended for bacteria).  For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
    default=20001,
)
@click.option(
    "-L",
    "--low-threshold",
    "low_threshold",
    default=-4,
    type=click.FLOAT,
    help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
)
@click.option(
    "-H",
    "--high-threshold",
    "high_threshold",
    default=4,
    type=click.FLOAT,
    help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
)
@click.option(
    "-T",
    "--threshold",
    "threshold",
    default=4,
    type=click.FLOAT,
    help="set lower and higher thresholds of the confidence interval. ",
)
@click.option(
    "-C",
    "--clustering-parameter",
    "double_threshold",
    default=0.5,
    type=click.FLOAT,
    help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
)
@click.option(
    "-s",
    "--chunk-size",
    "chunksize",
    type=click.IntRange(1000000),  # 1e6
    default=10000000,  # 1e7
    help="Length of the chunk to be used for the analysis. ",
)
@click.option(
    "-B",
    "--binning",
    "binning",
    type=click.IntRange(2),
    default=None,
    help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
)
@click.option(
    "--cnv-clustering",
    "cnv_clustering",
    default=-1,
    type=click.INT,
    help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
)
def main(**kwargs):
    """Welcome to SEQUANA -- Coverage standalone

        ----

       Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
       file. In addition, the running median used in conjunction with double thresholds
       extract regions of interests (ROI) for low or high coverage. A reference may be
       provided to plot the coverage versus GC content. An annotation file may be provided
       to annotate the found ROIs.

       The input file should be one of the following:

       - a BED file that is a tabulated file at least 3 columns.
         The first column being the reference, the second is the position
         and the third column contains the coverage itself.
       - or a BAM file that is converted automatically
         into a BED file using the following command:

            bedtools genomecov -d -ibam FILE.bam

       Note that this is different from another command that could perform the conversion:

           samtools depth -aa input.bam > output.bed

       If the reference is provided, an additional plot showing the coverage versus
       GC content is also shown.

       Here are some examples

           sequana_coverage --input-file file.bed --window-median 1001
           sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>

       An other interesting option is to provide a BED file with 4 columns. The
       fourth column being another coverage data created with a filter. One can
       create such a file only from the BAM file using samtools as follows given
       the original unfiltered BAM file named input.bam:

           samtools view -q 35  -o data.filtered.bam input.bam
           samtools depth input.bam data.filtered.bam  -aa > test.bed
           sequana_coverage --input test.bed --show-html

       Note that the first file is the filtered one, and the second file is the
       unfiltered one.

       Note for multi chromosome and genbank features: for now, you will need to call
       sequana_coverage for each chromosome individually since we accept only one
       genbank as input parameter:

           sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1

       Large genomes:
       --------------

       If your input data is large and does not fit into memory, use the --binning BIN
       options to average data into bin of BIN values.

       CNV cases:
       --------------

       By default, sequana_coverage identify events as small as 1 bin. For the CNV
       detection case, you may want to cluster events. the --cnv-merging DELTA option
       merges consecutives events whose distance is smaller that DELTA


    ----

    AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
    Documentation: http://sequana.readthedocs.io
    Issues: http://github.com/sequana/sequana
    """
    options = AttrDict(**kwargs)

    logger.setLevel(options.logging_level)

    if options.annotation:
        assert os.path.exists(options.annotation), "%s does not exists" % options.annotation

    logger.info(f"Reading {options.input}. This may take time ")

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # Now we can create the instance of GenomeCoverage
    if "," in options.chromosome:
        chrom_list = options.chromosome.split(",")
    elif options.chromosome:
        chrom_list = [options.chromosome]
    else:
        chrom_list = []

    # initialisation (reading BED to get positions of chromosomes and chromosome names
    gc = SequanaCoverage(
        bedfile,
        options.annotation,
        options.low_threshold,
        options.high_threshold,
        options.double_threshold,
        options.double_threshold,
        chunksize=options.chunksize,
        chromosome_list=chrom_list,
        reference_file=options.reference,
        gc_window_size=options.w_gc,
    )

    # some information fo end-users,
    logger.info("There are %s chromosomes/contigs." % len(gc))
    for this in gc.chrom_names:
        end = gc.positions[this]["end"]
        start = gc.positions[this]["start"]
        data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)
        logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))

    # here we read chromosome by chromosome to save memory and analyse them.
    N = len(gc)
    for i, chrom in enumerate(gc.chrom_names[::-1]):
        logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")
        # since we read just one contig/chromosome, the chr_list contains
        # only one contig, so we access to it with index 0

        # Performs the computation and reporting for a given chromosome
        # This call performs the analysis, and creates the HTML page
        # if HTML is creates, it fills the gc._html_list variable
        chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)
        run_analysis(chrom_data, options)

        del chrom_data  # free memory for sure
        garbage.collect()

        # logging level seems to be reset to warning somewhere
        logger.setLevel(options.logging_level)

    # ugly but works for now. extra output_directory must be removed
    #html_list = list(glob.glob(f"{options.output_directory}/*/*.cov.html"))
    #html_list = [x.replace(f"{options.output_directory}/", "") for x in html_list]

    CoverageModule(gc)

    if options.skip_multiqc is False:
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "

        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()

    teardown(options.output_directory)


def run_analysis(chrom, options):
    logger.info("Computing some metrics")

    if options.w_median > len(chrom) / 4:
        NW = int(len(chrom) / 4)
        if NW % 2 == 0:
            NW += 1
        logger.warning(
            "median window length is too long. \n"
            "    Setting the window length automatically to a fifth of\n"
            "    the chromosome length ({})".format(NW)
        )
    else:
        NW = options.w_median

    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split(".")[0]

    directory = options.output_directory
    os.makedirs(directory, exist_ok=True)
    directory = f"{options.output_directory}/{chrom.chrom_name}"
    os.makedirs(directory, exist_ok=True)
    #########################################################################

    # compute the running median, zscore and ROIs for each chunk summarizing the
    # results in a ChromosomeCovMultiChunk instane
    logger.info("Using running median (w=%s)" % NW)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(
        NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
    )

    if chrom.DOC < 8:
        logger.warning(
            "The depth of coverage is below 8. sequana_coverage is"
            " not optimised for such depth. You may want to "
            " increase the threshold to avoid too many false detections"
        )
    logger.info(chrom.__str__())

    # save summary and metrics
    summary = results.get_summary(caller="sequana_coverage")
    summary.to_json(f"{directory}/sequana_summary_coverage.json")

    mu = summary.data.get("fit_mu", 0)
    sigma = summary.data.get("fit_sigma", 0)
    pi = summary.data.get("pi", 0)

    logger.info(
        "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
        % (round(mu, 3), round(sigma, 3), round(pi, 3))
        )

    # some information about the ROIs found
    logger.info(
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
            chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
        )
    )
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    ROIs.df.to_csv(f"{directory}/rois.csv")


    logger.info(f"Creating report in {options.output_directory}. Please wait")

    if chrom._mode == "chunks":
        logger.warning("This chromosome is large. Plots in the HTML reports are skipped")

    ChromosomeCoverageModule(
        chrom,
        datatable=CoverageModule.init_roi_datatable(ROIs),
        options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
        command=" ".join(["sequana_coverage"] + sys.argv[1:]), 
    )


if __name__ == "__main__":  # pragma: no cover
    main()

1	#
2	# This file is part of Sequana software
3	#
4	# Copyright (c) 2016-2022 - Sequana Development Team
5	#
6	# Distributed under the terms of the 3-clause BSD license.
7	# The full license is in the LICENSE file, distributed with this software.
8	#
9	# website: https://github.com/sequana/sequana
10	# documentation: http://sequana.readthedocs.io
11	#
12	##############################################################################
13	""".. rubric:: Standalone application dedicated to coverage"""	3✔
14	import argparse	3✔
15	import gc as garbage	3✔
16	import glob	3✔
17	import os	3✔
18	import subprocess	3✔
19	import sys	3✔
20
21	import rich_click as click	3✔
22
23	click.rich_click.USE_MARKDOWN = True	3✔
24	click.rich_click.SHOW_METAVARS_COLUMN = False	3✔
25	click.rich_click.APPEND_METAVARS_HELP = True	3✔
26	click.rich_click.STYLE_ERRORS_SUGGESTION = "magenta italic"	3✔
27	click.rich_click.SHOW_ARGUMENTS = True	3✔
28
29
30	import colorlog	3✔
31	from easydev import AttrDict, shellcmd	3✔
32
33	from sequana import sequana_data	3✔
34	from sequana import version as sequana_version	3✔
35	from sequana.bedtools import ChromosomeCov, SequanaCoverage	3✔
36	from sequana.modules_report.coverage import ChromosomeCoverageModule, CoverageModule	3✔
37	from sequana.utils import config	3✔
38	from sequana.scripts.common import teardown	3✔
39
40	logger = colorlog.getLogger(__name__)	3✔
41
42
43	CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])	3✔
44
45
46	click.rich_click.OPTION_GROUPS = {	3✔
47	"sequana_coverage": [
48	{
49	"name": "Required",
50	"options": ["--input-file"],
51	},
52	{
53	"name": "Detection Algorithm Options",
54	"options": [
55	"--circular",
56	"--mixture-models",
57	"--window-median",
58	"--window-gc",
59	"--low-threshold",
60	"--high-threshold",
61	"--threshold",
62	"--cnv-clustering",
63	"--clustering-parameter",
64	],
65	},
66	{
67	"name": "Modifiers",
68	"options": ["--annotation-file", "--reference-file", "--chromosome", "--chunk-size", "--binning"],
69	},
70	{
71	"name": "Download utilities",
72	"options": ["--download-reference", "--download-genbank", "--database"],
73	},
74	{
75	"name": "Output files",
76	"options": ["--no-multiqc", "--output-directory"],
77	},
78	{
79	"name": "Behaviour",
80	"options": ["--version", "--level", "--debug-level", "--help"],
81	},
82	],
83	}
84
85
86	def download_reference(ctx, param, value):	3✔
87	if value:	3✔
88	database = ctx.params.get("database", "ENA")	×
89	click.echo(f"Downloading reference {value} from {database}...", nl=False)	×
90	from bioservices.apps import download_fasta as df	×
91
92	df.download_fasta(value, method="ENA")	×
93	click.echo("done")	×
94	sys.exit(0)	×
95	return value	3✔
96
97
98	def download_genbank(ctx, param, value):	3✔
99	if value:	3✔
100	click.echo(f"Downloading genbank {value}...", nl=False)	×
101	from sequana.snpeff import download_fasta_and_genbank	×
102
103	download_fasta_and_genbank(value, value, genbank=True, fasta=False)	×
104	click.echo("done")	×
105	sys.exit(0)	×
106	return value	3✔
107
108
109	@click.command(context_settings=CONTEXT_SETTINGS)	3✔
110	@click.option(	3✔
111	"--database",
112	"database",
113	default="ENA",
114	type=click.Choice(["ENA", "EUtils"]),
115	is_eager=True,
116	# expose_value=False,
117	help="Download the reference from one of these database (default ENA)",
118	)
119	@click.option(	3✔
120	"--download-reference",
121	"download_reference",
122	default=None,
123	type=click.STRING,
124	callback=download_reference,
125	is_eager=True,
126	# expose_value=False,
127	help="Provide accession number to download a reference from NCBI/ENA",
128	)
129	@click.option(	3✔
130	"--download-genbank",
131	"download_genbank",
132	default=None,
133	type=click.STRING,
134	callback=download_genbank,
135	is_eager=True,
136	help="Provide accession number to download annotation from NCBI/ENA",
137	)
138	@click.option(	3✔
139	"-i",
140	"--input-file",
141	"input",
142	type=click.Path(),
143	required=True,
144	help="""Input file in BED or BAM format. If a BAM file is provided, it will be converted locally to a BED file using genomecov, which must be installed.""",
145	)
146	@click.option(	3✔
147	"-a",
148	"--annotation-file",
149	"annotation",
150	type=click.STRING,
151	default=None,
152	help="a valid gff3 or genbank annotation. Recommended for prokaryotes or small eukaryotes only.",
153	)
154	@click.version_option(sequana_version)	3✔
155	@click.option(	3✔
156	"-o",
157	"--circular",
158	is_flag=True,
159	show_default=True,
160	# help="""If the DNA of the organism is circular (typically
161	# viruses or bacteria), set to True"""
162	)
163	@click.option(	3✔
164	"-c",
165	"--chromosome",
166	"chromosome",
167	type=click.STRING,
168	default="",
169	help="By default all chromosomes are analysed. You may want to analyse only one" " by using this parameter",
170	)
171	@click.option(	3✔
172	"-k",
173	"--mixture-models",
174	"k",
175	type=click.INT,
176	help="Number of mixture models to use (default 2, although if sequencing depth is below 8, k is set to 1 automatically). To ignore that behaviour set k to the required value",
177	default=2,
178	)
179	@click.option(	3✔
180	"--debug-level",
181	"logging_level",
182	default="INFO",
183	type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
184	help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
185	)
186	@click.option(	3✔
187	"--level",
188	"logging_level",
189	default="INFO",
190	type=click.Choice(["DEBUG", "INFO", "WARNING", "CRITICAL", "ERROR"]),
191	help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR",
192	)
193	@click.option(	3✔
194	"-g",
195	"--window-gc",
196	"w_gc",
197	type=click.INT,
198	default=101,
199	help="Length of the running window to compute the GC content",
200	)
201	@click.option(	3✔
202	"--no-multiqc",
203	"skip_multiqc",
204	is_flag=True,
205	show_default=True,
206	help="Do not create any multiqc HTML page.",
207	)
208	#FIXME currently broken. requires complete refactoring of coverage module
209	#@click.option(
210	# "skip_html",
211	# is_flag=True,
212	# show_default=False,
213	# help="Do not create any HTML reports. Save ROIs and statistics only.",
214	#)
215	@click.option("--output-directory", "output_directory", default="report", help="name of the output (report) directory.")	3✔
216	@click.option(	3✔
217	"-r",
218	"--reference-file",
219	"reference",
220	type=click.STRING,
221	default=None,
222	help="If available, you can provide a reference (ENA/NCBI). It must have the same length as the one used to create the BAM or BED file. If provided, it is used to create the coverage versus GC content image. Recommended for prokaryotes or small eukaryotes only.",
223	)
224	@click.option(	3✔
225	"-w",
226	"--window-median",
227	"w_median",
228	type=click.IntRange(200),
229	help="Length of the running median window (default 20,001, recommended for bacteria). For short genome (below 100000 bases), we set this parameter to one fifth of the genome length. minimal value is 125 (one fourth of 500bp that should be minimal length of contigs)",
230	default=20001,
231	)
232	@click.option(	3✔
233	"-L",
234	"--low-threshold",
235	"low_threshold",
236	default=-4,
237	type=click.FLOAT,
238	help=("lower threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
239	)
240	@click.option(	3✔
241	"-H",
242	"--high-threshold",
243	"high_threshold",
244	default=4,
245	type=click.FLOAT,
246	help=("higher threshold (zscore) of the confidence interval. " "Overwrite value given by --threshold/-T"),
247	)
248	@click.option(	3✔
249	"-T",
250	"--threshold",
251	"threshold",
252	default=4,
253	type=click.FLOAT,
254	help="set lower and higher thresholds of the confidence interval. ",
255	)
256	@click.option(	3✔
257	"-C",
258	"--clustering-parameter",
259	"double_threshold",
260	default=0.5,
261	type=click.FLOAT,
262	help="set lower and higher double threshold parameter (in [0,1]). Do not use value close to zero. Ideally, around 0.5. lower value will tend to cluster more than higher value",
263	)
264	@click.option(	3✔
265	"-s",
266	"--chunk-size",
267	"chunksize",
268	type=click.IntRange(1000000), # 1e6
269	default=10000000, # 1e7
270	help="Length of the chunk to be used for the analysis. ",
271	)
272	@click.option(	3✔
273	"-B",
274	"--binning",
275	"binning",
276	type=click.IntRange(2),
277	default=None,
278	help="merge consecutive (non overlapping) data points, taking the mean. This is useful for large genome (e.g. human). This allows a faster computation, especially for CNV detection were only large windows are of interest. For instance, using a binning of 50 or 100 allows the human genome to be analysed.",
279	)
280	@click.option(	3✔
281	"--cnv-clustering",
282	"cnv_clustering",
283	default=-1,
284	type=click.INT,
285	help="Two consecutive ROIs are merged when their distance in bases is below this parameter. If set to -1, not used. ",
286	)
287	def main(**kwargs):	3✔
288	"""Welcome to SEQUANA -- Coverage standalone
289
290	----
291
292	Extract and plot coverage of one or more chromosomes/contigs from a BED or BAM
293	file. In addition, the running median used in conjunction with double thresholds
294	extract regions of interests (ROI) for low or high coverage. A reference may be
295	provided to plot the coverage versus GC content. An annotation file may be provided
296	to annotate the found ROIs.
297
298	The input file should be one of the following:
299
300	- a BED file that is a tabulated file at least 3 columns.
301	The first column being the reference, the second is the position
302	and the third column contains the coverage itself.
303	- or a BAM file that is converted automatically
304	into a BED file using the following command:
305
306	bedtools genomecov -d -ibam FILE.bam
307
308	Note that this is different from another command that could perform the conversion:
309
310	samtools depth -aa input.bam > output.bed
311
312	If the reference is provided, an additional plot showing the coverage versus
313	GC content is also shown.
314
315	Here are some examples
316
317	sequana_coverage --input-file file.bed --window-median 1001
318	sequana_coverage --input-file file.bam --window-median 1001 -r <REFERENCE.fa>
319
320	An other interesting option is to provide a BED file with 4 columns. The
321	fourth column being another coverage data created with a filter. One can
322	create such a file only from the BAM file using samtools as follows given
323	the original unfiltered BAM file named input.bam:
324
325	samtools view -q 35 -o data.filtered.bam input.bam
326	samtools depth input.bam data.filtered.bam -aa > test.bed
327	sequana_coverage --input test.bed --show-html
328
329	Note that the first file is the filtered one, and the second file is the
330	unfiltered one.
331
332	Note for multi chromosome and genbank features: for now, you will need to call
333	sequana_coverage for each chromosome individually since we accept only one
334	genbank as input parameter:
335
336	sequana_coverage --input-file file.bed --genbank chrom1.gbk -c 1
337
338	Large genomes:
339	--------------
340
341	If your input data is large and does not fit into memory, use the --binning BIN
342	options to average data into bin of BIN values.
343
344	CNV cases:
345	--------------
346
347	By default, sequana_coverage identify events as small as 1 bin. For the CNV
348	detection case, you may want to cluster events. the --cnv-merging DELTA option
349	merges consecutives events whose distance is smaller that DELTA
350
351
352	----
353
354	AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
355	Documentation: http://sequana.readthedocs.io
356	Issues: http://github.com/sequana/sequana
357	"""
358	options = AttrDict(**kwargs)	3✔
359
360	logger.setLevel(options.logging_level)	3✔
361
362	if options.annotation:	3✔
363	assert os.path.exists(options.annotation), "%s does not exists" % options.annotation	3✔
364
365	logger.info(f"Reading {options.input}. This may take time ")	3✔
366
367	# Convert BAM to BED
368	if options.input.endswith(".bam"):	3✔
369	bedfile = options.input.replace(".bam", ".bed")	×
370	logger.info("Converting BAM into BED file")	×
371	shellcmd(f"bedtools genomecov -d -ibam {options.input} > {bedfile}")	×
372	elif options.input.endswith(".bed"):	3✔
373	bedfile = options.input	3✔
374	else:
375	raise ValueError("Input file must be a BAM or BED file")	×
376
377	# Set the thresholds
378	if options.low_threshold is None:	3✔
379	options.low_threshold = -options.threshold	×
380
381	if options.high_threshold is None:	3✔
382	options.high_threshold = options.threshold	×
383
384	# Now we can create the instance of GenomeCoverage
385	if "," in options.chromosome:	3✔
386	chrom_list = options.chromosome.split(",")	×
387	elif options.chromosome:	3✔
388	chrom_list = [options.chromosome]	×
389	else:
390	chrom_list = []	3✔
391
392	# initialisation (reading BED to get positions of chromosomes and chromosome names
393	gc = SequanaCoverage(	3✔
394	bedfile,
395	options.annotation,
396	options.low_threshold,
397	options.high_threshold,
398	options.double_threshold,
399	options.double_threshold,
400	chunksize=options.chunksize,
401	chromosome_list=chrom_list,
402	reference_file=options.reference,
403	gc_window_size=options.w_gc,
404	)
405
406	# some information fo end-users,
407	logger.info("There are %s chromosomes/contigs." % len(gc))	3✔
408	for this in gc.chrom_names:	3✔
409	end = gc.positions[this]["end"]	3✔
410	start = gc.positions[this]["start"]	3✔
411	data = (this, gc.positions[this]["start"], gc.positions[this]["end"], end - start)	3✔
412	logger.info(" {} (starting pos: {}, ending pos: {}, length: {})".format(*data))	3✔
413
414	# here we read chromosome by chromosome to save memory and analyse them.
415	N = len(gc)	3✔
416	for i, chrom in enumerate(gc.chrom_names[::-1]):	3✔
417	logger.info(f"==================== analysing chrom/contig {i+1}/{N} ({chrom})")	3✔
418	# since we read just one contig/chromosome, the chr_list contains
419	# only one contig, so we access to it with index 0
420
421	# Performs the computation and reporting for a given chromosome
422	# This call performs the analysis, and creates the HTML page
423	# if HTML is creates, it fills the gc._html_list variable
424	chrom_data = ChromosomeCov(gc, chrom, gc.thresholds, gc.chunksize)	3✔
425	run_analysis(chrom_data, options)	3✔
426
427	del chrom_data # free memory for sure	3✔
428	garbage.collect()	3✔
429
430	# logging level seems to be reset to warning somewhere
431	logger.setLevel(options.logging_level)	3✔
432
433	# ugly but works for now. extra output_directory must be removed
434	#html_list = list(glob.glob(f"{options.output_directory}//.cov.html"))
435	#html_list = [x.replace(f"{options.output_directory}/", "") for x in html_list]
436
437	CoverageModule(gc)	3✔
438
439	if options.skip_multiqc is False:	3✔
440	logger.info("Creating multiqc report")	3✔
441	pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")	3✔
442	cmd = f"multiqc . -m sequana_coverage -f -c {pathtocfg} "	3✔
443
444	proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)	3✔
445	proc.wait()	3✔
446
447	teardown(options.output_directory)	3✔
448
449
450	def run_analysis(chrom, options):	3✔
451	logger.info("Computing some metrics")	3✔
452
453	if options.w_median > len(chrom) / 4:	3✔
UNCOV 454	NW = int(len(chrom) / 4)	×
UNCOV 455	if NW % 2 == 0:	×
UNCOV 456	NW += 1	×
457	logger.warning(	×
458	"median window length is too long. \n"
459	" Setting the window length automatically to a fifth of\n"
460	" the chromosome length ({})".format(NW)
461	)
462	else:
463	NW = options.w_median	3✔
464
465	######################### DEFINES OUTPUT DIR AND SAMPLE NAME ###########
466	config.output_dir = options.output_directory	3✔
467	config.sample_name = os.path.basename(options.input).split(".")[0]	3✔
468
469	directory = options.output_directory	3✔
470	os.makedirs(directory, exist_ok=True)	3✔
471	directory = f"{options.output_directory}/{chrom.chrom_name}"	3✔
472	os.makedirs(directory, exist_ok=True)	3✔
473	#########################################################################
474
475	# compute the running median, zscore and ROIs for each chunk summarizing the
476	# results in a ChromosomeCovMultiChunk instane
477	logger.info("Using running median (w=%s)" % NW)	3✔
478	logger.info("Number of mixture models %s " % options.k)	3✔
479	results = chrom.run(	3✔
480	NW, options.k, circular=options.circular, binning=options.binning, cnv_delta=options.cnv_clustering
481	)
482
483	if chrom.DOC < 8:	3✔
UNCOV 484	logger.warning(	×
485	"The depth of coverage is below 8. sequana_coverage is"
486	" not optimised for such depth. You may want to "
487	" increase the threshold to avoid too many false detections"
488	)
489	logger.info(chrom.__str__())	3✔
490
491	# save summary and metrics
492	summary = results.get_summary(caller="sequana_coverage")	3✔
493	summary.to_json(f"{directory}/sequana_summary_coverage.json")	3✔
494
495	mu = summary.data.get("fit_mu", 0)	3✔
496	sigma = summary.data.get("fit_sigma", 0)	3✔
497	pi = summary.data.get("pi", 0)	3✔
498
499	logger.info(	3✔
500	"Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
501	% (round(mu, 3), round(sigma, 3), round(pi, 3))
502	)
503
504	# some information about the ROIs found
505	logger.info(	3✔
506	"Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
507	chrom.thresholds.low, chrom.thresholds.high, chrom.thresholds.low2, chrom.thresholds.high2
508	)
509	)
510	ROIs = results.get_rois() # results is a ChromosomeCovMultiChunk instance	3✔
511	logger.info("Number of ROIs found: {}".format(len(ROIs.df)))	3✔
512	logger.info(" - below average: {}".format(len(ROIs.get_low_rois())))	3✔
513	logger.info(" - above average: {}".format(len(ROIs.get_high_rois())))	3✔
514
515	# Create directory and save ROIs
516	ROIs.df.to_csv(f"{directory}/rois.csv")	3✔
517
518
519	logger.info(f"Creating report in {options.output_directory}. Please wait")	3✔
520
521	if chrom._mode == "chunks":	3✔
UNCOV 522	logger.warning("This chromosome is large. Plots in the HTML reports are skipped")	×
523
524	ChromosomeCoverageModule(	3✔
525	chrom,
526	datatable=CoverageModule.init_roi_datatable(ROIs),
527	options={"W": NW, "k": options.k, "ROIs": ROIs, "circular": options.circular},
528	command=" ".join(["sequana_coverage"] + sys.argv[1:]),
529	)
530
531
532	if __name__ == "__main__": # pragma: no cover
533	main()

cokelaer / sequana / 6851111417

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous