7117316673

Committed 06 Dec 2023 04:24PM UTC coverage: 75.413% (-0.07%) from 75.482%

Build # 7117316673

Build Type

push

github

Committed by

cokelaer

Commit Message

Update all codes to fulfill pre-commit hooks

Run Details

392 of 456 new or added lines in 116 files covered. (85.96%)

1819 existing lines in 87 files now uncovered.

13680 of 18140 relevant lines covered (75.41%)

1.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/sequana/scripts/substractor.py

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016 - Sequana Development Team
#
#  File author(s):
#    Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Substract genomes from the raw reads"""
import argparse
import glob
import os
import subprocess
import sys
from subprocess import STDOUT

import colorlog
from easydev.console import purple

from sequana import FastQ, logger
from sequana.bamtools import SAM

logger = colorlog.getLogger(__name__)


class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
    pass


epilog = purple(
    """
----

AUTHORS: Thomas Cokelaer
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """
)


class Options(argparse.ArgumentParser):
    def __init__(self, prog="sequana_substractor"):
        usage = (
            """%s reads (flag 256+4) saving the mapped reads in a file, and the unmapped in
another file\n"""
            % prog
        )
        usage += """usage2: %s --input test.fastq --reference Phix174.fa\n""" % prog
        usage += """

        """
        super(Options, self).__init__(usage=usage, prog=prog, epilog=epilog, formatter_class=CustomFormatter)

        self.add_argument("--input", dest="input", type=str, required=True, help="input FastQ file")
        self.add_argument("--output", dest="outfile", type=str, default="reads.fastq", help="output FastQ filename")

        self.add_argument("--reference", dest="reference", type=str, default=None)
        self.add_argument("--references", dest="references", type=str, nargs="+", default=[])

        self.add_argument(
            "--output-directory",
            dest="outdir",
            type=str,
            default="sequana_substractor",
            required=False,
            help="input fastq gzipped or not",
        )
        self.add_argument(
            "--mapper",
            dest="mapper",
            type=str,
            default="minimap2",
            choices=["bwa", "minimap2"],
            required=False,
            help="mapper minimap2 or bwa",
        )

        self.add_argument("--version", dest="version", action="store_true", help="print version")
        self.add_argument("--verbose", dest="verbose", action="store_true", help="set verbosity on")
        self.add_argument("--quiet", dest="verbose", action="store_false", help="set verbosity off")
        self.add_argument("--threads", dest="threads", type=int, default=4, help="threading")


class Substractor(object):
    def __init__(self, infile, references, outdir, mapper, threads=4):
        self.infile = infile
        self.references = references

        self.outdir = outdir
        self.threads = threads

        if os.path.exists(outdir):
            logger.info("using {} for output".format(outdir))
        else:
            os.mkdir(outdir)

        # this may be used later on for other mapper or methodology
        if mapper == "minimap2":
            self.mapper_cmd = "minimap2 -x map-pb -t {} {} {} -a > {}"
        elif mapper == "bwa":
            self.mapper_cmd = "bwa mem -M -t {} {} {} > {}"

        f = FastQ(self.infile)
        self.L = len(f)
        logger.info("Found {} reads in input FastQ file\n\n".format(self.L))

    def run(self, output_filename):
        MAPPED = 0
        # temporary directory
        for i, reference in enumerate(self.references):
            if i == 0:
                infile = self.infile
            else:
                infile = outfile

            # we only accept reference ending in .fa or .fasta
            assert reference.endswith(".fa") or reference.endswith(".fasta")

            # keep only the basename
            outfile = os.path.basename(reference)
            outfile = outfile.replace(".fa", "").replace(".fasta", "")
            tag = outfile[0:8]
            outfile = "{}/mapping_{}.sam".format(self.outdir, tag)

            cmd = self.mapper_cmd.format(self.threads, reference, infile, outfile)

            # Now we need to extract the fastq from the SAM file.

            logger.info("Removing {}. Mapping starting".format(reference))
            logger.info(cmd)
            from subprocess import PIPE

            process = subprocess.call(cmd, shell=True, stderr=PIPE)

            results = self.splitter_mapped_unmapped(outfile, tag)

            # keep track of total mapped reads
            MAPPED += results["mapped"]

            outfile = "{}/{}.unmapped.fastq".format(self.outdir, tag)

            logger.info("{} mapped. {} reads remaining".format(results["mapped"], results["unmapped"]))
            print()

        # now we copy the last unmapped file into reads.fastq
        cmd = "cp {} {}".format(outfile, output_filename)
        process = subprocess.call(cmd, shell=True)

        logger.info("Your final file: {} with {} reads".format(output_filename, results["unmapped"]))

        logger.info("all mapped and unmapped files: {}. Input was {}".format(MAPPED + results["unmapped"], self.L))

    def splitter_mapped_unmapped(self, filename, prefix):
        # helpful resources:
        # https://broadinstitute.github.io/picard/explain-flags.html
        logger.info("Creating 2 files (mapped and unmapped reads)")
        data = SAM(filename)

        results = {"flags": [], "mapped": 0, "unmapped": 0, "bad": 0}
        logger.info("Please wait while creating output files")

        with open("{}/{}.unmapped.fastq".format(self.outdir, prefix), "w") as fnosirv:
            with open("{}/{}.mapped.fastq".format(self.outdir, prefix), "w") as fsirv:
                for a in data:
                    if a.flag & 2048:  # suppl
                        # a bad read, we can just drop it
                        results["bad"] += 1
                    elif a.flag & 1024:  # PCR duplicate
                        results["bad"] += 1
                    elif a.flag & 256:  # secondary alignment
                        results["bad"] += 1
                    elif a.flag & 16:  # mapped
                        read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)
                        assert len(a.query_sequence) == len(a.qual)
                        fsirv.write(read)
                        results["mapped"] += 1
                    elif a.flag & 4:  # unmapped
                        read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)
                        assert len(a.query_sequence) == len(a.qual)
                        fnosirv.write(read)
                        results["unmapped"] += 1
                    elif a.flag == 0:  # mapped
                        read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)
                        assert len(a.query_sequence) == len(a.qual)
                        fsirv.write(read)
                        results["mapped"] += 1
                    else:
                        logger.warning("{} flag not handled".format(a.flag))
                    results["flags"].append(a.flag)
        return results


def main(args=None):
    if args is None:
        args = sys.argv[:]

    print(purple("Welcome to sequana_substractor"))
    print(purple("WARNING. TESTED ON LONG READS ONLY. EXPERIMENTAL"))
    user_options = Options(prog="sequana_substractor")
    if len(args) == 1:
        args.append("--help")

    if "--version" in sys.argv:
        import sequana

        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])
    logger.setLevel(options.level)

    # build the references list
    references = []
    if options.reference:
        references.append(options.reference)
    if options.references:
        references = options.references
    options.references = references

    references = []
    # expand globs if any
    for ref in options.references:
        references.extend(glob.glob(ref))

    logger.info("{} references provided: {}".format(len(references), ",".join(references)))

    # call the entire machinery here
    sub = Substractor(options.input, references, options.outdir, options.mapper, options.threads)
    sub.run(options.outfile)


if __name__ == "__main__":
    import sys

    main(sys.argv)

1	#
2	# This file is part of Sequana software
3	#
4	# Copyright (c) 2016 - Sequana Development Team
5	#
6	# File author(s):
7	# Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
8	#
9	# Distributed under the terms of the 3-clause BSD license.
10	# The full license is in the LICENSE file, distributed with this software.
11	#
12	# website: https://github.com/sequana/sequana
13	# documentation: http://sequana.readthedocs.io
14	#
15	##############################################################################
16	"""Substract genomes from the raw reads"""	×
17	import argparse	×
18	import glob	×
NEW 19	import os	×
20	import subprocess	×
NEW 21	import sys	×
NEW 22	from subprocess import STDOUT	×
23
NEW 24	import colorlog	×
25	from easydev.console import purple	×
26
NEW 27	from sequana import FastQ, logger	×
28	from sequana.bamtools import SAM	×
29
30	logger = colorlog.getLogger(__name__)	×
31
32
UNCOV 33	class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):	×
UNCOV 34	pass	×
35
36
UNCOV 37	epilog = purple(	×
38	"""
39	----
40
41	AUTHORS: Thomas Cokelaer
42	Documentation: http://sequana.readthedocs.io
43	Issues: http://github.com/sequana/sequana
44	"""
45	)
46
47
48	class Options(argparse.ArgumentParser):	×
UNCOV 49	def __init__(self, prog="sequana_substractor"):	×
UNCOV 50	usage = (	×
51	"""%s reads (flag 256+4) saving the mapped reads in a file, and the unmapped in
52	another file\n"""
53	% prog
54	)
UNCOV 55	usage += """usage2: %s --input test.fastq --reference Phix174.fa\n""" % prog	×
UNCOV 56	usage += """	×
57
58	"""
59	super(Options, self).__init__(usage=usage, prog=prog, epilog=epilog, formatter_class=CustomFormatter)	×
60
UNCOV 61	self.add_argument("--input", dest="input", type=str, required=True, help="input FastQ file")	×
62	self.add_argument("--output", dest="outfile", type=str, default="reads.fastq", help="output FastQ filename")	×
63
UNCOV 64	self.add_argument("--reference", dest="reference", type=str, default=None)	×
65	self.add_argument("--references", dest="references", type=str, nargs="+", default=[])	×
66
UNCOV 67	self.add_argument(	×
68	"--output-directory",
69	dest="outdir",
70	type=str,
71	default="sequana_substractor",
72	required=False,
73	help="input fastq gzipped or not",
74	)
UNCOV 75	self.add_argument(	×
76	"--mapper",
77	dest="mapper",
78	type=str,
79	default="minimap2",
80	choices=["bwa", "minimap2"],
81	required=False,
82	help="mapper minimap2 or bwa",
83	)
84
85	self.add_argument("--version", dest="version", action="store_true", help="print version")	×
86	self.add_argument("--verbose", dest="verbose", action="store_true", help="set verbosity on")	×
UNCOV 87	self.add_argument("--quiet", dest="verbose", action="store_false", help="set verbosity off")	×
UNCOV 88	self.add_argument("--threads", dest="threads", type=int, default=4, help="threading")	×
89
90
91	class Substractor(object):	×
92	def __init__(self, infile, references, outdir, mapper, threads=4):	×
UNCOV 93	self.infile = infile	×
94	self.references = references	×
95
UNCOV 96	self.outdir = outdir	×
97	self.threads = threads	×
98
UNCOV 99	if os.path.exists(outdir):	×
100	logger.info("using {} for output".format(outdir))	×
101	else:
UNCOV 102	os.mkdir(outdir)	×
103
104	# this may be used later on for other mapper or methodology
105	if mapper == "minimap2":	×
106	self.mapper_cmd = "minimap2 -x map-pb -t {} {} {} -a > {}"	×
UNCOV 107	elif mapper == "bwa":	×
108	self.mapper_cmd = "bwa mem -M -t {} {} {} > {}"	×
109
110	f = FastQ(self.infile)	×
UNCOV 111	self.L = len(f)	×
112	logger.info("Found {} reads in input FastQ file\n\n".format(self.L))	×
113
UNCOV 114	def run(self, output_filename):	×
115	MAPPED = 0	×
116	# temporary directory
117	for i, reference in enumerate(self.references):	×
UNCOV 118	if i == 0:	×
119	infile = self.infile	×
120	else:
UNCOV 121	infile = outfile	×
122
123	# we only accept reference ending in .fa or .fasta
UNCOV 124	assert reference.endswith(".fa") or reference.endswith(".fasta")	×
125
126	# keep only the basename
127	outfile = os.path.basename(reference)	×
128	outfile = outfile.replace(".fa", "").replace(".fasta", "")	×
UNCOV 129	tag = outfile[0:8]	×
130	outfile = "{}/mapping_{}.sam".format(self.outdir, tag)	×
131
UNCOV 132	cmd = self.mapper_cmd.format(self.threads, reference, infile, outfile)	×
133
134	# Now we need to extract the fastq from the SAM file.
135
136	logger.info("Removing {}. Mapping starting".format(reference))	×
UNCOV 137	logger.info(cmd)	×
138	from subprocess import PIPE	×
139
140	process = subprocess.call(cmd, shell=True, stderr=PIPE)	×
141
UNCOV 142	results = self.splitter_mapped_unmapped(outfile, tag)	×
143
144	# keep track of total mapped reads
145	MAPPED += results["mapped"]	×
146
147	outfile = "{}/{}.unmapped.fastq".format(self.outdir, tag)	×
148
UNCOV 149	logger.info("{} mapped. {} reads remaining".format(results["mapped"], results["unmapped"]))	×
UNCOV 150	print()	×
151
152	# now we copy the last unmapped file into reads.fastq
UNCOV 153	cmd = "cp {} {}".format(outfile, output_filename)	×
154	process = subprocess.call(cmd, shell=True)	×
155
156	logger.info("Your final file: {} with {} reads".format(output_filename, results["unmapped"]))	×
157
158	logger.info("all mapped and unmapped files: {}. Input was {}".format(MAPPED + results["unmapped"], self.L))	×
159
UNCOV 160	def splitter_mapped_unmapped(self, filename, prefix):	×
161	# helpful resources:
162	# https://broadinstitute.github.io/picard/explain-flags.html
UNCOV 163	logger.info("Creating 2 files (mapped and unmapped reads)")	×
164	data = SAM(filename)	×
165
UNCOV 166	results = {"flags": [], "mapped": 0, "unmapped": 0, "bad": 0}	×
167	logger.info("Please wait while creating output files")	×
168
169	with open("{}/{}.unmapped.fastq".format(self.outdir, prefix), "w") as fnosirv:	×
170	with open("{}/{}.mapped.fastq".format(self.outdir, prefix), "w") as fsirv:	×
UNCOV 171	for a in data:	×
172	if a.flag & 2048: # suppl	×
173	# a bad read, we can just drop it
174	results["bad"] += 1	×
175	elif a.flag & 1024: # PCR duplicate	×
176	results["bad"] += 1	×
177	elif a.flag & 256: # secondary alignment	×
178	results["bad"] += 1	×
179	elif a.flag & 16: # mapped	×
180	read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)	×
181	assert len(a.query_sequence) == len(a.qual)	×
182	fsirv.write(read)	×
183	results["mapped"] += 1	×
184	elif a.flag & 4: # unmapped	×
185	read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)	×
186	assert len(a.query_sequence) == len(a.qual)	×
187	fnosirv.write(read)	×
188	results["unmapped"] += 1	×
189	elif a.flag == 0: # mapped	×
190	read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)	×
191	assert len(a.query_sequence) == len(a.qual)	×
UNCOV 192	fsirv.write(read)	×
193	results["mapped"] += 1	×
194	else:
195	logger.warning("{} flag not handled".format(a.flag))	×
UNCOV 196	results["flags"].append(a.flag)	×
UNCOV 197	return results	×
198
199
200	def main(args=None):	×
UNCOV 201	if args is None:	×
202	args = sys.argv[:]	×
203
204	print(purple("Welcome to sequana_substractor"))	×
205	print(purple("WARNING. TESTED ON LONG READS ONLY. EXPERIMENTAL"))	×
206	user_options = Options(prog="sequana_substractor")	×
UNCOV 207	if len(args) == 1:	×
208	args.append("--help")	×
209
UNCOV 210	if "--version" in sys.argv:	×
211	import sequana	×
212
UNCOV 213	print(sequana.version)	×
214	sys.exit(0)	×
215
UNCOV 216	options = user_options.parse_args(args[1:])	×
UNCOV 217	logger.setLevel(options.level)	×
218
219	# build the references list
220	references = []	×
221	if options.reference:	×
222	references.append(options.reference)	×
223	if options.references:	×
UNCOV 224	references = options.references	×
225	options.references = references	×
226
227	references = []	×
228	# expand globs if any
UNCOV 229	for ref in options.references:	×
230	references.extend(glob.glob(ref))	×
231
UNCOV 232	logger.info("{} references provided: {}".format(len(references), ",".join(references)))	×
233
234	# call the entire machinery here
UNCOV 235	sub = Substractor(options.input, references, options.outdir, options.mapper, options.threads)	×
UNCOV 236	sub.run(options.outfile)	×
237
238
UNCOV 239	if __name__ == "__main__":	×
240	import sys	×
241
UNCOV 242	main(sys.argv)	×

cokelaer / sequana / 7117316673

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous