• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cokelaer / sequana / 7117316673

06 Dec 2023 04:24PM UTC coverage: 75.413% (-0.07%) from 75.482%
7117316673

push

github

cokelaer
Update all codes to fulfill pre-commit hooks

392 of 456 new or added lines in 116 files covered. (85.96%)

1819 existing lines in 87 files now uncovered.

13680 of 18140 relevant lines covered (75.41%)

1.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/sequana/scripts/substractor.py
1
#
2
#  This file is part of Sequana software
3
#
4
#  Copyright (c) 2016 - Sequana Development Team
5
#
6
#  File author(s):
7
#    Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
8
#
9
#  Distributed under the terms of the 3-clause BSD license.
10
#  The full license is in the LICENSE file, distributed with this software.
11
#
12
#  website: https://github.com/sequana/sequana
13
#  documentation: http://sequana.readthedocs.io
14
#
15
##############################################################################
16
"""Substract genomes from the raw reads"""
×
17
import argparse
×
18
import glob
×
NEW
19
import os
×
20
import subprocess
×
NEW
21
import sys
×
NEW
22
from subprocess import STDOUT
×
23

NEW
24
import colorlog
×
25
from easydev.console import purple
×
26

NEW
27
from sequana import FastQ, logger
×
28
from sequana.bamtools import SAM
×
29

30
logger = colorlog.getLogger(__name__)
×
31

32

UNCOV
33
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
×
UNCOV
34
    pass
×
35

36

UNCOV
37
epilog = purple(
×
38
    """
39
----
40

41
AUTHORS: Thomas Cokelaer
42
Documentation: http://sequana.readthedocs.io
43
Issues: http://github.com/sequana/sequana
44
        """
45
)
46

47

48
class Options(argparse.ArgumentParser):
×
UNCOV
49
    def __init__(self, prog="sequana_substractor"):
×
UNCOV
50
        usage = (
×
51
            """%s reads (flag 256+4) saving the mapped reads in a file, and the unmapped in
52
another file\n"""
53
            % prog
54
        )
UNCOV
55
        usage += """usage2: %s --input test.fastq --reference Phix174.fa\n""" % prog
×
UNCOV
56
        usage += """
×
57

58
        """
59
        super(Options, self).__init__(usage=usage, prog=prog, epilog=epilog, formatter_class=CustomFormatter)
×
60

UNCOV
61
        self.add_argument("--input", dest="input", type=str, required=True, help="input FastQ file")
×
62
        self.add_argument("--output", dest="outfile", type=str, default="reads.fastq", help="output FastQ filename")
×
63

UNCOV
64
        self.add_argument("--reference", dest="reference", type=str, default=None)
×
65
        self.add_argument("--references", dest="references", type=str, nargs="+", default=[])
×
66

UNCOV
67
        self.add_argument(
×
68
            "--output-directory",
69
            dest="outdir",
70
            type=str,
71
            default="sequana_substractor",
72
            required=False,
73
            help="input fastq gzipped or not",
74
        )
UNCOV
75
        self.add_argument(
×
76
            "--mapper",
77
            dest="mapper",
78
            type=str,
79
            default="minimap2",
80
            choices=["bwa", "minimap2"],
81
            required=False,
82
            help="mapper minimap2 or bwa",
83
        )
84

85
        self.add_argument("--version", dest="version", action="store_true", help="print version")
×
86
        self.add_argument("--verbose", dest="verbose", action="store_true", help="set verbosity on")
×
UNCOV
87
        self.add_argument("--quiet", dest="verbose", action="store_false", help="set verbosity off")
×
UNCOV
88
        self.add_argument("--threads", dest="threads", type=int, default=4, help="threading")
×
89

90

91
class Substractor(object):
×
92
    def __init__(self, infile, references, outdir, mapper, threads=4):
×
UNCOV
93
        self.infile = infile
×
94
        self.references = references
×
95

UNCOV
96
        self.outdir = outdir
×
97
        self.threads = threads
×
98

UNCOV
99
        if os.path.exists(outdir):
×
100
            logger.info("using {} for output".format(outdir))
×
101
        else:
UNCOV
102
            os.mkdir(outdir)
×
103

104
        # this may be used later on for other mapper or methodology
105
        if mapper == "minimap2":
×
106
            self.mapper_cmd = "minimap2 -x map-pb -t {} {} {} -a > {}"
×
UNCOV
107
        elif mapper == "bwa":
×
108
            self.mapper_cmd = "bwa mem -M -t {} {} {} > {}"
×
109

110
        f = FastQ(self.infile)
×
UNCOV
111
        self.L = len(f)
×
112
        logger.info("Found {} reads in input FastQ file\n\n".format(self.L))
×
113

UNCOV
114
    def run(self, output_filename):
×
115
        MAPPED = 0
×
116
        # temporary directory
117
        for i, reference in enumerate(self.references):
×
UNCOV
118
            if i == 0:
×
119
                infile = self.infile
×
120
            else:
UNCOV
121
                infile = outfile
×
122

123
            # we only accept reference ending in .fa or .fasta
UNCOV
124
            assert reference.endswith(".fa") or reference.endswith(".fasta")
×
125

126
            # keep only the basename
127
            outfile = os.path.basename(reference)
×
128
            outfile = outfile.replace(".fa", "").replace(".fasta", "")
×
UNCOV
129
            tag = outfile[0:8]
×
130
            outfile = "{}/mapping_{}.sam".format(self.outdir, tag)
×
131

UNCOV
132
            cmd = self.mapper_cmd.format(self.threads, reference, infile, outfile)
×
133

134
            # Now we need to extract the fastq from the SAM file.
135

136
            logger.info("Removing {}. Mapping starting".format(reference))
×
UNCOV
137
            logger.info(cmd)
×
138
            from subprocess import PIPE
×
139

140
            process = subprocess.call(cmd, shell=True, stderr=PIPE)
×
141

UNCOV
142
            results = self.splitter_mapped_unmapped(outfile, tag)
×
143

144
            # keep track of total mapped reads
145
            MAPPED += results["mapped"]
×
146

147
            outfile = "{}/{}.unmapped.fastq".format(self.outdir, tag)
×
148

UNCOV
149
            logger.info("{} mapped. {} reads remaining".format(results["mapped"], results["unmapped"]))
×
UNCOV
150
            print()
×
151

152
        # now we copy the last unmapped file into reads.fastq
UNCOV
153
        cmd = "cp {} {}".format(outfile, output_filename)
×
154
        process = subprocess.call(cmd, shell=True)
×
155

156
        logger.info("Your final file: {} with {} reads".format(output_filename, results["unmapped"]))
×
157

158
        logger.info("all mapped and unmapped files: {}. Input was {}".format(MAPPED + results["unmapped"], self.L))
×
159

UNCOV
160
    def splitter_mapped_unmapped(self, filename, prefix):
×
161
        # helpful resources:
162
        # https://broadinstitute.github.io/picard/explain-flags.html
UNCOV
163
        logger.info("Creating 2 files (mapped and unmapped reads)")
×
164
        data = SAM(filename)
×
165

UNCOV
166
        results = {"flags": [], "mapped": 0, "unmapped": 0, "bad": 0}
×
167
        logger.info("Please wait while creating output files")
×
168

169
        with open("{}/{}.unmapped.fastq".format(self.outdir, prefix), "w") as fnosirv:
×
170
            with open("{}/{}.mapped.fastq".format(self.outdir, prefix), "w") as fsirv:
×
UNCOV
171
                for a in data:
×
172
                    if a.flag & 2048:  # suppl
×
173
                        # a bad read, we can just drop it
174
                        results["bad"] += 1
×
175
                    elif a.flag & 1024:  # PCR duplicate
×
176
                        results["bad"] += 1
×
177
                    elif a.flag & 256:  # secondary alignment
×
178
                        results["bad"] += 1
×
179
                    elif a.flag & 16:  # mapped
×
180
                        read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)
×
181
                        assert len(a.query_sequence) == len(a.qual)
×
182
                        fsirv.write(read)
×
183
                        results["mapped"] += 1
×
184
                    elif a.flag & 4:  # unmapped
×
185
                        read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)
×
186
                        assert len(a.query_sequence) == len(a.qual)
×
187
                        fnosirv.write(read)
×
188
                        results["unmapped"] += 1
×
189
                    elif a.flag == 0:  # mapped
×
190
                        read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)
×
191
                        assert len(a.query_sequence) == len(a.qual)
×
UNCOV
192
                        fsirv.write(read)
×
193
                        results["mapped"] += 1
×
194
                    else:
195
                        logger.warning("{} flag not handled".format(a.flag))
×
UNCOV
196
                    results["flags"].append(a.flag)
×
UNCOV
197
        return results
×
198

199

200
def main(args=None):
×
UNCOV
201
    if args is None:
×
202
        args = sys.argv[:]
×
203

204
    print(purple("Welcome to sequana_substractor"))
×
205
    print(purple("WARNING. TESTED ON LONG READS ONLY. EXPERIMENTAL"))
×
206
    user_options = Options(prog="sequana_substractor")
×
UNCOV
207
    if len(args) == 1:
×
208
        args.append("--help")
×
209

UNCOV
210
    if "--version" in sys.argv:
×
211
        import sequana
×
212

UNCOV
213
        print(sequana.version)
×
214
        sys.exit(0)
×
215

UNCOV
216
    options = user_options.parse_args(args[1:])
×
UNCOV
217
    logger.setLevel(options.level)
×
218

219
    # build the references list
220
    references = []
×
221
    if options.reference:
×
222
        references.append(options.reference)
×
223
    if options.references:
×
UNCOV
224
        references = options.references
×
225
    options.references = references
×
226

227
    references = []
×
228
    # expand globs if any
UNCOV
229
    for ref in options.references:
×
230
        references.extend(glob.glob(ref))
×
231

UNCOV
232
    logger.info("{} references provided: {}".format(len(references), ",".join(references)))
×
233

234
    # call the entire machinery here
UNCOV
235
    sub = Substractor(options.input, references, options.outdir, options.mapper, options.threads)
×
UNCOV
236
    sub.run(options.outfile)
×
237

238

UNCOV
239
if __name__ == "__main__":
×
240
    import sys
×
241

UNCOV
242
    main(sys.argv)
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc