• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

broadinstitute / viral-core / 9844593225

10 Jun 2024 04:59PM UTC coverage: 65.885% (-0.01%) from 65.898%
9844593225

push

github

web-flow
index bam before calling samtools idxstats; warn user if input lacks reads (#102)

* index bam before calling samtools idxstats; warn user if input lacks reads

in `read_utils.py::minimap2_idxstats()`, warn the user if the input bam file lacks reads, and index the post-alignment post-filtering bam before calling `samtools idxstats` so `minimap2_idxstats()` succeeds if the input bam lacks reads. The minimap2 wrapper, `tools/minimap2.py::Minimap2::align_bam()`, tolerates empty input and indexes bam/cram output, but the post-filtering final bam in minimap2_idxstats was not being indexed, which led to a problematic exit-on-failure condition in a WDL command invoking the function. Also fix import of the InvalidBamHeaderError class used in several tool modules (add it to errors.py, and import the error subclasses where needed).
This also runs minimap2 alignment when a read group lacks reads, since the newer version of minimap2 seems to better tolerate empty RGs. This allows samtools idxstats to yield zeros across the board for all input sequences when no reads are present, rather than only emitting the catchall "*" (which can cause issues downstream where metrics are expected)

* add pandas to deps

* attempt to fix coverage reporting due to breaking changes in coveralls reporter

see:
https://github.com/coverallsapp/coverage-reporter/issues/124
https://github.com/coverallsapp/github-action/issues/205
additional refs:
https://github.com/coverallsapp/github-action?tab=readme-ov-file
https://github.com/coverallsapp/coverage-reporter#supported-coverage-report-formats

10 of 15 new or added lines in 4 files covered. (66.67%)

2 existing lines in 1 file now uncovered.

3061 of 4646 relevant lines covered (65.88%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

70.63
/read_utils.py
1
#!/usr/bin/env python3
2
"""
3
Utilities for working with sequence reads, such as converting formats and
4
fixing mate pairs.
5
"""
6

7
__author__ = "irwin@broadinstitute.org, dpark@broadinstitute.org"
1✔
8
__commands__ = []
1✔
9

10
import argparse
1✔
11
import logging
1✔
12
import math
1✔
13
import os
1✔
14
import tempfile
1✔
15
import shutil
1✔
16
import sys
1✔
17
import concurrent.futures
1✔
18
from contextlib import contextmanager
1✔
19
import functools
1✔
20

21
from Bio import SeqIO
1✔
22
import pysam
1✔
23

24
import util.cmd
1✔
25
import util.file
1✔
26
import util.misc
1✔
27
from util.file import mkstempfname
1✔
28
import tools.bwa
1✔
29
import tools.cdhit
1✔
30
import tools.picard
1✔
31
import tools.samtools
1✔
32
import tools.minimap2
1✔
33
import tools.mvicuna
1✔
34
import tools.prinseq
1✔
35
import tools.novoalign
1✔
36
import tools.gatk
1✔
37

38
log = logging.getLogger(__name__)
1✔
39

40

41
# ===============================
42
# ***  index_fasta_samtools   ***
43
# ===============================
44

45

46
def parser_index_fasta_samtools(parser=argparse.ArgumentParser()):
    # Build the CLI parser for the index_fasta_samtools command and attach
    # its entry point (main_index_fasta_samtools).
    parser.add_argument('inFasta', help='Reference genome, FASTA format.')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None)))
    util.cmd.attach_main(parser, main_index_fasta_samtools)
    return parser
51

52

53
def main_index_fasta_samtools(args):
    '''Index a reference genome for Samtools.'''
    # Overwrite any pre-existing .fai index.
    samtools = tools.samtools.SamtoolsTool()
    samtools.faidx(args.inFasta, overwrite=True)
    return 0


__commands__.append(('index_fasta_samtools', parser_index_fasta_samtools))
60

61
# =============================
62
# ***  index_fasta_picard   ***
63
# =============================
64

65

66
def parser_index_fasta_picard(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the index_fasta_picard command."""
    parser.add_argument('inFasta', help='Input reference genome, FASTA format.')
    parser.add_argument('--JVMmemory', default=tools.picard.CreateSequenceDictionaryTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--picardOptions', default=[], nargs='*', help="Optional arguments to Picard's CreateSequenceDictionary, OPTIONNAME=value ...")
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_index_fasta_picard)
    return parser
82

83

84
def main_index_fasta_picard(args):
    '''Create an index file for a reference genome suitable for Picard/GATK.'''
    indexer = tools.picard.CreateSequenceDictionaryTool()
    indexer.execute(
        args.inFasta,
        overwrite=True,
        picardOptions=args.picardOptions,
        JVMmemory=args.JVMmemory
    )
    return 0


__commands__.append(('index_fasta_picard', parser_index_fasta_picard))
95

96
# =============================
97
# ***  mkdup_picard   ***
98
# =============================
99

100

101
def parser_mkdup_picard(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the mkdup_picard (mark/remove duplicates) command."""
    parser.add_argument('inBams', help='Input reads, BAM format.', nargs='+')
    parser.add_argument('outBam', help='Output reads, BAM format.')
    parser.add_argument('--outMetrics', help='Output metrics file. Default is to dump to a temp file.', default=None)
    parser.add_argument('--remove', help='Instead of marking duplicates, remove them entirely (default: %(default)s)', default=False, action='store_true', dest='remove')
    parser.add_argument('--JVMmemory', default=tools.picard.MarkDuplicatesTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--picardOptions', default=[], nargs='*', help="Optional arguments to Picard's MarkDuplicates, OPTIONNAME=value ...")
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_mkdup_picard)
    return parser
126

127

128
def main_mkdup_picard(args):
    '''Mark or remove duplicate reads from BAM file.'''
    picard_opts = list(args.picardOptions)
    if args.remove:
        # REMOVE_DUPLICATES goes first; user-supplied options follow.
        picard_opts.insert(0, 'REMOVE_DUPLICATES=true')
    tools.picard.MarkDuplicatesTool().execute(
        args.inBams, args.outBam, args.outMetrics,
        picardOptions=picard_opts, JVMmemory=args.JVMmemory
    )
    return 0


__commands__.append(('mkdup_picard', parser_mkdup_picard))
142

143
# =============================
144
# ***  revert_bam_picard   ***
145
# =============================
146

147
def parser_revert_sam_common(parser=argparse.ArgumentParser()):
    """Attach the RevertSam-related options shared by several commands."""
    parser.add_argument('--clearTags', dest='clear_tags', default=False, action='store_true',
                        help='When supplying an aligned input file, clear the per-read attribute tags')
    parser.add_argument('--tagsToClear', type=str, nargs='+', dest='tags_to_clear',
                        default=['XT', 'X0', 'X1', 'XA', 'AM', 'SM', 'BQ', 'CT', 'XN', 'OC', 'OP'],
                        help='A space-separated list of tags to remove from all reads in the input bam file (default: %(default)s)')
    parser.add_argument(
        '--doNotSanitize',
        dest='do_not_sanitize',
        action='store_true',
        help="""When being reverted, picard's SANITIZE=true
                is set unless --doNotSanitize is given. 
                Sanitization is a destructive operation that removes reads so the bam file is consistent.
                From the picard documentation:
                    'Reads discarded include (but are not limited to) paired reads with missing mates, duplicated records, 
                    records with mismatches in length of bases and qualities.'
                For more information see:
                    https://broadinstitute.github.io/picard/command-line-overview.html#RevertSam
                """
    )
    # --JVMmemory may already be present when this is composed with another
    # parser builder; tolerate the duplicate registration.
    try:
        parser.add_argument('--JVMmemory', default=tools.picard.RevertSamTool.jvmMemDefault,
                            help='JVM virtual memory size (default: %(default)s)')
    except argparse.ArgumentError:
        pass
    return parser
177

178
def parser_revert_bam_picard(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the revert_bam_picard command."""
    parser.add_argument('inBam', help='Input reads, BAM format.')
    parser.add_argument('outBam', help='Output reads, BAM format.')
    parser.add_argument('--JVMmemory', default=tools.picard.RevertSamTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--picardOptions', default=[], nargs='*', help="Optional arguments to Picard's RevertSam, OPTIONNAME=value ...")
    # add the revert options shared with other commands
    parser = parser_revert_sam_common(parser)
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_revert_bam_picard, split_args=True)
    return parser
196

197

198
def main_revert_bam_picard(inBam, outBam, clear_tags=False, tags_to_clear=None, picardOptions=None, do_not_sanitize=False, JVMmemory=None):
    '''Revert BAM to raw reads.

       Runs Picard RevertSam on inBam, optionally clearing per-read attribute
       tags (--clearTags / --tagsToClear). Unless do_not_sanitize is set,
       SANITIZE=true is added (matching the --doNotSanitize help text) when no
       SANITIZE option was supplied explicitly.
    '''
    # copy so the caller's list is never mutated
    picardOptions = list(picardOptions) if picardOptions else []
    tags_to_clear = tags_to_clear or []

    if clear_tags:
        for tag in tags_to_clear:
            picardOptions.append("ATTRIBUTE_TO_CLEAR={}".format(tag))

    # Bug fix: this check used to live inside the clear_tags tag loop, so
    # SANITIZE=true was never applied unless --clearTags was also given,
    # contradicting the documented behavior of --doNotSanitize.
    if not do_not_sanitize and not any(opt.startswith("SANITIZE") for opt in picardOptions):
        picardOptions.append('SANITIZE=true')

    tools.picard.RevertSamTool().execute(inBam, outBam, picardOptions=picardOptions, JVMmemory=JVMmemory)
    return 0
__commands__.append(('revert_bam_picard', parser_revert_bam_picard))
213

214
@contextmanager
def revert_bam_if_aligned(inBam, revert_bam=None, clear_tags=True, tags_to_clear=None, picardOptions=None, JVMmemory="4g", sanitize=False):
    '''
        Context manager: revert the bam file if it appears to be aligned
        (header contains one or more SQ lines) and yield the path to use
        downstream; an unaligned input is yielded unchanged.
        If a revert_bam file path is specified, it is used for the reverted
        output, otherwise a temp file is created (and deleted on exit).
    '''
    revertBamOut = revert_bam if revert_bam else util.file.mkstempfname('.bam')
    # copy so the caller's list is never mutated
    picardOptions = list(picardOptions) if picardOptions else []
    tags_to_clear = tags_to_clear or []

    outBam = inBam
    with pysam.AlignmentFile(inBam, 'rb', check_sq=False) as bam:
        # if it looks like the bam is aligned, revert it
        if 'SQ' in bam.header and len(bam.header['SQ'])>0:
            if clear_tags:
                for tag in tags_to_clear:
                    picardOptions.append("ATTRIBUTE_TO_CLEAR={}".format(tag))

            if not any(opt.startswith("SORT_ORDER") for opt in picardOptions):
                picardOptions.append('SORT_ORDER=queryname')

            if sanitize and not any(opt.startswith("SANITIZE") for opt in picardOptions):
                picardOptions.append('SANITIZE=true')

            # Bug fix: JVMmemory was accepted by this function but never
            # forwarded to RevertSam; pass it through.
            tools.picard.RevertSamTool().execute(
                inBam, revertBamOut, picardOptions=picardOptions, JVMmemory=JVMmemory
            )
            outBam = revertBamOut
        else:
            # if we don't need to produce a revertBam file
            # but the user has specified one anyway
            # simply touch the output
            if revert_bam:
                log.warning("An output was specified for 'revertBam', but the input is unaligned, so RevertSam was not needed. Touching the output.")
                util.file.touch(revertBamOut)
                # TODO: error out? run RevertSam anyway?

    yield outBam

    # remove the temp file only when the caller did not ask for a persistent output
    if revert_bam is None:
        os.unlink(revertBamOut)
256

257
# =========================
258
# ***  generic picard   ***
259
# =========================
260

261

262
def parser_picard(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the generic picard runner command."""
    parser.add_argument('command', help='picard command')
    parser.add_argument('--JVMmemory', default=tools.picard.PicardTools.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--picardOptions', default=[], nargs='*', help='Optional arguments to Picard, OPTIONNAME=value ...')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_picard)
    return parser
277

278

279
def main_picard(args):
    '''Generic Picard runner.'''
    runner = tools.picard.PicardTools()
    runner.execute(args.command, picardOptions=args.picardOptions, JVMmemory=args.JVMmemory)
    return 0


__commands__.append(('picard', parser_picard))
286

287
# ===================
288
# ***  sort_bam   ***
289
# ===================
290

291

292
def parser_sort_bam(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the sort_bam command."""
    parser.add_argument('inBam', help='Input bam file.')
    parser.add_argument('outBam', help='Output bam file, sorted.')
    parser.add_argument('sortOrder',
                        help='How to sort the reads. [default: %(default)s]',
                        choices=tools.picard.SortSamTool.valid_sort_orders,
                        default=tools.picard.SortSamTool.default_sort_order)
    parser.add_argument('--index', help='Index outBam (default: %(default)s)', default=False, action='store_true', dest='index')
    parser.add_argument('--md5', help='MD5 checksum outBam (default: %(default)s)', default=False, action='store_true', dest='md5')
    parser.add_argument('--JVMmemory', default=tools.picard.SortSamTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--picardOptions', default=[], nargs='*', help="Optional arguments to Picard's SortSam, OPTIONNAME=value ...")
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_sort_bam)
    return parser
329

330

331
def main_sort_bam(args):
    '''Sort BAM file'''
    # Translate convenience flags into Picard options, prepending each so the
    # final order matches the historical behavior (MD5, INDEX, then user opts).
    opts = list(args.picardOptions)
    if args.index:
        opts.insert(0, 'CREATE_INDEX=true')
    if args.md5:
        opts.insert(0, 'CREATE_MD5_FILE=true')
    tools.picard.SortSamTool().execute(
        args.inBam, args.outBam, args.sortOrder,
        picardOptions=opts, JVMmemory=args.JVMmemory
    )
    return 0


__commands__.append(('sort_bam', parser_sort_bam))
346

347
# ====================
348
# ***  downsample_bams  ***
349
# ====================
350

351
def parser_downsample_bams(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the downsample_bams command."""
    parser.add_argument('in_bams', help='Input bam files.', nargs='+')
    parser.add_argument('--outPath', dest='out_path', type=str, help="""Output path. If not provided, 
                        downsampled bam files will be written to the same paths as each 
                        source bam file""")
    parser.add_argument('--readCount', dest='specified_read_count', type=int, help='The number of reads to downsample to.')
    # de-duplication before and after downsampling are mutually exclusive
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--deduplicateBefore', action='store_true', dest='deduplicate_before', help='de-duplicate reads before downsampling.')
    group.add_argument('--deduplicateAfter', action='store_true', dest='deduplicate_after', help='de-duplicate reads after downsampling.')
    parser.add_argument('--JVMmemory', default=tools.picard.DownsampleSamTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--picardOptions', default=[], nargs='*', help="Optional arguments to Picard's DownsampleSam, OPTIONNAME=value ...")
    util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_downsample_bams, split_args=True)
    return parser
374

375

376
def main_downsample_bams(in_bams, out_path, specified_read_count=None, deduplicate_before=False, deduplicate_after=False, picardOptions=None, threads=None, JVMmemory=None):
    '''Downsample multiple bam files to the smallest read count in common, or to the specified count.

       Optionally de-duplicate before or after downsampling. Outputs are written
       under out_path if given, otherwise next to each input, with a suffix
       describing the operations applied.
    '''
    if picardOptions is None:
        picardOptions = []
    # NOTE(review): picardOptions is accepted for CLI compatibility but is not
    # currently forwarded to any Picard invocation below.

    def get_read_counts(bams):
        # Map each bam path to its read count, counting in parallel.
        samtools = tools.samtools.SamtoolsTool()
        read_counts = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=util.misc.sanitize_thread_count(threads)) as executor:
            for x, result in zip(bams, executor.map(samtools.count, bams)):
                read_counts[x] = int(result)

        return read_counts

    def get_downsample_target_count(bams, readcount_requested):
        # The target is the requested count (validated against the smallest
        # input) or, by default, the read count of the smallest input file.
        read_counts = get_read_counts(bams)
        downsample_target = 0
        if readcount_requested is not None:
            downsample_target = readcount_requested
            if downsample_target > min(read_counts.values()):
                raise ValueError("The smallest file has %s reads, which is less than the downsample target specified, %s. Please reduce the target count or omit it to use the read count of the smallest input file." % (min(read_counts.values()), downsample_target))
        else:
            downsample_target = min(read_counts.values())
        return downsample_target

    def downsample_bams(data_pairs, downsample_target, threads=None, JVMmemory=None):
        # Downsample each (input, output) pair in parallel, splitting the JVM
        # memory budget evenly across worker threads.
        downsamplesam = tools.picard.DownsampleSamTool()
        workers = util.misc.sanitize_thread_count(threads)
        JVMmemory = JVMmemory if JVMmemory else tools.picard.DownsampleSamTool.jvmMemDefault

        jvm_worker_memory_mb = str(int(util.misc.convert_size_str(JVMmemory,"m")[:-1])//workers)+"m"
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_file = {executor.submit(downsamplesam.downsample_to_approx_count, *(list(fp)+[downsample_target]), JVMmemory=jvm_worker_memory_mb): fp[0] for fp in data_pairs}
            for future in concurrent.futures.as_completed(future_to_file):
                future.result()  # re-raise any worker exception

    def dedup_bams(data_pairs, threads=None, JVMmemory=None):
        # De-duplicate each (input, output) pair in parallel, splitting the JVM
        # memory budget evenly across worker threads.
        workers = util.misc.sanitize_thread_count(threads)
        JVMmemory = JVMmemory if JVMmemory else tools.picard.DownsampleSamTool.jvmMemDefault
        jvm_worker_memory_mb = str(int(util.misc.convert_size_str(JVMmemory,"m")[:-1])//workers)+"m"
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_file = {executor.submit(rmdup_mvicuna_bam, *fp, JVMmemory=jvm_worker_memory_mb): fp[0] for fp in data_pairs}
            for future in concurrent.futures.as_completed(future_to_file):
                future.result()  # re-raise any worker exception

    data_pairs = []
    if deduplicate_before:
        with util.file.tempfnames(suffixes=[ '{}.dedup.bam'.format(os.path.splitext(os.path.basename(x))[0]) for x in in_bams]) as deduped_bams:
            data_pairs = list(zip(in_bams, deduped_bams))
            dedup_bams(data_pairs, threads=threads, JVMmemory=JVMmemory)
            # the target must be computed from the de-duplicated files
            downsample_target = get_downsample_target_count(deduped_bams, specified_read_count)

            if out_path:
                util.file.mkdir_p(out_path)
                data_pairs = list(zip(deduped_bams, [os.path.join(out_path, os.path.splitext(os.path.basename(x))[0]+".dedup.downsampled-{}.bam".format(downsample_target)) for x in in_bams]))
            else:
                data_pairs = list(zip(deduped_bams, [os.path.splitext(x)[0]+".dedup.downsampled-{}.bam".format(downsample_target) for x in in_bams]))
            downsample_bams(data_pairs, downsample_target, threads=threads, JVMmemory=JVMmemory)
    else:
        downsample_target = get_downsample_target_count(in_bams, specified_read_count)
        if deduplicate_after:
            log.warning("--deduplicateAfter has been specified. Read count of output files is not guaranteed.")
            with util.file.tempfnames(suffixes=[ '{}.downsampled-{}.bam'.format(os.path.splitext(os.path.basename(x))[0], downsample_target) for x in in_bams]) as downsampled_tmp_bams:
                data_pairs = list(zip(in_bams, downsampled_tmp_bams))
                downsample_bams(data_pairs, downsample_target, threads=threads, JVMmemory=JVMmemory)

                if out_path:
                    util.file.mkdir_p(out_path)
                    # Bug fix: .format(downsample_target) was previously applied
                    # to the result of os.path.join() rather than the basename,
                    # which misbehaves if out_path itself contains '{}'.
                    data_pairs = list(zip(downsampled_tmp_bams, [os.path.join(out_path, os.path.splitext(os.path.basename(x))[0]+".downsampled-{}.dedup.bam".format(downsample_target)) for x in in_bams]))
                else:
                    data_pairs = list(zip(downsampled_tmp_bams, [os.path.splitext(x)[0]+".downsampled-{}.dedup.bam".format(downsample_target) for x in in_bams]))
                dedup_bams(data_pairs, threads=threads, JVMmemory=JVMmemory)
        else:
            if out_path:
                util.file.mkdir_p(out_path)
                data_pairs = list(zip(in_bams, [os.path.join(out_path, os.path.splitext(os.path.basename(x))[0]+".downsampled-{}.bam".format(downsample_target)) for x in in_bams]))
            else:
                data_pairs = list(zip(in_bams, [os.path.splitext(x)[0]+".downsampled-{}.bam".format(downsample_target) for x in in_bams]))
            downsample_bams(data_pairs, downsample_target, threads=threads, JVMmemory=JVMmemory)
    return 0

__commands__.append(('downsample_bams', parser_downsample_bams))
469

470
# ====================
471
# ***  merge_bams  ***
472
# ====================
473

474

475
def parser_merge_bams(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the merge_bams command."""
    parser.add_argument('inBams', help='Input bam files.', nargs='+')
    parser.add_argument('outBam', help='Output bam file.')
    parser.add_argument('--JVMmemory', default=tools.picard.MergeSamFilesTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--picardOptions', default=[], nargs='*', help="Optional arguments to Picard's MergeSamFiles, OPTIONNAME=value ...")
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_merge_bams)
    return parser
492

493

494
def main_merge_bams(args):
    '''Merge multiple BAMs into one'''
    # user options first, then USE_THREADING (same order as before)
    merge_opts = list(args.picardOptions)
    merge_opts.append('USE_THREADING=true')
    tools.picard.MergeSamFilesTool().execute(args.inBams, args.outBam, picardOptions=merge_opts, JVMmemory=args.JVMmemory)
    return 0


__commands__.append(('merge_bams', parser_merge_bams))
502

503
# ====================
504
# ***  filter_bam  ***
505
# ====================
506

507

508
def parser_filter_bam(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the filter_bam command."""
    parser.add_argument('inBam', help='Input bam file.')
    parser.add_argument('readList', help='Input file of read IDs.')
    parser.add_argument('outBam', help='Output bam file.')
    parser.add_argument(
        '--exclude',
        help="""If specified, readList is a list of reads to remove from input.
            Default behavior is to treat readList as an inclusion list (all unnamed
            reads are removed).""",
        default=False, action='store_true', dest='exclude'
    )
    parser.add_argument('--JVMmemory', default=tools.picard.FilterSamReadsTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--picardOptions', default=[], nargs='*', help="Optional arguments to Picard's FilterSamReads, OPTIONNAME=value ...")
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_filter_bam)
    return parser
535

536

537
def main_filter_bam(args):
    '''Filter BAM file by read name'''
    tools.picard.FilterSamReadsTool().execute(
        args.inBam, args.exclude, args.readList, args.outBam,
        picardOptions=args.picardOptions, JVMmemory=args.JVMmemory
    )
    return 0


__commands__.append(('filter_bam', parser_filter_bam))
551

552

553

554
# =======================
555
# ***  fastq_to_bam   ***
556
# =======================
557

558

559
def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName=None, header=None, JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault, picardOptions=None):
    ''' Convert a pair of fastq paired-end read files and optional text header
        to a single bam file.
    '''
    picardOptions = picardOptions or []

    # With a header, FastqToSam writes to a temp bam that is subsequently
    # reheadered into outBam; otherwise it writes outBam directly.
    fastqToSamOut = mkstempfname('.bam') if header else outBam
    if sampleName is None:
        sampleName = 'Dummy'    # Will get overwritten by rehead command
    if header:
        # With the header option, rehead will be called after FastqToSam.
        # This will invalidate any md5 file, which would be a slow to construct
        # on our own, so just disallow and let the caller run md5sum if desired.
        if any(opt.lower() == 'CREATE_MD5_FILE=True'.lower() for opt in picardOptions):
            raise Exception("""CREATE_MD5_FILE is not allowed with '--header.'""")

    tools.picard.FastqToSamTool().execute(
        inFastq1, inFastq2, sampleName, fastqToSamOut,
        picardOptions=picardOptions, JVMmemory=JVMmemory
    )

    if header:
        tools.samtools.SamtoolsTool().reheader(fastqToSamOut, header, outBam)

    return 0
596

597

598
def parser_fastq_to_bam(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the fastq_to_bam command."""
    parser.add_argument('inFastq1', help='Input fastq file; 1st end of paired-end reads.')
    parser.add_argument('inFastq2', help='Input fastq file; 2nd end of paired-end reads.')
    parser.add_argument('outBam', help='Output bam file.')
    # exactly one of --sampleName / --header must be supplied
    headerGroup = parser.add_mutually_exclusive_group(required=True)
    headerGroup.add_argument('--sampleName', help='Sample name to insert into the read group header.')
    headerGroup.add_argument('--header', help='Optional text file containing header.')
    parser.add_argument('--JVMmemory', default=tools.picard.FastqToSamTool.jvmMemDefault, help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument(
        '--picardOptions',
        default=[], nargs='*',
        help='''Optional arguments to Picard\'s FastqToSam,
                OPTIONNAME=value ...  Note that header-related options will be
                overwritten by HEADER if present.'''
    )
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, fastq_to_bam, split_args=True)
    return parser


__commands__.append(('fastq_to_bam', parser_fastq_to_bam))
624

625

626
def join_paired_fastq(
        inFastqs,
        output,
        outFormat
):
    ''' Join paired fastq reads into single reads with Ns
    '''
    fastq_paths = list(inFastqs)
    # '-' means write to stdout
    dest = sys.stdout if output == '-' else output
    SeqIO.write(util.file.join_paired_fastq(fastq_paths, output_format=outFormat), dest, outFormat)
    return 0
638

639

640
def parser_join_paired_fastq(parser=argparse.ArgumentParser()):
    """Build the CLI parser for the join_paired_fastq command."""
    parser.add_argument('output', help='Output file.')
    parser.add_argument('inFastqs', nargs='+', help='Input fastq file (2 if paired, 1 if interleaved)')
    parser.add_argument('--outFormat', default='fastq', help='Output file format.')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, join_paired_fastq, split_args=True)
    return parser

__commands__.append(('join_paired_fastq', parser_join_paired_fastq))
649

650

651
# ======================
# ***  split_reads   ***
# ======================
# Module-level defaults for the split_reads section below.
defaultIndexLen = 2      # presumably the default index/barcode length — TODO confirm against split_reads usage (not fully in view)
defaultMaxReads = 1000   # presumably the default max reads per output chunk — TODO confirm
defaultFormat = 'fastq'  # default output format name
657

658

659
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    Args:
        inBam: input BAM, which must be sorted in queryname order.
        outBams: list of output BAM file paths; reads are distributed
            evenly across them, keeping mates adjacent.

    Raises:
        Exception: if the BAM header does not declare SO:queryname.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    # maxReads = totalReadCount / num files, but round up to the nearest
    # even number in order to keep read pairs together (assuming the input
    # is sorted in query order and has no unmated reads, which can be
    # accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if 'SO:queryname' not in header[0]:
        # bug fix: message previously read "queryame"
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    samtools.view(['-@', '3'], inBam, bigsam)

    # split bigsam into little ones
    with util.file.open_or_gzopen(bigsam, 'rt') as inf:
        for outBam in outBams:
            # lazy %-args: only formatted if INFO is actually emitted
            log.info("preparing file %s", outBam)
            tmp_sam_reads = mkstempfname('.sam')
            with open(tmp_sam_reads, 'wt') as outf:
                # every chunk gets a full copy of the original header
                for row in header:
                    outf.write('\t'.join(row) + '\n')
                for _ in range(maxReads):
                    line = inf.readline()
                    if not line:
                        break
                    outf.write(line)
                # the last output file absorbs any remainder
                if outBam == outBams[-1]:
                    for line in inf:
                        outf.write(line)
            picard.execute(
                "SamFormatConverter", [
                    'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'
                ],
                JVMmemory='512m'
            )
            os.unlink(tmp_sam_reads)
    os.unlink(bigsam)

707

708
def parser_split_bam(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the split_bam command.'''
    parser.add_argument('inBam', help='Input BAM file.')
    parser.add_argument('outBams', nargs='+', help='Output BAM files')
    common_opts = (('loglevel', None), ('version', None), ('tmp_dir', None))
    util.cmd.common_args(parser, common_opts)
    util.cmd.attach_main(parser, split_bam, split_args=True)
    return parser

715

716
# register the split_bam subcommand with the module's CLI dispatch table
__commands__.append(('split_bam', parser_split_bam))
717

718
# =======================
719
# ***  reheader_bam   ***
720
# =======================
721

722

723
def parser_reheader_bam(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the reheader_bam command.'''
    parser.add_argument('inBam', help='Input reads, BAM format.')
    parser.add_argument('rgMap', help='Tabular file containing three columns: field, old, new.')
    parser.add_argument('outBam', help='Output reads, BAM format.')
    common_opts = (('loglevel', None), ('version', None), ('tmp_dir', None))
    util.cmd.common_args(parser, common_opts)
    util.cmd.attach_main(parser, main_reheader_bam)
    return parser
730

731

732
def main_reheader_bam(args):
    ''' Copy a BAM file (inBam to outBam) while renaming elements of the BAM header.
        The mapping file specifies which (key, old value, new value) mappings. For
        example:
            LB        lib1        lib_one
            SM        sample1        Sample_1
            SM        sample2        Sample_2
            SM        sample3        Sample_3
            CN        broad        BI
    '''
    # hoisted: reuse one SamtoolsTool wrapper for both getHeader and reheader
    samtools = tools.samtools.SamtoolsTool()
    # read mapping file; keys/values are stored as "TAG:value" strings so they
    # can be matched directly against @RG header fields
    mapper = dict((a + ':' + b, a + ':' + c) for a, b, c in util.file.read_tabfile(args.rgMap))
    # read and convert bam header
    header_file = mkstempfname('.sam')
    with open(header_file, 'wt') as outf:
        for row in samtools.getHeader(args.inBam):
            if row[0] == '@RG':
                # only @RG lines are rewritten; other header lines pass through
                row = [mapper.get(x, x) for x in row]
            outf.write('\t'.join(row) + '\n')
    # write new bam with new header
    samtools.reheader(args.inBam, header_file, args.outBam)
    os.unlink(header_file)
    return 0
755

756

757
# register the reheader_bam subcommand with the module's CLI dispatch table
__commands__.append(('reheader_bam', parser_reheader_bam))
758

759

760
def parser_reheader_bams(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the reheader_bams command.'''
    parser.add_argument('rgMap', help='Tabular file containing three columns: field, old, new.')
    common_opts = (('loglevel', None), ('version', None), ('tmp_dir', None))
    util.cmd.common_args(parser, common_opts)
    util.cmd.attach_main(parser, main_reheader_bams)
    return parser
765

766

767
def main_reheader_bams(args):
    ''' Copy BAM files while renaming elements of the BAM header.
        The mapping file specifies which (key, old value, new value) mappings. For
        example:
            LB  lib1  lib_one
            SM  sample1 Sample_1
            SM  sample2 Sample_2
            SM  sample3 Sample_3
            CN  broad   BI
            FN  in1.bam out1.bam
            FN  in2.bam out2.bam
    '''
    # read mapping file once (it was previously parsed twice); FN rows name
    # input/output file pairs, all other rows are header-field renames
    rows = list(util.file.read_tabfile(args.rgMap))
    mapper = dict((a + ':' + b, a + ':' + c) for a, b, c in rows if a != 'FN')
    files = list((b, c) for a, b, c in rows if a == 'FN')
    # hoisted: one SamtoolsTool wrapper instead of one per call
    samtools = tools.samtools.SamtoolsTool()
    header_file = mkstempfname('.sam')
    # read and convert bam headers
    for inBam, outBam in files:
        if os.path.isfile(inBam):
            with open(header_file, 'wt') as outf:
                for row in samtools.getHeader(inBam):
                    if row[0] == '@RG':
                        row = [mapper.get(x, x) for x in row]
                    outf.write('\t'.join(row) + '\n')
            # write new bam with new header
            samtools.reheader(inBam, header_file, outBam)
    os.unlink(header_file)
    return 0
795

796

797
# register the reheader_bams subcommand with the module's CLI dispatch table
__commands__.append(('reheader_bams', parser_reheader_bams))
798

799
# ============================
800
# ***  dup_remove_mvicuna  ***
801
# ============================
802

803

804
def mvicuna_fastqs_to_readlist(inFastq1, inFastq2, readList):
    '''Deduplicate fastq input with M-Vicuna and append surviving read IDs to readList.'''
    # Run M-Vicuna on FASTQ files
    dedup_fq1 = mkstempfname('.1.fastq')
    dedup_fq2 = mkstempfname('.2.fastq')
    mvicuna = tools.mvicuna.MvicunaTool()
    if inFastq2 is None or os.path.getsize(inFastq2) < 10:
        # effectively-empty mate file: treat input as single-ended
        mvicuna.rmdup_single(inFastq1, dedup_fq1)
    else:
        mvicuna.rmdup((inFastq1, inFastq2), (dedup_fq1, dedup_fq2), None)

    # Append IDs of the reads that survived dedup to the running keep-list
    with open(readList, 'at') as outf:
        for fq in (dedup_fq1, dedup_fq2):
            with util.file.open_or_gzopen(fq, 'rt') as inf:
                for line_num, line in enumerate(inf):
                    if line_num % 4 != 0:
                        continue  # only every 4th line is a fastq header carrying the ID
                    idVal = line.rstrip('\n')[1:]
                    if idVal.endswith('/1'):
                        # paired read: record the ID with the mate suffix stripped
                        outf.write(idVal[:-2] + '\n')
                    elif not idVal.endswith('/2'):
                        # single-end reads do not have /1 /2 mate suffix
                        # so pass through their IDs
                        outf.write(idVal + '\n')
    os.unlink(dedup_fq1)
    os.unlink(dedup_fq2)
830

831

832
def rmdup_cdhit_bam(inBam, outBam, max_mismatches=None, jvm_memory=None):
    ''' Remove duplicate reads from BAM file using cd-hit-dup.

    Args:
        inBam: input reads, BAM format; split per library before dedup.
        outBam: output deduplicated reads, BAM format.
        max_mismatches: cd-hit-dup -e value (defaults to 4).
        jvm_memory: JVM heap size passed through to the Picard steps.
    '''
    max_mismatches = max_mismatches or 4
    tmp_dir = tempfile.mkdtemp()

    # dedup must be per-library, so split the BAM by @RG LB first
    tools.picard.SplitSamByLibraryTool().execute(inBam, tmp_dir)

    s2fq_tool = tools.picard.SamToFastqTool()
    cdhit = tools.cdhit.CdHit()
    out_bams = []
    for f in os.listdir(tmp_dir):
        out_bam = mkstempfname('.bam')
        out_bams.append(out_bam)
        library_sam = os.path.join(tmp_dir, f)

        in_fastqs = mkstempfname('.1.fastq'), mkstempfname('.2.fastq')

        s2fq_tool.execute(library_sam, in_fastqs[0], in_fastqs[1])
        # skip libraries that produced no reads at all
        if not os.path.getsize(in_fastqs[0]) > 0 and not os.path.getsize(in_fastqs[1]) > 0:
            continue

        out_fastqs = mkstempfname('.1.fastq'), mkstempfname('.2.fastq')
        options = {
            '-e': max_mismatches,
        }
        # only wire up the mate files when the second fastq has content
        if in_fastqs[1] is not None and os.path.getsize(in_fastqs[1]) > 10:
            options['-i2'] = in_fastqs[1]
            options['-o2'] = out_fastqs[1]

        # bug fix: this log line previously claimed "cd-hit-est";
        # the tool actually invoked below is cd-hit-dup
        log.info("executing cd-hit-dup on library %s", library_sam)
        # cd-hit-dup cannot operate on piped fastq input because it reads twice
        # NOTE(review): cd-hit-dup is launched with background=True and its output
        # files are consumed immediately below — presumably execute() still blocks
        # until completion in this configuration; confirm against tools/cdhit.py
        cdhit.execute('cd-hit-dup', in_fastqs[0], out_fastqs[0], options=options, background=True)

        tools.picard.FastqToSamTool().execute(out_fastqs[0], out_fastqs[1], f, out_bam, JVMmemory=jvm_memory)
        for fn in in_fastqs:
            os.unlink(fn)

    # merge the per-library deduped BAMs, then restore the original header
    with util.file.fifo(name='merged.sam') as merged_bam:
        merge_opts = ['SORT_ORDER=queryname']
        tools.picard.MergeSamFilesTool().execute(out_bams, merged_bam, picardOptions=merge_opts, JVMmemory=jvm_memory, background=True)
        tools.picard.ReplaceSamHeaderTool().execute(merged_bam, inBam, outBam, JVMmemory=jvm_memory)
874

875

876
def parser_rmdup_cdhit_bam(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the rmdup_cdhit_bam command.'''
    parser.add_argument('inBam', help='Input reads, BAM format.')
    parser.add_argument('outBam', help='Output reads, BAM format.')
    parser.add_argument(
        '--JVMmemory',
        dest='jvm_memory',
        default=tools.picard.FilterSamReadsTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)')
    common_opts = (('loglevel', None), ('version', None), ('tmp_dir', None))
    util.cmd.common_args(parser, common_opts)
    util.cmd.attach_main(parser, rmdup_cdhit_bam, split_args=True)
    return parser
888

889

890
# register the rmdup_cdhit_bam subcommand with the module's CLI dispatch table
__commands__.append(('rmdup_cdhit_bam', parser_rmdup_cdhit_bam))
891

892
def _merge_fastqs_and_mvicuna(lb, files):
    # Worker for rmdup_mvicuna_bam (runs in a ProcessPoolExecutor child):
    # concatenate all per-read-group fastq pairs for one library, run M-Vicuna
    # dedup on the merged pair, and return the path of a file listing read IDs
    # to keep.  `files` holds fastq path prefixes ("<prefix>_1.fastq" /
    # "<prefix>_2.fastq"); the source fastqs are deleted as they are consumed.
    readList = mkstempfname('.keep_reads.txt')
    log.info("executing M-Vicuna DupRm on library " + lb)

    # create merged FASTQs per library
    infastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
    for d in range(2):  # d=0 -> mate 1, d=1 -> mate 2
        with open(infastqs[d], 'wt') as outf:
            for fprefix in files:
                fn = '%s_%d.fastq' % (fprefix, d + 1)

                if os.path.isfile(fn):
                    with open(fn, 'rt') as inf:
                        for line in inf:
                            outf.write(line)
                    os.unlink(fn)
                else:
                    # a read group with no reads produces no fastq; tolerated
                    log.warning(
                        """no reads found in %s,
                                assuming that's because there's no reads in that read group""", fn
                    )

    # M-Vicuna DupRm to see what we should keep (append IDs to running file)
    if os.path.getsize(infastqs[0]) > 0 or os.path.getsize(infastqs[1]) > 0:
        mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
    for fn in infastqs:
        os.unlink(fn)

    return readList
921

922
def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None, threads=None):
    ''' Remove duplicate reads from BAM file using M-Vicuna. The
        primary advantage to this approach over Picard's MarkDuplicates tool
        is that Picard requires that input reads are aligned to a reference,
        and M-Vicuna can operate on unaligned reads.

    Args:
        inBam: input reads, BAM format.
        outBam: output deduplicated reads, BAM format.
        JVMmemory: JVM heap size for the Picard steps.
        threads: worker-process count (defaults to all available cores).
    '''

    # Convert BAM -> FASTQ pairs per read group and load all read groups
    tempDir = tempfile.mkdtemp()
    tools.picard.SamToFastqTool().per_read_group(inBam, tempDir, picardOptions=['VALIDATION_STRINGENCY=LENIENT'], JVMmemory=JVMmemory)
    read_groups = [x[1:] for x in tools.samtools.SamtoolsTool().getHeader(inBam) if x[0] == '@RG']
    read_groups = [dict(pair.split(':', 1) for pair in rg) for rg in read_groups]

    # Collect FASTQ pairs for each library
    lb_to_files = {}
    for rg in read_groups:
        # read groups without an LB tag are pooled under 'none'
        lb_to_files.setdefault(rg.get('LB', 'none'), set())
        fname = rg['ID']
        lb_to_files[rg.get('LB', 'none')].add(os.path.join(tempDir, fname))
    log.info("found %d distinct libraries and %d read groups", len(lb_to_files), len(read_groups))

    # For each library, merge FASTQs and run rmdup for entire library
    readListAll = mkstempfname('.keep_reads_all.txt')
    per_lb_read_lists = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=threads or util.misc.available_cpu_count()) as executor:
        futures = [executor.submit(_merge_fastqs_and_mvicuna, lb, files) for lb, files in lb_to_files.items()]
        for future in concurrent.futures.as_completed(futures):
            log.info("mvicuna finished processing library")
            try:
                readList = future.result()
                per_lb_read_lists.append(readList)
            except Exception as exc:
                # idiom fix: lazy %-args (matches the other log calls in this
                # function) instead of eager '%' string formatting
                log.error('mvicuna process call generated an exception: %s', exc)

    # merge per-library read lists together
    util.file.concat(per_lb_read_lists, readListAll)
    # remove per-library read lists
    for fl in per_lb_read_lists:
        os.unlink(fl)

    # Filter original input BAM against keep-list
    tools.picard.FilterSamReadsTool().execute(inBam, False, readListAll, outBam, JVMmemory=JVMmemory)
    return 0
965

966

967
def parser_rmdup_mvicuna_bam(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the rmdup_mvicuna_bam command.'''
    parser.add_argument('inBam', help='Input reads, BAM format.')
    parser.add_argument('outBam', help='Output reads, BAM format.')
    parser.add_argument(
        '--JVMmemory',
        default=tools.picard.FilterSamReadsTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)')
    common_opts = (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None))
    util.cmd.common_args(parser, common_opts)
    util.cmd.attach_main(parser, rmdup_mvicuna_bam, split_args=True)
    return parser
978

979

980
# register the rmdup_mvicuna_bam subcommand with the module's CLI dispatch table
__commands__.append(('rmdup_mvicuna_bam', parser_rmdup_mvicuna_bam))
981

982

983

984
def parser_rmdup_prinseq_fastq(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the rmdup_prinseq_fastq command.'''
    parser.add_argument('inFastq1', help='Input fastq file; 1st end of paired-end reads.')
    parser.add_argument('inFastq2', help='Input fastq file; 2nd end of paired-end reads.')
    parser.add_argument('outFastq1', help='Output fastq file; 1st end of paired-end reads.')
    parser.add_argument('outFastq2', help='Output fastq file; 2nd end of paired-end reads.')
    parser.add_argument(
        "--includeUnmated",
        action="store_true",
        default=False,
        help="Include unmated reads in the main output fastq files (default: %(default)s)")
    parser.add_argument(
        '--unpairedOutFastq1', default=None,
        help='File name of output unpaired reads from 1st end of paired-end reads (independent of --includeUnmated)')
    parser.add_argument(
        '--unpairedOutFastq2', default=None,
        help='File name of output unpaired reads from 2nd end of paired-end reads (independent of --includeUnmated)')
    common_opts = (('loglevel', None), ('version', None), ('tmp_dir', None))
    util.cmd.common_args(parser, common_opts)
    util.cmd.attach_main(parser, main_rmdup_prinseq_fastq)
    return parser
1000

1001

1002
def main_rmdup_prinseq_fastq(args):
    ''' Run prinseq-lite's duplicate removal operation on paired-end
        reads.  Also removes reads with more than one N.
    '''
    tools.prinseq.PrinseqTool().rmdup_fastq_paired(
        args.inFastq1, args.inFastq2,
        args.outFastq1, args.outFastq2,
        args.includeUnmated,
        args.unpairedOutFastq1, args.unpairedOutFastq2)
    return 0
1009

1010

1011
# register the rmdup_prinseq_fastq subcommand with the module's CLI dispatch table
__commands__.append(('rmdup_prinseq_fastq', parser_rmdup_prinseq_fastq))
1012

1013

1014
def filter_bam_mapped_only(inBam, outBam):
    ''' Samtools to reduce a BAM file to only reads that are
        aligned (-F 4) with a non-zero mapping quality (-q 1)
        and are not marked as a PCR/optical duplicate (-F 1024).
    '''
    # 1028 = 4 (unmapped) | 1024 (duplicate): drop both in one pass
    view_flags = ['-b', '-q', '1', '-F', '1028']
    tools.samtools.SamtoolsTool().view(view_flags, inBam, outBam)
    # index the filtered output so downstream tools can random-access it
    tools.picard.BuildBamIndexTool().execute(outBam)
    return 0
1022

1023

1024
def parser_filter_bam_mapped_only(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the filter_bam_mapped_only command.'''
    parser.add_argument('inBam', help='Input aligned reads, BAM format.')
    parser.add_argument('outBam', help='Output sorted indexed reads, filtered to aligned-only, BAM format.')
    common_opts = (('loglevel', None), ('version', None), ('tmp_dir', None))
    util.cmd.common_args(parser, common_opts)
    util.cmd.attach_main(parser, filter_bam_mapped_only, split_args=True)
    return parser
1030

1031

1032
# register the filter_bam_mapped_only subcommand with the module's CLI dispatch table
__commands__.append(('filter_bam_mapped_only', parser_filter_bam_mapped_only))
1033

1034
# ======= Novoalign ========
1035

1036

1037
def parser_novoalign(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the novoalign command.'''
    parser.add_argument('inBam', help='Input reads, BAM format.')
    parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Novoindex.')
    parser.add_argument('outBam', help='Output reads, BAM format (aligned).')
    parser.add_argument('--options', default='-r Random', help='Novoalign options (default: %(default)s)')
    # type=int added: the default is the int 0, but without a type= the CLI
    # would deliver a str, unlike every other numeric option in this file
    # (--threads, --amplicon_window, --bwa_min_score all use type=int)
    parser.add_argument('--min_qual', default=0, type=int, help='Filter outBam to minimum mapping quality (default: %(default)s)')
    parser.add_argument(
        '--JVMmemory',
        default=tools.picard.SortSamTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)'
    )
    parser.add_argument(
        '--NOVOALIGN_LICENSE_PATH',
        default=None,
        dest="novoalign_license_path",
        help='A path to the novoalign.lic file. This overrides the NOVOALIGN_LICENSE_PATH environment variable. (default: %(default)s)'
    )
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_novoalign)
    return parser
1057

1058

1059
def main_novoalign(args):
    '''Align reads with Novoalign. Sort and index BAM output.'''
    aligner = tools.novoalign.NovoalignTool(license_path=args.novoalign_license_path)
    aligner.execute(
        args.inBam, args.refFasta, args.outBam,
        options=args.options.split(),
        min_qual=args.min_qual,
        JVMmemory=args.JVMmemory)
    return 0
1070

1071

1072
# register the novoalign subcommand with the module's CLI dispatch table
__commands__.append(('novoalign', parser_novoalign))
1073

1074

1075
def parser_novoindex(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the novoindex command.'''
    parser.add_argument('refFasta', help='Reference genome, FASTA format.')
    parser.add_argument(
        '--NOVOALIGN_LICENSE_PATH',
        dest="novoalign_license_path",
        default=None,
        help='A path to the novoalign.lic file. This overrides the NOVOALIGN_LICENSE_PATH environment variable. (default: %(default)s)')
    # note: no tmp_dir here — indexing writes next to the reference
    util.cmd.common_args(parser, (('loglevel', None), ('version', None)))
    util.cmd.attach_main(parser, main_novoindex)
    return parser
1086

1087
def main_novoindex(args):
    '''Index a reference genome FASTA for Novoalign (novoindex).'''
    tools.novoalign.NovoalignTool(license_path=args.novoalign_license_path).index_fasta(args.refFasta)
    return 0
1090

1091

1092
# register the novoindex subcommand with the module's CLI dispatch table
__commands__.append(('novoindex', parser_novoindex))
1093

1094
# ========= GATK ==========
1095

1096

1097
def parser_gatk_ug(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the gatk_ug command.'''
    parser.add_argument('inBam', help='Input reads, BAM format.')
    parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard.')
    parser.add_argument(
        'outVcf',
        help='''Output calls in VCF format. If this filename ends with .gz,
        GATK will BGZIP compress the output and produce a Tabix index file as well.''')
    parser.add_argument(
        '--options',
        default='--min_base_quality_score 15 -ploidy 4',
        help='UnifiedGenotyper options (default: %(default)s)')
    parser.add_argument(
        '--JVMmemory',
        default=tools.gatk.GATKTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument(
        '--GATK_PATH',
        default=None,
        help='A path containing the GATK jar file. This overrides the GATK_ENV environment variable or the GATK conda package. (default: %(default)s)')
    common_opts = (('loglevel', None), ('version', None), ('tmp_dir', None))
    util.cmd.common_args(parser, common_opts)
    util.cmd.attach_main(parser, main_gatk_ug)
    return parser
1123

1124

1125
def main_gatk_ug(args):
    '''Call genotypes using the GATK UnifiedGenotyper.'''
    gatk = tools.gatk.GATKTool(path=args.GATK_PATH)
    gatk.ug(
        args.inBam, args.refFasta, args.outVcf,
        options=args.options.split(),
        JVMmemory=args.JVMmemory)
    return 0
1133

1134

1135
# register the gatk_ug subcommand with the module's CLI dispatch table
__commands__.append(('gatk_ug', parser_gatk_ug))
1136

1137

1138
def parser_gatk_realign(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the gatk_realign command.'''
    parser.add_argument('inBam', help='Input reads, BAM format, aligned to refFasta.')
    parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard.')
    parser.add_argument('outBam', help='Realigned reads.')
    # consistency fix: --threads was previously added *after* attach_main,
    # unlike every other parser in this file; declare it with the other options
    parser.add_argument('--threads', type=int, default=None, help='Number of threads (default: all available cores)')
    parser.add_argument(
        '--JVMmemory',
        default=tools.gatk.GATKTool.jvmMemDefault,
        help='JVM virtual memory size (default: %(default)s)'
    )
    parser.add_argument(
        '--GATK_PATH',
        default=None,
        help='A path containing the GATK jar file. This overrides the GATK_ENV environment variable or the GATK conda package. (default: %(default)s)'
    )
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_gatk_realign)
    return parser
1156

1157

1158
def main_gatk_realign(args):
    '''Local realignment of BAM files with GATK IndelRealigner.'''
    gatk = tools.gatk.GATKTool(path=args.GATK_PATH)
    gatk.local_realign(
        args.inBam, args.refFasta, args.outBam,
        JVMmemory=args.JVMmemory,
        threads=args.threads)
    return 0
1166

1167

1168
# register the gatk_realign subcommand with the module's CLI dispatch table
__commands__.append(('gatk_realign', parser_gatk_realign))
1169

1170
# =========================
1171

1172

1173
def align_and_fix(
    inBam, refFasta,
    outBamAll=None,
    outBamFiltered=None,
    aligner_options='',
    aligner="novoalign",
    bwa_min_score=None,
    novoalign_amplicons_bed=None,
    amplicon_window=4,
    JVMmemory=None,
    threads=None,
    skip_mark_dupes=False,
    gatk_path=None,
    novoalign_license_path=None
):
    ''' Take reads, align to reference with Novoalign, minimap2, or BWA-MEM.
        Optionally mark duplicates with Picard, realign indels with GATK,
        and optionally filters final file to mapped/non-dupe reads.
    '''
    # nothing to produce -> nothing to do
    if not (outBamAll or outBamFiltered):
        log.warning("are you sure you meant to do nothing?")
        return

    assert aligner in ["novoalign", "bwa", "minimap2"]
    samtools = tools.samtools.SamtoolsTool()

    # work on a private copy of the reference so indexing side-effects
    # (dict/fai/aligner indexes) never touch the caller's file
    refFastaCopy = mkstempfname('.ref_copy.fasta')
    shutil.copyfile(refFasta, refFastaCopy)

    tools.picard.CreateSequenceDictionaryTool().execute(refFastaCopy, overwrite=True, JVMmemory=JVMmemory)
    samtools.faidx(refFastaCopy, overwrite=True)

    # NOTE(review): the signature default is '' while this branch keys on None
    # (the CLI parser passes None), so direct callers relying on the signature
    # default skip the novoalign '-r Random' fallback — confirm this is intended
    if aligner_options is None:
        if aligner=="novoalign":
            aligner_options = '-r Random'
        else:
            aligner_options = '' # use defaults

    # empty input is tolerated (alignment still runs); just tell the user
    if samtools.isEmpty(inBam):
        log.warning("zero reads present in input")

    bam_aligned = mkstempfname('.aligned.bam')
    if aligner=="novoalign":
        if novoalign_amplicons_bed is not None:
            # soft-clip amplicon primers during alignment
            aligner_options += ' --amplicons {} {}'.format(novoalign_amplicons_bed, amplicon_window)

        tools.novoalign.NovoalignTool(license_path=novoalign_license_path).index_fasta(refFastaCopy)

        tools.novoalign.NovoalignTool(license_path=novoalign_license_path).execute(
            inBam, refFastaCopy, bam_aligned,
            options=aligner_options.split(),
            JVMmemory=JVMmemory
        )

    elif aligner=='bwa':
        bwa = tools.bwa.Bwa()
        bwa.index(refFastaCopy)

        opts = aligner_options.split()

        bwa.align_mem_bam(inBam, refFastaCopy, bam_aligned, min_score_to_filter=bwa_min_score, threads=threads, options=opts)

    elif aligner=='minimap2':
        mm2 = tools.minimap2.Minimap2()
        mm2.align_bam(inBam, refFastaCopy, bam_aligned, threads=threads, options=aligner_options.split())

    # optionally mark PCR/optical duplicates before realignment
    if skip_mark_dupes:
        bam_marked = bam_aligned
    else:
        bam_marked = mkstempfname('.mkdup.bam')
        tools.picard.MarkDuplicatesTool().execute(
            [bam_aligned], bam_marked, picardOptions=['CREATE_INDEX=true'],
            JVMmemory=JVMmemory
        )
        os.unlink(bam_aligned)

    samtools.index(bam_marked)

    # GATK IndelRealigner fails on empty input, so skip it when no reads remain
    if samtools.isEmpty(bam_marked):
        bam_realigned = bam_marked
    else:
        bam_realigned = mkstempfname('.realigned.bam')
        tools.gatk.GATKTool(path=gatk_path).local_realign(bam_marked, refFastaCopy, bam_realigned, JVMmemory=JVMmemory, threads=threads)
        os.unlink(bam_marked)

    if outBamAll:
        # full output: all reads (incl. unmapped/dupes), indexed
        shutil.copyfile(bam_realigned, outBamAll)
        tools.picard.BuildBamIndexTool().execute(outBamAll, JVMmemory=JVMmemory)
    if outBamFiltered:
        filtered_any_mapq = mkstempfname('.filtered_any_mapq.bam')
        # filter based on read flags
        samtools.filter_to_proper_primary_mapped_reads(bam_realigned, filtered_any_mapq)
        # remove reads with MAPQ <1
        samtools.view(['-b', '-q', '1'], filtered_any_mapq, outBamFiltered)
        os.unlink(filtered_any_mapq)
        tools.picard.BuildBamIndexTool().execute(outBamFiltered, JVMmemory=JVMmemory)
    os.unlink(bam_realigned)
1270

1271

1272
def parser_align_and_fix(parser=argparse.ArgumentParser()):
    '''Build the argument parser for the align_and_fix command.'''
    parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
    parser.add_argument('refFasta', help='Reference genome, FASTA format; will be indexed by Picard and Novoalign.')
    parser.add_argument(
        '--outBamAll',
        default=None,
        help='''Aligned, sorted, and indexed reads.  Unmapped and duplicate reads are
                retained. By default, duplicate reads are marked. If "--skipMarkDupes"
                is specified duplicate reads are included in output without being marked.'''
    )
    parser.add_argument(
        '--outBamFiltered',
        default=None,
        help='''Aligned, sorted, and indexed reads.  Unmapped reads are removed from this file,
                as well as any marked duplicate reads. Note that if "--skipMarkDupes" is provided,
                duplicates will be not be marked and will be included in the output.'''
    )
    # typo fixes in the two help strings above/below: "outout" -> "output",
    # and the unbalanced parenthesis in the aligner_options help is closed
    parser.add_argument('--aligner_options', default=None, help='aligner options (default for novoalign: "-r Random", bwa: "-T 30")')
    parser.add_argument('--aligner', choices=['novoalign', 'minimap2', 'bwa'], default='novoalign', help='aligner (default: %(default)s)')
    parser.add_argument('--bwa_min_score', type=int, default=None, help='BWA mem on paired reads ignores the -T parameter. Set a value here (e.g. 30) to invoke a custom post-alignment filter (default: no filtration)')
    parser.add_argument('--novoalign_amplicons_bed', default=None, help='Novoalign only: amplicon primer file (BED format) to soft clip')
    parser.add_argument('--amplicon_window', type=int, default=4, help='Novoalign only: amplicon primer window size (default: %(default)s)')
    parser.add_argument('--JVMmemory', default='4g', help='JVM virtual memory size (default: %(default)s)')
    parser.add_argument('--threads', type=int, default=None, help='Number of threads (default: all available cores)')
    parser.add_argument('--skipMarkDupes',
                        help='If specified, duplicate reads will not be marked in the resulting output file.',
                        dest="skip_mark_dupes",
                        action='store_true')
    parser.add_argument(
        '--GATK_PATH',
        default=None,
        dest="gatk_path",
        help='A path containing the GATK jar file. This overrides the GATK_ENV environment variable or the GATK conda package. (default: %(default)s)'
    )
    parser.add_argument(
        '--NOVOALIGN_LICENSE_PATH',
        default=None,
        dest="novoalign_license_path",
        help='A path to the novoalign.lic file. This overrides the NOVOALIGN_LICENSE_PATH environment variable. (default: %(default)s)'
    )

    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, align_and_fix, split_args=True)
    return parser
1316

1317

1318
# register the align_and_fix subcommand with the module's CLI dispatch table
__commands__.append(('align_and_fix', parser_align_and_fix))
1319

1320
# =========================
1321

1322
def filter_bam_to_proper_primary_mapped_reads(inBam, outBam, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False):
    ''' Filter a BAM file down to primary, mapped reads.

        Reads kept in the output are:
            - not flagged as duplicates (unless keepDuplicates is set)
            - not secondary or supplementary (split/chimeric reads)
            - for paired-end reads: marked as a proper pair (relaxed to
              "both mates mapped" when doNotRequirePairsToBeProper is set),
              with singleton mates rejected unless keepSingletons is set
            - for single-end reads: mapped
    '''
    # Each CLI flag relaxes one criterion, so the tool-level booleans are inverted.
    filter_kwargs = dict(
        require_pairs_to_be_proper=not doNotRequirePairsToBeProper,
        reject_singletons=not keepSingletons,
        reject_duplicates=not keepDuplicates,
    )
    tools.samtools.SamtoolsTool().filter_to_proper_primary_mapped_reads(inBam, outBam, **filter_kwargs)
    return 0
×
1344

1345
def parser_filter_bam_to_proper_primary_mapped_reads(parser=argparse.ArgumentParser()):
    # positional I/O arguments
    parser.add_argument('inBam', help='Input aligned reads, BAM format.')
    parser.add_argument('outBam', help='Output reads, BAM format.')
    # optional flags, each relaxing one filter criterion
    parser.add_argument('--doNotRequirePairsToBeProper', action='store_true',
                        help='Do not require reads to be properly paired when filtering (default: %(default)s)')
    parser.add_argument('--keepSingletons', action='store_true',
                        help='Keep singleton reads when filtering (default: %(default)s)')
    parser.add_argument('--keepDuplicates', action='store_true',
                        help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, filter_bam_to_proper_primary_mapped_reads, split_args=True)
    return parser
×
1367

1368
# =========================
1369

1370

1371
def minimap2_idxstats(inBam, refFasta, outBam=None, outStats=None,
                      filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False):
    ''' Take reads, align to reference with minimap2 and perform samtools idxstats.
        Optionally filter reads after alignment, prior to reporting idxstats, to include only those flagged as properly paired.

        inBam:     unaligned reads, BAM format
        refFasta:  reference genome, FASTA format (aligned against a temp copy)
        outBam:    optional path for the aligned (and optionally filtered) BAM
        outStats:  optional path for the samtools idxstats output
        At least one of outBam/outStats must be given.
    '''

    assert outBam or outStats, "Either outBam or outStats must be specified"

    bam_aligned = util.file.mkstempfname('.aligned.bam')
    if outBam is None:
        # bugfix: was a bare mkstempfname() call (NameError at runtime);
        # the helper lives in util.file
        bam_filtered = util.file.mkstempfname('.filtered.bam')
    else:
        bam_filtered = outBam

    samtools = tools.samtools.SamtoolsTool()
    mm2 = tools.minimap2.Minimap2()

    # align against a temp copy of the reference so index files are
    # created alongside the copy rather than next to the input fasta
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(refFasta, ref_indexed)

    # alignment of an empty input still proceeds (idxstats then reports
    # zeros for every reference sequence), but let the user know
    if samtools.isEmpty(inBam):
        log.warning("The input bam file appears to have zero reads: %s", inBam)

    mm2.align_bam(inBam, ref_indexed, bam_aligned)

    if filterReadsAfterAlignment:
        samtools.filter_to_proper_primary_mapped_reads(bam_aligned,
                                                       bam_filtered,
                                                       require_pairs_to_be_proper=not doNotRequirePairsToBeProper,
                                                       reject_singletons=not keepSingletons,
                                                       reject_duplicates=not keepDuplicates)
        os.unlink(bam_aligned)
    else:
        shutil.move(bam_aligned, bam_filtered)

    if outStats is not None:
        # index the final bam before calling idxstats,
        # but only if it is a bam or cram file (sam cannot be indexed)
        if bam_filtered.endswith((".bam", ".cram")):
            samtools.index(bam_filtered)
        samtools.idxstats(bam_filtered, outStats)

    if outBam is None:
        os.unlink(bam_filtered)
×
1415

1416
    
1417

1418
def parser_minimap2_idxstats(parser=argparse.ArgumentParser()):
    parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
    parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard and Novoalign.')
    parser.add_argument(
        '--filterReadsAfterAlignment',
        # typo fix: "till" -> "will"; also add the missing space at the
        # implicit string-concatenation boundary between the two sentences
        help=("If specified, reads will be filtered after alignment to include only those flagged as properly paired. "
                "This excludes secondary and supplementary alignments."),
        action='store_true'
    )
    parser.add_argument(
        '--doNotRequirePairsToBeProper',
        help='Do not require reads to be properly paired when filtering (default: %(default)s)',
        action='store_true'
    )
    parser.add_argument(
        '--keepSingletons',
        help='Keep singleton reads when filtering (default: %(default)s)',
        action='store_true'
    )
    parser.add_argument(
        '--keepDuplicates',
        help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)',
        action='store_true'
    )
    parser.add_argument('--outBam', help='Output aligned, indexed BAM file', default=None)
    parser.add_argument('--outStats', help='Output idxstats file', default=None)

    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, minimap2_idxstats, split_args=True)
    return parser
1✔
1448

1449
# register the minimap2_idxstats subcommand with its parser factory
__commands__.append(('minimap2_idxstats', parser_minimap2_idxstats))
1✔
1450

1451

1452
def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None,
        min_score_to_filter=None, aligner_options=None,
        filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False):
    ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.
        Optionally filter reads after alignment, prior to reporting idxstats, to include only those flagged as properly paired.

        inBam:     unaligned reads, BAM format
        refFasta:  reference genome, FASTA format (a temp copy is bwa-indexed)
        outBam:    optional path for the aligned (and optionally filtered) BAM
        outStats:  optional path for the samtools idxstats output
        At least one of outBam/outStats must be given.
    '''

    assert outBam or outStats, "Either outBam or outStats must be specified"

    bam_aligned = util.file.mkstempfname('.aligned.bam')
    if outBam is None:
        # bugfix: was a bare mkstempfname() call (NameError at runtime);
        # the helper lives in util.file
        bam_filtered = util.file.mkstempfname('.filtered.bam')
    else:
        bam_filtered = outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()

    # index a temp copy of the reference so bwa index files are created
    # alongside the copy rather than next to the input fasta
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(refFasta, ref_indexed)
    bwa.index(ref_indexed)

    # consistent with minimap2_idxstats: warn when the input lacks reads
    if samtools.isEmpty(inBam):
        log.warning("The input bam file appears to have zero reads: %s", inBam)

    bwa_opts = [] if aligner_options is None else aligner_options.split()
    bwa.mem(inBam, ref_indexed, bam_aligned, options=bwa_opts,
            min_score_to_filter=min_score_to_filter)

    if filterReadsAfterAlignment:
        samtools.filter_to_proper_primary_mapped_reads(bam_aligned,
                                                       bam_filtered,
                                                       require_pairs_to_be_proper=not doNotRequirePairsToBeProper,
                                                       reject_singletons=not keepSingletons,
                                                       reject_duplicates=not keepDuplicates)
        os.unlink(bam_aligned)
    else:
        shutil.move(bam_aligned, bam_filtered)

    if outStats is not None:
        # index the final bam before calling idxstats (consistent with
        # minimap2_idxstats); sam files cannot be indexed, so only bam/cram
        if bam_filtered.endswith((".bam", ".cram")):
            samtools.index(bam_filtered)
        samtools.idxstats(bam_filtered, outStats)

    if outBam is None:
        os.unlink(bam_filtered)
1✔
1493

1494

1495
def parser_bwamem_idxstats(parser=argparse.ArgumentParser()):
    parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
    parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard and Novoalign.')
    parser.add_argument('--outBam', help='Output aligned, indexed BAM file', default=None)
    parser.add_argument('--outStats', help='Output idxstats file', default=None)
    parser.add_argument(
        '--minScoreToFilter',
        dest="min_score_to_filter",
        type=int,
        help=("Filter bwa alignments using this value as the minimum allowed "
              "alignment score. Specifically, sum the alignment scores across "
              "all alignments for each query (including reads in a pair, "
              "supplementary and secondary alignments) and then only include, "
              "in the output, queries whose summed alignment score is at least "
              "this value. This is only applied when the aligner is 'bwa'. "
              "The filtering on a summed alignment score is sensible for reads "
              "in a pair and supplementary alignments, but may not be "
              "reasonable if bwa outputs secondary alignments (i.e., if '-a' "
              "is in the aligner options). (default: not set - i.e., do not "
              "filter bwa's output)")
    )
    parser.add_argument(
        '--alignerOptions',
        dest="aligner_options",
        help="bwa options (default: bwa defaults)")
    parser.add_argument(
        '--filterReadsAfterAlignment',
        # typo fix: "till" -> "will"; also add the missing space at the
        # implicit string-concatenation boundary between the two sentences
        help=("If specified, reads will be filtered after alignment to include only those flagged as properly paired. "
                "This excludes secondary and supplementary alignments."),
        action='store_true'
    )
    parser.add_argument(
        '--doNotRequirePairsToBeProper',
        help='Do not require reads to be properly paired when filtering (default: %(default)s)',
        action='store_true'
    )
    parser.add_argument(
        '--keepSingletons',
        help='Keep singleton reads when filtering (default: %(default)s)',
        action='store_true'
    )
    parser.add_argument(
        '--keepDuplicates',
        help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)',
        action='store_true'
    )

    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, bwamem_idxstats, split_args=True)
    return parser
1✔
1545

1546
# register the bwamem_idxstats subcommand with its parser factory
__commands__.append(('bwamem_idxstats', parser_bwamem_idxstats))
1✔
1547

1548

1549
def parser_extract_tarball(parser=argparse.ArgumentParser()):
    # positional arguments
    parser.add_argument('tarfile', help='Input tar file. May be "-" for stdin.')
    parser.add_argument('out_dir', help='Output directory')
    # compression handling options
    parser.add_argument('--compression',
        choices=('gz', 'bz2', 'lz4', 'zip', 'none', 'auto'),
        default='auto',
        help='Compression type (default: %(default)s). Auto-detect is incompatible with stdin input unless pipe_hint is specified.')
    parser.add_argument('--pipe_hint',
        default=None,
        help='If tarfile is stdin, you can provide a file-like URI string for pipe_hint which ends with a common compression file extension if you want to use compression=auto.')
    util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, main_extract_tarball, split_args=True)
    return parser
1✔
1562
def main_extract_tarball(*args, **kwargs):
    ''' Extract an input .tar, .tgz, .tar.gz, .tar.bz2, .tar.lz4, or .zip file
        to a given directory (or we will choose one on our own). Emit the
        resulting directory path to stdout.
    '''
    # thin CLI wrapper: delegate the work, then report where things landed
    extracted_dir = util.file.extract_tarball(*args, **kwargs)
    print(extracted_dir)
×
1568
# register the extract_tarball subcommand with its parser factory
__commands__.append(('extract_tarball', parser_extract_tarball))
1✔
1569

1570
# =========================
1571

1572
def fasta_read_names(in_fasta, out_read_names):
    """Save the read names of reads in a .fasta file to a text file"""
    with util.file.open_or_gzopen(in_fasta) as fasta_f, open(out_read_names, 'wt') as names_f:
        prev_name = None
        for line in fasta_f:
            # only header lines carry read names
            if not line.startswith('>'):
                continue
            name = line[1:].strip()
            # drop a trailing mate suffix so both mates map to one name
            if name.endswith(('/1', '/2')):
                name = name[:-2]
            # consecutive duplicates (e.g. mate pairs) are written only once
            if name != prev_name:
                names_f.write(name + '\n')
            prev_name = name
×
1584

1585

1586
def read_names(in_reads, out_read_names, threads=None):
    """Extract read names from a sequence file"""
    with util.file.tmp_dir(suffix='_read_names.txt') as t_dir:
        if in_reads.endswith('.bam'):
            # convert BAM to FASTA first so headers can be scanned as text
            fasta_path = os.path.join(t_dir, 'reads.fasta')
            tools.samtools.SamtoolsTool().bam2fa(in_reads, fasta_path)
        else:
            fasta_path = in_reads
        fasta_read_names(fasta_path, out_read_names)
×
1594

1595
def parser_read_names(parser=argparse.ArgumentParser()):
    # both arguments are positional; declared data-driven for brevity
    for arg_name, arg_help in (
            ('in_reads', 'the input reads ([compressed] fasta or bam)'),
            ('out_read_names', 'the read names')):
        parser.add_argument(arg_name, help=arg_help)
    util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, read_names, split_args=True)
    return parser
1✔
1601

1602
# register the read_names subcommand with its parser factory
__commands__.append(('read_names', parser_read_names))
1✔
1603

1604

1605
# =========================
1606

1607
def full_parser():
    """Build the top-level argparse parser covering all registered subcommands."""
    top_parser = util.cmd.make_parser(__commands__, __doc__)
    return top_parser
×
1609

1610

1611
if __name__ == '__main__':
    # script entry point: dispatch to the subcommand named on the command line
    util.cmd.main_argparse(__commands__, __doc__)
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc