• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

griffithlab / pVACtools / 12303071114

12 Dec 2024 07:04PM UTC coverage: 83.167% (+0.2%) from 83.008%
12303071114

Pull #911

github

web-flow
Merge 20a1f817d into 370ce3f06
Pull Request #911: Add new pVACsplice tool

1373 of 1624 new or added lines in 49 files covered. (84.54%)

1 existing line in 1 file now uncovered.

7396 of 8893 relevant lines covered (83.17%)

4.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.24
/pvactools/lib/pipeline.py
1
import copy
5✔
2
import logging
5✔
3
import shutil
5✔
4
import sys
5✔
5
from collections import OrderedDict
5✔
6

7
import pkg_resources
5✔
8
import pymp
5✔
9
import yaml
5✔
10
from Bio.Seq import Seq
5✔
11
from Bio.SeqRecord import SeqRecord
5✔
12

13
from pvactools.lib.prediction_class import *
5✔
14
from pvactools.lib.input_file_converter import VcfConverter
5✔
15
from pvactools.lib.fasta_generator import FastaGenerator, VectorFastaGenerator
5✔
16
from pvactools.lib.output_parser import DefaultOutputParser, UnmatchedSequencesOutputParser, PvacspliceOutputParser
5✔
17
from pvactools.lib.post_processor import PostProcessor
5✔
18
from pvactools.lib.run_utils import *
5✔
19
import pvactools.lib.call_iedb
5✔
20
import pvactools.lib.combine_parsed_outputs
5✔
21

22

23
def status_message(msg):
    """Emit a progress message and flush stdout so it appears promptly in logs."""
    print(msg, flush=True)
26

27
class Pipeline(metaclass=ABCMeta):
5✔
28
    def __init__(self, **kwargs):
5✔
29
        for (k,v) in kwargs.items():
5✔
30
           setattr(self, k, v)
5✔
31
        self.flurry_state                = self.get_flurry_state()
5✔
32
        self.starfusion_file             = kwargs.pop('starfusion_file', None)
5✔
33
        self.proximal_variants_file      = None
5✔
34
        self.tmp_dir = os.path.join(self.output_dir, 'tmp')
5✔
35
        os.makedirs(self.tmp_dir, exist_ok=True)
5✔
36

37
    def log_dir(self):
5✔
38
        dir = os.path.join(self.output_dir, 'log')
5✔
39
        os.makedirs(dir, exist_ok=True),
5✔
40
        return dir
5✔
41

42
    def print_log(self):
        """Record this run's inputs in log/inputs.yml, or validate them on restart.

        First run: dumps all instance attributes (plus the installed pVACtools
        version) to YAML. Restart: loads the dump back and compares attribute
        by attribute; any mismatch aborts via sys.exit so a resumed run cannot
        silently mix parameters. A version difference only warns.
        """
        log_file = os.path.join(self.log_dir(), 'inputs.yml')
        if os.path.exists(log_file):
            with open(log_file, 'r') as log_fh:
                past_inputs = yaml.load(log_fh, Loader=yaml.FullLoader)
                # NOTE: this aliases (does not copy) __dict__, so the version
                # key added below becomes a real attribute of the pipeline.
                current_inputs = self.__dict__
                current_inputs['pvactools_version'] = pkg_resources.get_distribution("pvactools").version
                # Version drift is reported but not fatal.
                if past_inputs['pvactools_version'] != current_inputs['pvactools_version']:
                    status_message(
                        "Restart to be executed with a different pVACtools version:\n" +
                        "Past version: %s\n" % past_inputs['pvactools_version'] +
                        "Current version: %s" % current_inputs['pvactools_version']
                    )
                for key in current_inputs.keys():
                    if key == 'pvactools_version' or key == 'pvacseq_version':
                        continue
                    # A new, non-None input the previous run never saw is fatal.
                    if key not in past_inputs.keys() and current_inputs[key] is not None:
                        sys.exit(
                            "Restart inputs are different from past inputs: \n" +
                            "Additional input: %s - %s\n" % (key, current_inputs[key]) +
                            "Aborting."
                        )
                    # NOTE(review): a key absent from past_inputs whose current
                    # value is None falls through to this comparison and would
                    # raise KeyError on past_inputs[key] — verify intent.
                    elif current_inputs[key] != past_inputs[key]:
                        sys.exit(
                            "Restart inputs are different from past inputs: \n" +
                            "Past input: %s - %s\n" % (key, past_inputs[key]) +
                            "Current input: %s - %s\n" % (key, current_inputs[key]) +
                            "Aborting."
                        )
        else:
            with open(log_file, 'w') as log_fh:
                # Same aliasing as above: the version key is added to __dict__.
                inputs = self.__dict__
                inputs['pvactools_version'] = pkg_resources.get_distribution("pvactools").version
                yaml.dump(inputs, log_fh, default_flow_style=False)
76

77
    def get_flurry_state(self):
5✔
78
        if 'MHCflurry' in self.prediction_algorithms and 'MHCflurryEL' in self.prediction_algorithms:
5✔
79
            self.prediction_algorithms.remove('MHCflurryEL')
×
80
            return 'both'
×
81
        elif 'MHCflurry' in self.prediction_algorithms:
5✔
82
            return 'BA_only'
×
83
        elif 'MHCflurryEL' in self.prediction_algorithms:
5✔
84
            pred_idx = self.prediction_algorithms.index('MHCflurryEL')
×
85
            self.prediction_algorithms[pred_idx] = 'MHCflurry'
×
86
            return 'EL_only'
×
87
        else:
88
            return None
5✔
89

90
    def tsv_file_path(self):
5✔
91
        if self.input_file_type == 'pvacvector_input_fasta':
5✔
92
            return self.input_file
5✔
93
        elif self.input_file_type == 'junctions':
5✔
94
            return os.path.join(self.junctions_dir, f'{self.sample_name}_combined.tsv')
5✔
95
        else:
96
            tsv_file = self.sample_name + '.tsv'
5✔
97
            return os.path.join(self.output_dir, tsv_file)
5✔
98

99
    def fasta_file_path(self):
5✔
100
        fasta_file = self.sample_name + '.fasta'
5✔
101
        return os.path.join(self.output_dir, fasta_file)
5✔
102

103
    def net_chop_fasta_file_path(self):
5✔
104
        net_chop_fasta_file = self.sample_name + '.net_chop.fa'
5✔
105
        return os.path.join(self.output_dir, net_chop_fasta_file)
5✔
106

107
    def converter(self, params):
        """Instantiate the input-file converter matching self.input_file_type.

        NOTE(review): 'PvacspliceVcfConverter' is resolved from this module's
        namespace, but no import of that name is visible in this file's import
        block — confirm it is provided (e.g. via a wildcard import) before
        relying on the 'junctions' path.
        """
        class_names = {
            'vcf'  : 'VcfConverter',
            'junctions' : 'PvacspliceVcfConverter',
        }
        converter_class = getattr(sys.modules[__name__], class_names[self.input_file_type])
        return converter_class(**params)
115

116
    def fasta_generator(self, params):
        """Instantiate the FASTA generator appropriate for self.input_file_type."""
        class_names = {
            'vcf'                   : 'FastaGenerator',
            'pvacvector_input_fasta': 'VectorFastaGenerator',
        }
        generator_class = getattr(sys.modules[__name__], class_names[self.input_file_type])
        return generator_class(**params)
124

125
    def output_parser(self, params):
        """Instantiate the prediction-output parser for self.input_file_type."""
        class_names = {
            'vcf': 'DefaultOutputParser',
            'pvacvector_input_fasta': 'UnmatchedSequencesOutputParser',
            'fasta': 'UnmatchedSequencesOutputParser',
            'junctions': 'PvacspliceOutputParser',
        }
        parser_class = getattr(sys.modules[__name__], class_names[self.input_file_type])
        return parser_class(**params)
135

136
    def generate_combined_fasta(self, fasta_path, epitope_flank_length=0):
        """Write a combined protein FASTA for the entire input to *fasta_path*.

        epitope_flank_length widens the sequence window around each variant
        (NetChop needs extra flanking residues). Delegates to the standalone
        generate_protein_fasta tool and removes its manufacturability report,
        which is not needed here.
        """
        import pvactools.tools.pvacseq.generate_protein_fasta as generate_combined_fasta
        flank = str(epitope_flank_length + max(self.epitope_lengths) - 1)
        args = [
            self.input_file,
            flank,
            fasta_path,
            "--sample-name", self.sample_name,
            "--biotypes", ",".join(self.biotypes),
        ]
        if self.phased_proximal_variants_vcf is not None:
            args += ["--phased-proximal-variants-vcf", self.phased_proximal_variants_vcf]
        downstream = 'full' if self.downstream_sequence_length is None else str(self.downstream_sequence_length)
        args += ["-d", downstream]
        if self.pass_only:
            args += ["--pass-only"]
        generate_combined_fasta.main(args)
        os.unlink("{}.manufacturability.tsv".format(fasta_path))
155

156
    def convert_vcf(self):
        """Convert the raw input into this run's intermediate TSV.

        Skips the conversion entirely when the TSV already exists, which makes
        interrupted runs resumable.
        """
        status_message("Converting .%s to TSV" % self.input_file_type)
        if os.path.exists(self.tsv_file_path()):
            status_message("TSV file already exists. Skipping.")
            return

        options = {
            'input_file' : self.input_file,
            'output_file': self.tsv_file_path(),
            'sample_name': self.sample_name,
            'pass_only': self.pass_only,
            'biotypes': self.biotypes,
        }
        if self.normal_sample_name is not None:
            options['normal_sample_name'] = self.normal_sample_name
        if self.phased_proximal_variants_vcf is not None:
            # Proximal-variant handling writes its own TSV and needs enough
            # flanking bases to cover the longest requested epitope length.
            proximal_variants_tsv = os.path.join(self.output_dir, self.sample_name + '.proximal_variants.tsv')
            options['proximal_variants_vcf'] = self.phased_proximal_variants_vcf
            options['proximal_variants_tsv'] = proximal_variants_tsv
            options['flanking_bases'] = max(self.epitope_lengths) * 4
            self.proximal_variants_file = proximal_variants_tsv

        self.converter(options).execute()
        print("Completed")
181

182
    def tsv_entry_count(self):
5✔
183
        with open(self.tsv_file_path()) as tsv_file:
5✔
184
            reader  = csv.DictReader(tsv_file, delimiter='\t')
5✔
185
            row_count = 0
5✔
186
            for row in reader:
5✔
187
                row_count += 1
5✔
188
        return row_count
5✔
189

190
    def split_tsv_file(self, total_row_count):
        """Split the intermediate TSV into resumable chunk files.

        total_row_count: number of data rows in the TSV (header excluded).
        Returns a list of [split_start, split_end] 1-based inclusive row
        ranges, one per chunk file written alongside the main TSV. A chunk
        file that already exists is left untouched (skip flag) so interrupted
        runs can resume without re-splitting.
        """
        status_message("Splitting TSV into smaller chunks")
        # Each TSV row expands into two FASTA records, so chunk at half the
        # requested FASTA size. Floor division fixes the original true
        # division, which produced a float chunk size — making the modulo
        # below operate on floats and emitting float range endpoints whenever
        # fasta_size was odd.
        tsv_size = self.fasta_size // 2
        chunks = []
        with open(self.tsv_file_path(), 'r') as tsv_file:
            reader      = csv.DictReader(tsv_file, delimiter='\t')
            row_count   = 1
            split_start = row_count
            split_end   = split_start + tsv_size - 1
            if split_end > total_row_count:
                split_end = total_row_count
            status_message("Splitting TSV into smaller chunks - Entries %d-%d" % (split_start, split_end))
            split_tsv_file_path = "%s_%d-%d" % (self.tsv_file_path(), split_start, split_end)
            chunks.append([split_start, split_end])
            if os.path.exists(split_tsv_file_path):
                status_message("Split TSV file for Entries %d-%d already exists. Skipping." % (split_start, split_end))
                skip = 1
            else:
                split_tsv_file      = open(split_tsv_file_path, 'w')
                split_tsv_writer    = csv.DictWriter(split_tsv_file, delimiter='\t', fieldnames = reader.fieldnames)
                split_tsv_writer.writeheader()
                skip = 0
            for row in reader:
                if skip == 0:
                    split_tsv_writer.writerow(row)
                if row_count == total_row_count:
                    break
                # Chunk boundary reached: roll over to the next chunk file.
                if row_count % tsv_size == 0:
                    if skip == 0:
                        split_tsv_file.close()
                    split_start = row_count + 1
                    split_end   = split_start + tsv_size - 1
                    if split_end > total_row_count:
                        split_end = total_row_count
                    status_message("Splitting TSV into smaller chunks - Entries %d-%d" % (split_start, split_end))
                    split_tsv_file_path = "%s_%d-%d" % (self.tsv_file_path(), split_start, split_end)
                    chunks.append([split_start, split_end])
                    if os.path.exists(split_tsv_file_path):
                        status_message("Split TSV file for Entries %d-%d already exists. Skipping." % (split_start, split_end))
                        skip = 1
                    else:
                        split_tsv_file      = open(split_tsv_file_path, 'w')
                        split_tsv_writer    = csv.DictWriter(split_tsv_file, delimiter='\t', fieldnames = reader.fieldnames)
                        split_tsv_writer.writeheader()
                        skip = 0
                row_count += 1
            # Close the final, still-open chunk file.
            if skip == 0:
                split_tsv_file.close()
        status_message("Completed")
        return chunks
240

241
    def generate_fasta(self, chunks):
        """Generate per-chunk (and per-epitope-length) FASTA and key files.

        chunks: list of [split_start, split_end] TSV row ranges produced by
        split_tsv_file(). pVACvector input is processed once per chunk across
        all epitope lengths; every other input type generates one FASTA per
        epitope length, skipping files that already exist (resumability).
        """
        status_message("Generating Variant Peptide FASTA and Key Files")
        for (split_start, split_end) in chunks:
            tsv_chunk = "%d-%d" % (split_start, split_end)
            # Each TSV row yields two FASTA entries, hence the doubled range.
            fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
            generate_fasta_params = {
                'downstream_sequence_length': self.downstream_sequence_length,
                'proximal_variants_file'    : self.proximal_variants_file,
            }
            if self.input_file_type == 'pvacvector_input_fasta':
                split_fasta_file_path = "{}_{}".format(self.split_fasta_basename(None), fasta_chunk)
                generate_fasta_params['input_file'] = self.tsv_file_path()
                generate_fasta_params['output_file_prefix'] = split_fasta_file_path
                generate_fasta_params['epitope_lengths'] = self.epitope_lengths
                generate_fasta_params['spacers'] = self.spacers
                status_message("Generating Variant Peptide FASTA and Key Files - Entries %s" % (fasta_chunk))
                fasta_generator = self.fasta_generator(generate_fasta_params)
                fasta_generator.execute()
            else:
                for epitope_length in self.epitope_lengths:
                    split_fasta_file_path = "{}_{}".format(self.split_fasta_basename(epitope_length), fasta_chunk)
                    # Resumability: don't regenerate an existing chunk FASTA.
                    if os.path.exists(split_fasta_file_path):
                        status_message("Split FASTA file for Epitope Length {} - Entries {} already exists. Skipping.".format(epitope_length, fasta_chunk))
                        continue
                    split_fasta_key_file_path = split_fasta_file_path + '.key'
                    generate_fasta_params['input_file'] = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
                    generate_fasta_params['epitope_length'] = epitope_length
                    generate_fasta_params['flanking_sequence_length'] = epitope_length - 1
                    generate_fasta_params['output_file'] = split_fasta_file_path
                    generate_fasta_params['output_key_file'] = split_fasta_key_file_path
                    status_message("Generating Variant Peptide FASTA and Key Files - Epitope Length {} - Entries {}".format(epitope_length, fasta_chunk))
                    fasta_generator = self.fasta_generator(generate_fasta_params)
                    fasta_generator.execute()
        status_message("Completed")
275

276
    def split_fasta_basename(self, epitope_length):
5✔
277
        if epitope_length is None:
5✔
278
            return os.path.join(self.tmp_dir, "{}.fa.split".format(self.sample_name))
5✔
279
        else:
280
            return os.path.join(self.tmp_dir, "{}.{}.fa.split".format(self.sample_name, epitope_length))
5✔
281

282
    def call_iedb(self, chunks):
        """Run binding predictions for every chunk/allele/length/algorithm combo.

        Builds one argument set per valid combination (skipping empty FASTA
        chunks, invalid allele/length pairs, and already-existing prediction
        files), then executes them in parallel with pymp. De-duplicated skip
        warnings are printed once, before the parallel section.
        """
        alleles = self.alleles
        epitope_lengths = self.epitope_lengths
        prediction_algorithms = self.prediction_algorithms
        argument_sets = []
        warning_messages = []
        for (split_start, split_end) in chunks:
            tsv_chunk = "%d-%d" % (split_start, split_end)
            if self.input_file_type == 'fasta':
                fasta_chunk = tsv_chunk
            else:
                # Two FASTA records per TSV row (wild-type and mutant).
                fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
            for a in alleles:
                for epl in epitope_lengths:
                    if self.input_file_type == 'pvacvector_input_fasta':
                        split_fasta_file_path = "{}_1-2.{}.tsv".format(self.split_fasta_basename(None), epl)
                    else:
                        split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(epl), fasta_chunk)
                    if os.path.getsize(split_fasta_file_path) == 0:
                        msg = "Fasta file {} is empty. Skipping".format(split_fasta_file_path)
                        if msg not in warning_messages:
                            warning_messages.append(msg)
                        continue
                    #begin of per-algorithm processing
                    for method in prediction_algorithms:
                        # Prediction classes are exported into this module via
                        # the prediction_class wildcard import.
                        prediction_class = globals()[method]
                        prediction = prediction_class()
                        if hasattr(prediction, 'iedb_prediction_method'):
                            iedb_method = prediction.iedb_prediction_method
                        else:
                            iedb_method = method
                        valid_alleles = prediction.valid_allele_names()
                        if a not in valid_alleles:
                            msg = "Allele %s not valid for Method %s. Skipping." % (a, method)
                            if msg not in warning_messages:
                                warning_messages.append(msg)
                            continue
                        valid_lengths = prediction.valid_lengths_for_allele(a)
                        if epl not in valid_lengths:
                            msg = "Epitope Length %s is not valid for Method %s and Allele %s. Skipping." % (epl, method, a)
                            if msg not in warning_messages:
                                warning_messages.append(msg)
                            continue

                        split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(epl), "tsv_%s" % fasta_chunk]))
                        # Resumability: existing prediction output is reused.
                        if os.path.exists(split_iedb_out):
                            msg = "Prediction file for Allele %s and Epitope Length %s with Method %s (Entries %s) already exists. Skipping." % (a, epl, method, fasta_chunk)
                            if msg not in warning_messages:
                                warning_messages.append(msg)
                            continue
                        arguments = [
                            split_fasta_file_path,
                            split_iedb_out,
                            method,
                            a,
                            '-r', str(self.iedb_retries),
                            '-e', self.iedb_executable,
                            '-l', str(epl),
                            '--tmp-dir', self.tmp_dir,
                            '--log-dir', self.log_dir(),
                        ]
                        argument_sets.append(arguments)

        for msg in warning_messages:
            status_message(msg)

        # Fan the accumulated argument sets out across n_threads workers.
        with pymp.Parallel(self.n_threads) as p:
            for index in p.range(len(argument_sets)):
                arguments = argument_sets[index]
                # Positional layout of `arguments` above: [1]=output file,
                # [2]=method, [3]=allele, [9]=epitope length (value of '-l').
                a = arguments[3]
                method = arguments[2]
                filename = arguments[1]
                epl = arguments[9]
                p.print("Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s" % (a, epl, method, filename))
                pvactools.lib.call_iedb.main(arguments)
                p.print("Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s - Completed" % (a, epl, method, filename))
358

359
    def parse_outputs(self, chunks):
        """Parse raw IEDB prediction chunks into per-allele/length TSV files.

        Returns the list of parsed chunk file paths. Already-parsed chunks are
        reused (resumability); allele/length/method combinations that produced
        no prediction files are skipped silently.
        """
        split_parsed_output_files = []
        for (split_start, split_end) in chunks:
            tsv_chunk = "%d-%d" % (split_start, split_end)
            if self.input_file_type in ['fasta', 'junctions']:
                fasta_chunk = tsv_chunk
            else:
                # Two FASTA records per TSV row (wild-type and mutant).
                fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
            for a in self.alleles:
                for epl in self.epitope_lengths:
                    split_iedb_output_files = []
                    for method in self.prediction_algorithms:
                        # Same prediction-class resolution as in call_iedb.
                        prediction_class = globals()[method]
                        prediction = prediction_class()
                        if hasattr(prediction, 'iedb_prediction_method'):
                            iedb_method = prediction.iedb_prediction_method
                        else:
                            iedb_method = method
                        valid_alleles = prediction.valid_allele_names()
                        if a not in valid_alleles:
                            continue
                        valid_lengths = prediction.valid_lengths_for_allele(a)
                        if epl not in valid_lengths:
                            continue
                        split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(epl), "tsv_%s" % fasta_chunk]))
                        if os.path.exists(split_iedb_out):
                            split_iedb_output_files.append(split_iedb_out)

                    split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, str(epl), "parsed", "tsv_%s" % fasta_chunk]))
                    # Resumability: reuse a previously parsed chunk.
                    if os.path.exists(split_parsed_file_path):
                        status_message("Parsed Output File for Allele %s and Epitope Length %s (Entries %s) already exists. Skipping" % (a, epl, fasta_chunk))
                        split_parsed_output_files.append(split_parsed_file_path)
                        continue
                    if self.input_file_type == 'pvacvector_input_fasta':
                        split_fasta_file_path = "{}_1-2.{}.tsv".format(self.split_fasta_basename(None), epl)
                    else:
                        split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(epl), fasta_chunk)
                    split_fasta_key_file_path = split_fasta_file_path + '.key'

                    if len(split_iedb_output_files) > 0:
                        status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s" % (a, epl, fasta_chunk))
                        split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
                        params = {
                            'input_iedb_files'       : split_iedb_output_files,
                            'input_tsv_file'         : split_tsv_file_path,
                            'key_file'               : split_fasta_key_file_path,
                            'output_file'            : split_parsed_file_path,
                        }
                        params['sample_name'] = self.sample_name
                        params['flurry_state'] = self.flurry_state
                        if self.additional_report_columns and 'sample_name' in self.additional_report_columns:
                            params['add_sample_name_column'] = True 
                        parser = self.output_parser(params)
                        parser.execute()
                        status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s - Completed" % (a, epl, fasta_chunk))

                        split_parsed_output_files.append(split_parsed_file_path)
        return split_parsed_output_files
417

418
    def combined_parsed_path(self):
5✔
419
        combined_parsed = "%s.all_epitopes.tsv" % self.sample_name
5✔
420
        return os.path.join(self.output_dir, combined_parsed)
5✔
421

422
    def combined_parsed_outputs(self, split_parsed_output_files):
        """Merge per-chunk parsed prediction files into one all-epitopes TSV."""
        status_message("Combining Parsed Prediction Files")
        args = list(split_parsed_output_files)
        args.append(self.combined_parsed_path())
        args += ['--top-score-metric', self.top_score_metric]
        # Tool-specific file-type flag; plain VCF input uses the default.
        file_types = {'fasta': 'pVACbind', 'junctions': 'pVACsplice'}
        if self.input_file_type in file_types:
            args += ['--file-type', file_types[self.input_file_type]]
        pvactools.lib.combine_parsed_outputs.main(args)
        status_message("Completed")
435

436
    def final_path(self):
5✔
437
        return os.path.join(self.output_dir, self.sample_name+".filtered.tsv")
5✔
438

439
    def execute(self):
        """Run the full pipeline: convert, chunk, predict, parse, post-process.

        Returns early when the converted TSV has no rows or when no parsed
        prediction files were produced. Temporary files are removed at the end
        unless keep_tmp_files is set.
        """
        self.print_log()
        self.convert_vcf()
        # pVACvector input already is a FASTA; all other types need one built.
        if self.input_file_type != 'pvacvector_input_fasta':
            self.generate_combined_fasta(self.fasta_file_path())

        total_row_count = self.tsv_entry_count()
        if total_row_count == 0:
            print("The TSV file is empty. Please check that the input VCF contains missense, inframe indel, or frameshift mutations.")
            return
        chunks = self.split_tsv_file(total_row_count)

        self.generate_fasta(chunks)
        self.call_iedb(chunks)
        split_parsed_output_files = self.parse_outputs(chunks)

        if len(split_parsed_output_files) == 0:
            status_message("No output files were created. Aborting.")
            return

        self.combined_parsed_outputs(split_parsed_output_files)

        # Shallow-copy all instance attributes as the PostProcessor config,
        # then override the run-specific entries below.
        post_processing_params = copy.copy(vars(self))
        post_processing_params['input_file'] = self.combined_parsed_path()
        post_processing_params['file_type'] = self.input_file_type
        post_processing_params['filtered_report_file'] = self.final_path()
        post_processing_params['fasta'] = self.fasta_file_path()
        post_processing_params['run_manufacturability_metrics'] = True
        if self.input_file_type == 'vcf':
            post_processing_params['file_type'] = 'pVACseq'
            post_processing_params['run_coverage_filter'] = True
            post_processing_params['run_transcript_support_level_filter'] = True
        else:
            post_processing_params['run_coverage_filter'] = False
            post_processing_params['run_transcript_support_level_filter'] = False
        if self.net_chop_method:
            # NetChop needs extra flanking residues around each epitope.
            net_chop_epitope_flank_length = 9
            self.generate_combined_fasta(self.net_chop_fasta_file_path(), net_chop_epitope_flank_length)
            post_processing_params['net_chop_fasta'] = self.net_chop_fasta_file_path()
            post_processing_params['run_net_chop'] = True
        else:
            post_processing_params['run_net_chop'] = False
        if self.netmhc_stab:
            post_processing_params['run_netmhc_stab'] = True
        else:
            post_processing_params['run_netmhc_stab'] = False
        PostProcessor(**post_processing_params).execute()

        if self.keep_tmp_files is False:
            shutil.rmtree(self.tmp_dir, ignore_errors=True)
489

490
class PvacbindPipeline(Pipeline):
    """Pipeline for pVACbind: runs MHC binding predictions directly on the
    peptide sequences of an input FASTA file.

    For each requested epitope length, the input FASTA is preprocessed into a
    per-length FASTA, split into chunks of ``self.fasta_size`` entries,
    submitted to the configured prediction algorithms (in parallel), and the
    per-chunk outputs are parsed and combined. Also serves as the base class
    for PvacsplicePipeline.
    """

    def fasta_entry_count(self):
        """Return the number of non-header lines in ``self.input_file``.

        NOTE(review): this counts sequence *lines*, not records — the result
        equals the record count only if every sequence occupies a single
        line. Presumably upstream writers emit single-line sequences;
        confirm against the FASTA producers.
        """
        with open(self.input_file) as fasta_file:
            row_count = 0
            for row in fasta_file:
                if not row.startswith('>'):
                    row_count += 1
        return row_count

    def fasta_basename(self, length):
        """Path of the per-epitope-length FASTA file inside the tmp dir."""
        return os.path.join(self.tmp_dir, "{}.{}.fa".format(self.sample_name, length))

    def split_fasta_basename(self, length):
        """Base path used to name the per-length split FASTA chunk files."""
        return "{}.split".format(self.fasta_basename(length))

    def uniquify_records(self, records):
        """Collapse records with identical sequences into a single record.

        Returns ``(uniq_records, keys)`` where ``uniq_records`` holds one
        SeqRecord per unique sequence, re-identified with sequential
        1-based integer ids, and ``keys`` maps each integer id to the list
        of original record ids sharing that sequence.

        Raises an Exception when two input records share the same id.
        """
        fasta_sequences = OrderedDict()
        ids = []
        for record in records:
            if record.id in ids:
                raise Exception("Duplicate fasta header {}. Please ensure that the input FASTA uses unique headers.".format(record.id))
            # Group the original record ids by their (stringified) sequence.
            fasta_sequences.setdefault(str(record.seq), []).append(record.id)
            ids.append(record.id)
        count = 1
        uniq_records = []
        keys = {}
        for sequence, ids in fasta_sequences.items():
            # One representative record per unique sequence; the numeric id
            # doubles as the key back to the original headers.
            record = SeqRecord(Seq(sequence), id=str(count), description=str(count))
            uniq_records.append(record)
            keys[count] = ids
            count += 1
        return (uniq_records, keys)

    def create_per_length_fasta_and_process_stops(self, length):
        """Write the per-length FASTA file (``self.fasta_basename(length)``).

        Each input sequence is upper-cased and truncated at the first stop
        signal ('X' or '*'); sequences containing unsupported amino acids
        are skipped with a warning, and sequences shorter than ``length``
        are dropped (they cannot yield an epitope of that length).
        """
        # NOTE(review): stop_chars is unused; the truncation below handles
        # 'X' and '*' directly.
        stop_chars = set('X*')
        supported_aas = supported_amino_acids()
        records = []
        for record in SeqIO.parse(self.input_file, "fasta"):
            sequence = str(record.seq).upper()
            # Truncate at the first 'X' or '*', whichever comes first.
            x_index = sequence.index('X') if 'X' in sequence else len(sequence)
            star_index = sequence.index('*') if '*' in sequence else len(sequence)
            sequence = sequence[0:min(x_index, star_index)]
            if not all([c in supported_aas for c in sequence]):
                logging.warning("Record {} contains unsupported amino acids. Skipping.".format(record.id))
                continue
            if len(sequence) >= length:
                record.seq = Seq(sequence)
                records.append(record)
        SeqIO.write(records, self.fasta_basename(length), "fasta")

    def split_fasta_file(self, length):
        """Split the per-length FASTA into chunk files of ``self.fasta_size``.

        Each chunk is uniquified (see ``uniquify_records``) and written as
        ``<split_basename>_<start>-<end>`` plus a ``.key`` YAML file mapping
        the numeric ids back to the original headers. Chunks whose files
        already exist are skipped (``skip`` flag), allowing reruns to resume.

        Returns the list of ``[split_start, split_end]`` entry ranges.

        NOTE(review): ``row_count`` counts records parsed from the
        per-length FASTA while ``fasta_entry_count`` counts sequence lines
        of the original input file — these can diverge if records were
        filtered out or sequences wrap; confirm.
        """
        fasta_entry_count = self.fasta_entry_count()
        status_message("Splitting FASTA into smaller chunks")
        chunks = []
        # NOTE(review): peptides is unused.
        peptides = []
        row_count   = 1
        split_start = row_count
        split_end   = split_start + self.fasta_size - 1
        if split_end > fasta_entry_count:
            split_end = fasta_entry_count
        status_message("Splitting FASTA into smaller chunks - Entries %d-%d" % (split_start, split_end))
        split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(length), split_start, split_end)
        split_fasta_key_file_path = "{}.key".format(split_fasta_file_path)
        chunks.append([split_start, split_end])
        # skip == 1 means this chunk's output already exists on disk and
        # records for it are neither collected nor rewritten.
        if os.path.exists(split_fasta_file_path):
            status_message("Split FASTA file for Entries %d-%d already exists. Skipping." % (split_start, split_end))
            skip = 1
        else:
            split_fasta_records = []
            skip = 0
        for record in SeqIO.parse(self.fasta_basename(length), "fasta"):
            if skip == 0:
                split_fasta_records.append(record)
            if row_count == fasta_entry_count:
                break
            # Chunk boundary reached: flush the current chunk and set up
            # the next one.
            if row_count % self.fasta_size == 0:
                if skip == 0:
                    (uniq_records, keys) = self.uniquify_records(split_fasta_records)
                    with open(split_fasta_file_path, 'w') as split_fasta_file:
                        SeqIO.write(uniq_records, split_fasta_file, "fasta")
                    with open(split_fasta_key_file_path, 'w') as split_fasta_key_file:
                        yaml.dump(keys, split_fasta_key_file, default_flow_style=False)
                    # NOTE(review): redundant — the `with` block above has
                    # already closed this file.
                    split_fasta_file.close()
                split_start = row_count + 1
                split_end   = split_start + self.fasta_size - 1
                if split_end > fasta_entry_count:
                    split_end = fasta_entry_count
                status_message("Splitting FASTA into smaller chunks - Entries %d-%d" % (split_start, split_end))
                split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(length), split_start, split_end)
                split_fasta_key_file_path = "{}.key".format(split_fasta_file_path)
                chunks.append([split_start, split_end])
                if os.path.exists(split_fasta_file_path):
                    status_message("Split FASTA file for Entries %d-%d already exists. Skipping." % (split_start, split_end))
                    skip = 1
                else:
                    split_fasta_records = []
                    skip = 0
            row_count += 1
        # Flush the final (possibly partial) chunk.
        if skip == 0:
            (uniq_records, keys) = self.uniquify_records(split_fasta_records)
            with open(split_fasta_file_path, 'w') as split_fasta_file:
                SeqIO.write(uniq_records, split_fasta_file, "fasta")
            with open(split_fasta_key_file_path, 'w') as split_fasta_key_file:
                yaml.dump(keys, split_fasta_key_file, default_flow_style=False)
        status_message("Completed")
        return chunks

    def call_iedb(self, chunks, length):
        """Run binding predictions for every chunk/allele/algorithm combo.

        Builds one command-line argument set per (chunk, allele, method)
        after filtering out invalid allele/length combinations, empty
        chunk files, and already-existing outputs, then executes them with
        ``pvactools.lib.call_iedb.main`` across ``self.n_threads`` parallel
        workers. Skip reasons are reported once each via status messages.
        """
        alleles = self.alleles
        prediction_algorithms = self.prediction_algorithms
        argument_sets = []
        warning_messages = []
        for (split_start, split_end) in chunks:
            tsv_chunk = "%d-%d" % (split_start, split_end)
            if self.input_file_type == 'fasta' or self.input_file_type == 'junctions':
                fasta_chunk = tsv_chunk
            else:
                # For other input types each TSV entry maps to two FASTA
                # records — presumably wildtype + mutant sequences; confirm.
                fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
            for a in alleles:
                split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(length), fasta_chunk)
                if os.path.getsize(split_fasta_file_path) == 0:
                    msg = "Fasta file {} is empty. Skipping".format(split_fasta_file_path)
                    if msg not in warning_messages:
                        warning_messages.append(msg)
                    continue
                # Per-algorithm processing for this chunk/allele pair.
                for method in prediction_algorithms:
                    # Prediction classes are resolved by name from the
                    # wildcard import of pvactools.lib.prediction_class.
                    prediction_class = globals()[method]
                    prediction = prediction_class()
                    if hasattr(prediction, 'iedb_prediction_method'):
                        iedb_method = prediction.iedb_prediction_method
                    else:
                        iedb_method = method
                    valid_alleles = prediction.valid_allele_names()
                    if a not in valid_alleles:
                        msg = "Allele %s not valid for Method %s. Skipping." % (a, method)
                        if msg not in warning_messages:
                            warning_messages.append(msg)
                        continue
                    valid_lengths = prediction.valid_lengths_for_allele(a)
                    if length not in valid_lengths:
                        msg = "Epitope Length %s is not valid for Method %s and Allele %s. Skipping." % (length, method, a)
                        if msg not in warning_messages:
                            warning_messages.append(msg)
                        continue

                    split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(length), "tsv_%s" % fasta_chunk]))
                    if os.path.exists(split_iedb_out):
                        msg = "Prediction file for Allele %s and Epitope Length %s with Method %s (Entries %s) already exists. Skipping." % (a, length, method, fasta_chunk)
                        if msg not in warning_messages:
                            warning_messages.append(msg)
                        continue
                    # Positional layout matters: indices 1 (output file),
                    # 2 (method), 3 (allele), and 9 (length) are read back
                    # in the parallel loop below.
                    arguments = [
                        split_fasta_file_path,
                        split_iedb_out,
                        method,
                        a,
                        '-r', str(self.iedb_retries),
                        '-e', self.iedb_executable,
                        '-l', str(length),
                        '--tmp-dir', self.tmp_dir,
                        '--log-dir', self.log_dir(),
                    ]
                    argument_sets.append(arguments)

        for msg in warning_messages:
            status_message(msg)

        # Fan the accumulated argument sets out over n_threads workers.
        with pymp.Parallel(self.n_threads) as p:
            for index in p.range(len(argument_sets)):
                arguments = argument_sets[index]
                a = arguments[3]
                method = arguments[2]
                filename = arguments[1]
                epl = arguments[9]
                p.print("Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s" % (a, epl, method, filename))
                pvactools.lib.call_iedb.main(arguments)
                p.print("Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s - Completed" % (a, epl, method, filename))

    def parse_outputs(self, chunks, length):
        """Parse the raw per-chunk prediction files into per-allele TSVs.

        For each chunk/allele, collects the existing per-method prediction
        files, then runs the pipeline's output parser
        (``self.output_parser``) against them together with the chunk's
        key file. Already-parsed chunks are skipped so reruns can resume.

        Returns the list of parsed-output file paths that exist after the
        call.
        """
        split_parsed_output_files = []
        for (split_start, split_end) in chunks:
            tsv_chunk = "%d-%d" % (split_start, split_end)
            if self.input_file_type == 'fasta' or self.input_file_type == 'junctions':
                fasta_chunk = tsv_chunk
            else:
                # Mirrors the chunk-name doubling used in call_iedb.
                fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
            for a in self.alleles:
                split_iedb_output_files = []
                for method in self.prediction_algorithms:
                    prediction_class = globals()[method]
                    prediction = prediction_class()
                    if hasattr(prediction, 'iedb_prediction_method'):
                        iedb_method = prediction.iedb_prediction_method
                    else:
                        iedb_method = method
                    valid_alleles = prediction.valid_allele_names()
                    if a not in valid_alleles:
                        continue
                    valid_lengths = prediction.valid_lengths_for_allele(a)
                    if length not in valid_lengths:
                        continue
                    split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(length), "tsv_%s" % fasta_chunk]))
                    if os.path.exists(split_iedb_out):
                        split_iedb_output_files.append(split_iedb_out)

                split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, str(length), "parsed", "tsv_%s" % fasta_chunk]))
                if os.path.exists(split_parsed_file_path):
                    status_message("Parsed Output File for Allele %s and Epitope Length %s (Entries %s) already exists. Skipping" % (a, length, fasta_chunk))
                    split_parsed_output_files.append(split_parsed_file_path)
                    continue
                split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(length), fasta_chunk)
                split_fasta_key_file_path = split_fasta_file_path + '.key'

                if len(split_iedb_output_files) > 0:
                    status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s" % (a, length, fasta_chunk))
                    split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
                    params = {
                        'input_iedb_files'       : split_iedb_output_files,
                        'input_tsv_file'         : split_tsv_file_path,
                        'key_file'               : split_fasta_key_file_path,
                        'output_file'            : split_parsed_file_path,
                    }
                    if self.input_file_type == 'junctions':
                        # pVACsplice uses one unsplit TSV for all chunks.
                        params['input_tsv_file'] = self.tsv_file_path()
                    params['sample_name'] = self.sample_name
                    params['flurry_state'] = self.flurry_state
                    if self.additional_report_columns and 'sample_name' in self.additional_report_columns:
                        params['add_sample_name_column'] = True 
                    parser = self.output_parser(params)
                    parser.execute()
                    status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s - Completed" % (a, length, fasta_chunk))

                    split_parsed_output_files.append(split_parsed_file_path)
        return split_parsed_output_files

    def execute(self):
        """Run the full pVACbind pipeline.

        Per epitope length: preprocess, split, predict, parse; then combine
        all parsed outputs, optionally run the PostProcessor (filters,
        manufacturability, NetChop/NetMHCstab), and clean up the tmp dir
        unless ``keep_tmp_files`` is set.
        """
        self.print_log()

        split_parsed_output_files = []
        for length in self.epitope_lengths:
            self.create_per_length_fasta_and_process_stops(length)
            chunks = self.split_fasta_file(length)
            self.call_iedb(chunks, length)
            split_parsed_output_files.extend(self.parse_outputs(chunks, length))

        if len(split_parsed_output_files) == 0:
            status_message("No output files were created. Aborting.")
            return

        self.combined_parsed_outputs(split_parsed_output_files)

        if not self.run_post_processor:
            return

        # Seed the post-processing parameters from this pipeline's own
        # attributes, then override the run-specific ones.
        post_processing_params = copy.copy(vars(self))
        post_processing_params['input_file'] = self.combined_parsed_path()
        post_processing_params['file_type'] = 'pVACbind'
        post_processing_params['filtered_report_file'] = self.final_path()
        # Coverage/TSL filters require variant-level data pVACbind lacks.
        post_processing_params['run_coverage_filter'] = False
        post_processing_params['run_transcript_support_level_filter'] = False
        post_processing_params['minimum_fold_change'] = None
        post_processing_params['run_manufacturability_metrics'] = True
        post_processing_params['fasta'] = self.input_file
        if self.net_chop_method:
            post_processing_params['net_chop_fasta'] = self.net_chop_fasta
            post_processing_params['run_net_chop'] = True
        else:
            post_processing_params['run_net_chop'] = False
        if self.netmhc_stab:
            post_processing_params['run_netmhc_stab'] = True
        else:
            post_processing_params['run_netmhc_stab'] = False
        PostProcessor(**post_processing_params).execute()

        if self.keep_tmp_files is False:
            shutil.rmtree(self.tmp_dir, ignore_errors=True)

768

769
class PvacsplicePipeline(PvacbindPipeline):
    """pVACsplice variant of the pVACbind pipeline.

    Unlike ``PvacbindPipeline.execute``, this runs a single pass that
    forwards ``self.epitope_lengths`` as-is to the inherited split /
    predict / parse helpers, and stops after the combined all-epitopes
    report — no PostProcessor step.
    """

    def execute(self):
        """Split the input FASTA, run predictions, and combine the parsed
        per-chunk outputs into the all-epitopes report."""
        self.print_log()

        # Stage a copy of the input FASTA inside the tmp dir.
        staged_fasta = os.path.join(self.tmp_dir, os.path.basename(self.input_file))
        shutil.copy(self.input_file, staged_fasta)

        lengths = self.epitope_lengths
        chunks = self.split_fasta_file(lengths)
        self.call_iedb(chunks, lengths)
        # chunks is a list of [start, end] entry ranges.
        parsed_files = list(self.parse_outputs(chunks, lengths))

        if not parsed_files:
            status_message("No output files were created. Aborting.")
            return

        # Creates the combined all_epitopes.tsv.
        self.combined_parsed_outputs(parsed_files)

        if self.keep_tmp_files is False:
            shutil.rmtree(self.tmp_dir, ignore_errors=True)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc