• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

X-lab-3D / PANDORA / 14772711772

01 May 2025 08:50AM UTC coverage: 53.813% (-3.2%) from 56.981%
14772711772

Pull #302

github

web-flow
Merge pull request #300 from X-lab-3D/issue_297

Issue_297
Pull Request #302: Development

30 of 258 new or added lines in 5 files covered. (11.63%)

70 existing lines in 3 files now uncovered.

1849 of 3436 relevant lines covered (53.81%)

0.54 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

40.55
/PANDORA/Pandora/Modelling_functions.py
1
from Bio.Align import substitution_matrices
1✔
2
import os
1✔
3
import traceback
1✔
4
import subprocess
1✔
5
import PANDORA
1✔
6
import pickle
1✔
7
from PANDORA import Model
1✔
8
# from Bio import Align
9
from Bio import pairwise2
1✔
10
#from PANDORA import Align
11
#import statistics
12
from Bio.Align import PairwiseAligner
1✔
13
from datetime import datetime
1✔
14
from copy import deepcopy
1✔
15
import re
1✔
16

17

18
def check_target_template(target, template):
    """ Checks if the target and the template are the same. If the user gave sequence info in the target, use that, else
        use the allele type.

    Args:
        target: (:obj:`Target`): Target object
        template: (:obj:`Template`): Template object

    Returns: (bool): True if target/template are the same, False if they are not.

    """
    same = False
    # The peptides must match before anything else is considered
    if target.peptide == template.peptide:
        if target.M_chain_seq == '':
            # No sequence information on the target: fall back to allele type
            same = any(allele in template.allele_type for allele in target.allele_type)
        else:
            # Sequence information is available: compare chain sequences
            if target.MHC_class == 'I':
                # MHC class I: the M chain sequence decides
                same = target.M_chain_seq == template.M_chain_seq
            elif target.MHC_class == 'II' and target.N_chain_seq != '':
                # MHC class II: both the M and the N chain sequences must match
                same = (target.M_chain_seq == template.M_chain_seq
                        and target.N_chain_seq == template.N_chain_seq)

    if same:
        print('\n\t---- THE TARGET HAS THE SAME PEPTIDE AND ALLELE/SEQUENCE INFORMATION AS THE TEMPLATE ----')
        print('\tYou can find it at: http://www.imgt.org/3Dstructure-DB/cgi/details.cgi?pdbcode=%s\n' %(template.id))

    return same
53

54

55
def check_presence(target, database, seq_based_templ_selection = False):
    ''' Checks if the target the user submitted, already exists in has a template in the database with the same allele
        and peptide.

    Args:
        target: Target object
        database: Database object
        seq_based_templ_selection: bool, select the template based on the chain sequences.

    Returns: bool/Template object. If the target is already in the db, return the Template, otherwise return False

    '''
    candidates = []
    found = False

    if not seq_based_templ_selection:
        # Allele-based matching: collect templates that share at least one
        # allele, then keep the one whose peptide is identical as well.
        if target.MHC_class == 'I':
            data = database.MHCI_data
        elif target.MHC_class == 'II':
            data = database.MHCII_data
        else:
            data = {}
        for key, templ in data.items():
            if any(allele in templ.allele_type for allele in target.allele_type):
                candidates.append(key)
        for key in candidates:
            if data[key].peptide == target.peptide:
                found = data[key]

    else:
        # Sequence-based matching: collect templates with identical chain
        # sequence(s), then require an identical peptide on top of that.
        if target.MHC_class == 'I':
            data = database.MHCI_data
            for key, templ in data.items():
                if templ.M_chain_seq == target.M_chain_seq:
                    candidates.append(key)
            for key in candidates:
                if data[key].peptide == target.peptide:
                    found = data[key]
        if target.MHC_class == 'II':
            data = database.MHCII_data
            for key, templ in data.items():
                # For class II both the M and the N chain must match
                if templ.M_chain_seq == target.M_chain_seq and templ.N_chain_seq == target.N_chain_seq:
                    candidates.append(key)
            for key in candidates:
                if data[key].peptide == target.peptide:
                    found = data[key]

    return found
115

116

117
def predict_anchors_netMHCIIpan(peptide, allele_type, output_dir, verbose=True, rm_netmhcpan_output=True):
    '''Uses netMHCIIpan to predict the binding core of a peptide and infer the anchor positions from that.

    Args:
        peptide: (str): AA sequence of the peptide
        allele_type: (lst): list of strings of allele types
        output_dir: (string) Path to output directory
        verbose: (bool): Print information. Default = True
        rm_netmhcpan_output: (bool): If True, removes the netmhcpan infile and outfile after having used them for netmhcpan.

    Returns: (lst): list of predicted anchor positions

    Raises:
        Exception: if netMHCIIpan cannot be located, or if none of the provided
            alleles is supported by netMHCIIpan.
    '''

    # Locate the netMHCIIpan executable: either the 'netMHCIIpan' environment
    # variable or whatever 'which' finds on the PATH. Empty strings (what
    # 'which' returns when nothing is found) are filtered out, so a missing
    # installation raises the helpful Exception below instead of failing later
    # with a confusing FileNotFoundError on a relative data path.
    netmhcpan_file_path = {x for x in [os.getenv('netMHCIIpan', default=None),
                                       os.popen('which netMHCIIpan').read().strip()]
                           if isinstance(x, str) and x != ''}
    try:
        netmhcpan_file_path = netmhcpan_file_path.pop()
    except KeyError:
        raise Exception("Need netMHCIIpan to predict anchor positions. Please download and install netMHCIIpan.\n\n"
        "You can request the software at https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 in the 'Downloads' section.\n"
        "After installing netMHCpan, make sure it's added to your PATH or as an alias to your .bashrc / .bash_profile.\n")

    netmhcpan_path = os.path.dirname(netmhcpan_file_path)

    # Collect every allele name supported by this netMHCIIpan installation
    all_netMHCpan_alleles = []
    with open(os.path.join(netmhcpan_path, 'data/allelelist.txt')) as f:
        for line in f:
            all_netMHCpan_alleles.append(line.split()[0].replace('\n', ''))

    # Format the alleles to netMHCIIpan readable format
    target_alleles = [i.split('-')[-1].replace('*', '_').replace(':','') for i in allele_type]

    # The DQ and DP alleles only function in pairs in netMHCIIpan, which we cannot match from our alleles
    # So take the first 3 partially matched allele combinations.
    # Collect the additions separately instead of rebinding the list being iterated.
    extra_alleles = []
    for i in target_alleles:
        if 'DRB' not in i:
            extra_alleles += [al for al in all_netMHCpan_alleles if i.replace('_', '') in al][:3]
    target_alleles = target_alleles + extra_alleles

    # for the DQ and DP cases, alleles are matched (e.g. 'HLA-DQA10102-DQB10602')
    # If two alleles are present in such a combi case, select that combi case as the target allele
    target_alleles_matched = []
    for al in target_alleles:
        hits = 0
        for part in [i.split('*')[-1] for i in allele_type]:
            if part in al:
                hits += 1
        if hits == 2:
            target_alleles_matched.append(al)
    if len(target_alleles_matched) > 0:
        target_alleles = target_alleles_matched

    # Keep only alleles netMHCIIpan actually supports
    target_alleles = [i for i in target_alleles if i in all_netMHCpan_alleles]
    # If there are no target alleles that occur in netMHCIIpan, but there is a mouse allele, use all mouse alleles
    # that are supported by netMHCIIpan
    if target_alleles == [] and any(al.startswith('H2') for al in allele_type):
        target_alleles = [i for i in all_netMHCpan_alleles if i.startswith('H-')]

    # If there is no target allele that occurs in netMHCIIpan, raise an Exception
    if target_alleles == []:
        raise Exception('ERROR: Provided allele is not available in netMHCIIpan-4.1.\n')

    target_alleles_str = ','.join(target_alleles)

    # Setup in/out files, timestamped to avoid collisions
    infile = os.path.join(output_dir,f'{peptide}_{target_alleles[0].replace("*","").replace(":","")}_{datetime.today().strftime("%Y%m%d_%H%M%S")}.txt')
    outfile = os.path.join(output_dir, f'{peptide}_{target_alleles[0].replace("*","").replace(":","")}_{datetime.today().strftime("%Y%m%d_%H%M%S")}_prediction.txt')

    # Write peptide sequence to input file for netMHCIIpan
    with open(infile, 'w') as f:
        f.write(peptide)

    try:
        # run netMHCIIpan
        subprocess.check_call('%s -f %s -inptype 1 -a %s > %s' % (netmhcpan_file_path, infile, target_alleles_str, outfile), shell=True)

        # Get the output from the netMHCIIpan prediction
        # {allele: (offset, core, core_reliability)}
        pred = {}
        with open(outfile) as f:
            for line in f:
                if peptide in line and not line.startswith('#'):
                    ln = [i for i in line[:-1].split(' ') if i != '']
                    pred[ln[1]] = (int(ln[3]), ln[4], float(ln[5]))

        # For each predicted core offset, keep only the best prediction
        max_scores = [max((i[::-1]) for i in list(pred.values()) if i[0] == s) for s in set([pred[i][0] for i in pred])]
        # reorder back to (offset, core, core_reliability)
        max_scores = [i[::-1] for i in sorted(max_scores, reverse=True)]

    except ValueError:
        print('Could not predict binding core using netMHCIIpan. Will use the most common anchor positions instead')
        return [3, 6, 8, 11]

    offset, core, core_reliability = max_scores[0]
    # Use the canonical spacing for 9-mer binding cores to predict the anchor positions
    predicted_anchors = [offset + 1, offset + 4, offset + 6, offset + 9]
    # Make sure the prediction is not longer than the peptide just in case
    predicted_anchors = [i for i in predicted_anchors if i <= len(peptide)]

    if verbose:
        print('\tPredicted the binding core using netMHCIIpan (4.0):\n')
        print('\toffset:\t%s\n\tcore:\t%s\n\tprob:\t%s\n' % (offset, core, core_reliability))
        print('\tPredicted peptide anchor residues (assuming canonical spacing): %s' % predicted_anchors)

    if rm_netmhcpan_output:
        # Clean up the temporary netMHCIIpan files without spawning a shell
        os.remove(infile)
        os.remove(outfile)

    return predicted_anchors
230
   
231

232
def predict_anchors_netMHCpan(peptide, allele_type, output_dir, verbose=True, rm_netmhcpan_output=True):
    '''Uses netMHCpan to predict the binding core of a peptide and infer the
    anchor positions from that.

    Args:
        peptide: (str): AA sequence of the peptide
        allele_type: (lst): list of strings of allele types
        output_dir: (string) Path to output directory
        verbose: (bool): Print information. Default = True
        rm_netmhcpan_output: (bool): If True, removes the netmhcpan infile and outfile after having used them for netmhcpan.

    Returns: (lst): list of two predicted anchor positions, or None if the
        allele is not supported or no binding core could be predicted.

    Raises:
        Exception: if the netMHCpan executable cannot be located.
    '''

    # Locate the netMHCpan executable: either the 'netMHCpan' environment
    # variable or whatever 'which' finds on the PATH. Empty strings (what
    # 'which' returns when nothing is found) are filtered out, so a missing
    # installation raises the helpful Exception below instead of failing later.
    netmhcpan_file_path = {x for x in [os.getenv('netMHCpan', default=None),
                                       os.popen('which netMHCpan').read().strip()]
                           if isinstance(x, str) and x != ''}
    try:
        netmhcpan_file_path = netmhcpan_file_path.pop()
    except KeyError:
        raise Exception("Need netMHCpan to predict anchor positions. Please download and install netMHCpan.\n\n"
        "You can request the software at https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 in the 'Downloads' section.\n"
        "After installing netMHCpan, make sure it's added to your PATH or as an alias to your .bashrc / .bash_profile.\n")

    netmhcpan_path = os.path.dirname(netmhcpan_file_path)

    # Collect every allele name supported by this netMHCpan installation
    all_netMHCpan_alleles = []
    with open(os.path.join(netmhcpan_path, 'data/allelenames')) as f:
        for line in f:
            all_netMHCpan_alleles.append(line.split()[0])#.replace(':',''))

    ## Format alleles to netMHCpan naming, per species prefix.
    ## Initialized to [] so an unrecognized prefix falls through to the
    ## "not available" error below instead of raising a NameError (bug fix).
    target_alleles = []
    if any(x.startswith('HLA') for x in allele_type):
        target_alleles = [i.replace('*','') for i in allele_type]
    elif any(x.startswith('BoLA') for x in allele_type):
        target_alleles = [i.replace(':','').replace('*',':') for i in allele_type]
    elif any(x.startswith('DLA') for x in allele_type):
        target_alleles = [i.replace(':','').replace('*','') for i in allele_type]
    elif any(x.startswith('Eqca') for x in allele_type):
        target_alleles = [i.replace(':','').replace('*','') for i in allele_type]
    elif any(x.startswith('Gogo') for x in allele_type):
        target_alleles = [i.replace(':','').replace('*','') for i in allele_type]
    elif any(x.startswith('Mamu') for x in allele_type):
        target_alleles = [i.replace(':','').replace('*',':') for i in allele_type]
    elif any(x.startswith('Patr') for x in allele_type):
        target_alleles = [i.replace(':','').replace('*','') for i in allele_type]
    elif any(x.startswith('SLA') for x in allele_type):
        target_alleles = [i.replace(':','').replace('*',':') for i in allele_type]

    ## Make sure only netMHCpan available alleles are used
    target_alleles = [i for i in target_alleles if i in all_netMHCpan_alleles]

    if len(target_alleles) == 0:
        print('ERROR: The provided Target allele is not available in NetMHCpan-4.1')
        return None

    target_alleles_str = ','.join(target_alleles)

    # Setup in/out files, timestamped to avoid collisions
    infile = os.path.join(output_dir,f'{peptide}_{target_alleles[0].replace("*","").replace(":","")}_{datetime.today().strftime("%Y%m%d_%H%M%S")}.txt')
    outfile = os.path.join(output_dir, f'{peptide}_{target_alleles[0].replace("*","").replace(":","")}_{datetime.today().strftime("%Y%m%d_%H%M%S")}_prediction.txt')

    # Write peptide sequence to input file for netMHCpan
    with open(infile, 'w') as f:
        f.write(peptide)

    subprocess.check_call('%s -p %s -a %s > %s' %(netmhcpan_file_path, infile, target_alleles_str, outfile), shell=True)

    # Get the output from the netMHCpan prediction
    # {allele: [(core, %rank_EL), ...]}
    pred = {}
    with open(outfile) as f:
        for line in f:
            if peptide in line and not line.startswith('#'):
                ln = [i for i in line[:-1].split(' ') if i != '']
                #ln[3] is core, ln[9] is Icore
                try:
                    pred[ln[1]].append((ln[3], float(ln[12])))
                except KeyError:
                    pred[ln[1]] = [(ln[3], float(ln[12]))]

    # Sort each allele result per Rank_EL (lower rank = stronger binder)
    for allele in pred:
        pred[allele] = list(sorted(pred[allele], key=lambda x:x[1]))

    if len(pred) == 0:
        print('ERROR: NetMHCpan-4.1 was not able to find any binding core for')
        print('the provided peptide and MHC allele')
        return None

    # For every allele, the binding core is predicted. Take the allele with the best (lowest) %Rank EL
    best_allele = min((pred[i][0][1], i) for i in pred)[1]

    # Do a quick alignment of the predicted core and the peptide to find the anchors. (the predicted binding core can
    # contain dashes -. Aligning them makes sure you take the right residue as anchor.)
    alignment = pairwise2.align.globalxx(peptide, pred[best_allele][0][0])

    # If there are multiple possible solutions, take the one with no gap at the anchor (second) position
    if len(alignment)>1:
        flag = False
        # Search for an option without a gap in the second position
        for prediction in alignment:
            if prediction[1][1] != '-' and prediction[0][1] != '-':
                pept1 = prediction[0]
                pept2 = prediction[1]
                flag = True
                break
        # If no such option is available, take the first one
        if flag==False:
            pept1 = alignment[0][0]
            pept2 = alignment[0][1]

    else:
        pept1 = alignment[0][0]
        pept2 = alignment[0][1]

    # Remove positions where both sequences have a gap
    to_remove = []
    for i, (aa1, aa2) in enumerate(zip(pept1, pept2)):
        if aa1 == aa2 == '-' and i != 0:
            to_remove.append(i)
    for x in reversed(to_remove):
        pept1 = pept1[0:x:]+pept1[x+1::]
        pept2 = pept2[0:x:]+pept2[x+1::]

    if verbose:
        print('Query peptide aligned to the core:')
        print(pept1)
        print(pept2)

    # Find the anchors by finding the first non dash from the left and from the right.
    # Start from the canonical anchors as the default.
    predicted_anchors = [2,len(peptide)]

    # Find the first anchor
    p1 = 0
    p2 = 0
    for i in range(len(pept2)):
        # if the second position has no gaps
        if i == 1 and pept2[i] != '-' and pept1[i] != '-':
            predicted_anchors[0] = p1 + 1
            break
        elif i > 1 and pept2[i] != '-':
            predicted_anchors[0] = p1 + 1
            break
        if pept1[i] != '-':
            p1 += 1
        if pept2[i] != '-':
            p2 += 1

    # Find the second anchor: last core residue, counted in peptide coordinates
    for i in range(len(pept2)):
        if pept2[::-1][i] != '-':
            predicted_anchors[1] = len([j for j in pept1[:len(pept1) -i] if j != '-'])
            break

    if verbose:
        print('\tPredicted the binding core using netMHCpan (4.1):\n')
        print('\tIcore:\t%s\n\t%%Rank EL:\t%s\n' %(pred[best_allele][0][0], pred[best_allele][0][1] ))
        print('\tPredicted peptide anchor residues (assuming canonical spacing): %s' %predicted_anchors)

    if rm_netmhcpan_output:
        # Clean up the temporary netMHCpan files without spawning a shell
        os.remove(infile)
        os.remove(outfile)

    return predicted_anchors
401

402

403
def score_peptide_alignment(target, template, substitution_matrix='PAM30'):
    ''' Calculate the alignment score of the target and template peptide

    Args:
        target: (Target): Target object
        template: (Template): Template object
        substitution_matrix: (str): name of subtitution matrix, default is PAM30 (BLOSUM80 etc)

    Returns: (flt): alignment score

    '''
    if target.MHC_class == 'I':
        matrix = substitution_matrices.load(substitution_matrix)
        total = 0
        # Fall back to canonical anchor positions when the target has none defined
        try:
            tar_anchors = target.anchors
        except:
            tar_anchors = [1, len(target.peptide) - 1]

        aligned_tar, aligned_temp = align_peptides(target.peptide,
                                                   tar_anchors[0], tar_anchors[1],
                                                   template.peptide,
                                                   template.anchors[0], template.anchors[1])

        # Score gaps through the matrix's '*' symbol
        aligned_tar = aligned_tar.replace('-', '*')
        aligned_temp = aligned_temp.replace('-', '*')

        for aa, bb in zip(aligned_tar, aligned_temp):
            # The matrix may only store one triangle; try both orientations
            try:
                total += matrix[aa, bb]
            except KeyError:
                try:
                    total += matrix[bb, aa]
                except KeyError:
                    # NOTE(review): this resets the running total to -50 rather
                    # than subtracting a penalty — kept exactly as the original.
                    total = -50

        return total

    elif target.MHC_class == 'II':
        # Align the two peptides on their first anchor position, padding both
        # ends with '*' until the strings have equal length.
        temp_pept = template.peptide
        temp_p1 = template.anchors[0]
        tar_pept = target.peptide
        tar_p1 = target.anchors[0]

        temp_pept = '*' * (tar_p1 - temp_p1) + temp_pept
        tar_pept = '*' * (temp_p1 - tar_p1) + tar_pept
        temp_pept = temp_pept + '*' * (len(tar_pept) - len(temp_pept))
        tar_pept = tar_pept + '*' * (len(temp_pept) - len(tar_pept))

        # Pairwise alignment with gap penalties harsh enough that no gaps open
        aligner = PairwiseAligner()
        aligner.substitution_matrix = substitution_matrices.load(substitution_matrix)
        aligner.gap_score = -1000
        aligner.end_open_gap_score = -1000000
        aligner.internal_open_gap_score = -10000

        # Align the sequences and return the raw score
        return aligner.align(tar_pept, temp_pept).score
473

474

475
def find_template(target, database, best_n_templates = 1, benchmark=False, 
                  blastdb=PANDORA.PANDORA_data + '/BLAST_databases/templates_blast_db/templates_blast_db'):
    ''' Selects the template structure that is best suited as template for homology modelling of the target.

    Candidate templates are ranked first by MHC chain identity (blast-based when
    chain sequences are available, allele-based otherwise) and then by peptide
    alignment score (PAM30).

    Args:
        target (PMHC.Target): Target object
        database (Database.Database): Database object
        best_n_templates (int): number of top-scoring templates to return. Default = 1
        benchmark (bool): if True, exclude the target itself from the candidates. Default = False
        blastdb (str): Path to blast database to use for sequence-based template selection.

    Returns: (tuple): list of selected Template objects, their (score, peptide, id)
        tuples, and a bool (keep_IL) that is True when any selected template is
        effectively identical to the target.

    Raises:
        Exception: if no putative template can be found.
    '''

    # {template_id: {'M_score': ..., 'N_score': ..., 'Avg_score': ...}}
    putative_templates = {}
    
    # Per-class settings: G-domain cutoff, database attribute name, ranking key
    if target.MHC_class == 'I':
        class_variables = [PANDORA.MHCI_G_domain[0][1], 'MHCI_data', 'M_score']
    elif target.MHC_class == 'II':
        class_variables = [PANDORA.MHCII_G_domain[0][1], 'MHCII_data', 'Avg_score']
        
    # Chains for which no usable sequence exists (missing or blast failed)
    no_seq_chains = []
    if target.M_chain_seq != '':
        # Sequence based template selection
        # Keep only the G-domain part of the chain
        M_chain = target.M_chain_seq[:class_variables[0]]
        # Blast M chain sequence against the template blast db
        try:
            M_chain_result = blast_mhc_seq(M_chain, chain='M', blastdb=blastdb)
            # Fill in the M identity score per putative template
            for result in M_chain_result:
                ID = result[0][:4]
                score = result[1]
                putative_templates[ID] = {'M_score': score}
        except Exception as e:
            print(e)
            print('WARNING: something went wrong with blast-based template selection.')
            print('Is blastp properly installed?')
            # If blast didn't work properly, consider this sequence missing
            no_seq_chains.append('M_score')
    
    else:
        no_seq_chains.append('M_score')
        
    # MHC class II also has an N chain to score
    if target.MHC_class == 'II':
        if target.N_chain_seq != '':
            # Keep only the G-domain part of the chain
            N_chain = target.N_chain_seq[:PANDORA.MHCII_G_domain[1][1]]
            # Blast N chain sequence against the template blast db
            try:
                N_chain_result = blast_mhc_seq(N_chain, chain='N', blastdb=blastdb)
            
                # Fill in the N identity score per putative template
                for result in N_chain_result:
                    ID = result[0][:4]
                    score = result[1]
                    try:
                        putative_templates[ID]['N_score']= score
                        # (Avg_score is computed further below)
                    except KeyError:
                        putative_templates[ID] = {'N_score': score}
                        
            except Exception as e:
                print(e)
                print('WARNING: something went wrong with blast-based template selection.')
                print('Is blastp properly installed?')
                # If blast didn't work properly, consider this sequence missing
                no_seq_chains.append('N_score')
            
                
        else: 
            no_seq_chains.append('N_score')
    
    # For every chain without a sequence, fall back to allele-based matching:
    # every template sharing an allele with the target gets a score of 100
    if no_seq_chains !=[]:
        for C in no_seq_chains:
            # Collect all alleles available in the database (minus the target in benchmark mode)
            available_alleles = []
            for ID in getattr(database, class_variables[1]):
                if benchmark and ID == target.id:
                    pass
                else:
                    available_alleles.extend(getattr(database, class_variables[1])[ID].allele_type)
            available_alleles = list(set(available_alleles))
            
            # Translate the target's allele names into the database naming scheme
            target_alleles = allele_name_adapter(target.MHC_class, target.allele_type, available_alleles)
            target_alleles = list(set(target_alleles))
            # Give every template with a (partially) matching allele the maximum score
            for ID in getattr(database, class_variables[1]):
                if benchmark:
                    if ID != target.id:
                        if any(y in x for x in getattr(database, class_variables[1])[ID].allele_type for y in target_alleles):
                            try:
                                putative_templates[ID][C] = 100.0
                            except KeyError:
                                putative_templates[ID] = {C : 100.0}
                else:
                    if any(y in x for x in getattr(database, class_variables[1])[ID].allele_type for y in target_alleles):
                        try:
                            putative_templates[ID][C] = 100.0
                        except KeyError:
                            putative_templates[ID]= {C : 100.0}
    
    # Keep only templates present in the template db.
    # This prevents errors caused by different blast and pandora db.
    putative_templates = {k:v for k,v in putative_templates.items() if k in getattr(database, class_variables[1]).keys()}
    
    # Remove target from putative_templates if benchmark run
    if benchmark:
        if target.id in putative_templates.keys():
            del putative_templates[target.id]
                                
    if target.MHC_class == 'II':
        # Average the M and N scores; templates missing either chain score are dropped
        for ID in putative_templates:
            if len(putative_templates[ID].keys()) == 2:
                putative_templates[ID]['Avg_score'] = (putative_templates[ID]['M_score'] + putative_templates[ID]['N_score']) /2
        
        putative_templates = {x : putative_templates[x] for x in putative_templates 
                              if 'Avg_score' in list(putative_templates[x].keys())}
         # Make sure there is no template with only 3 anchors for benchmarking.
        if benchmark:
            putative_templates = {k:v for k,v in putative_templates.items() if len(database.MHCII_data[k].anchors) == 4}
        

    # For both chains:
    # Sort templates on the class-specific ranking key (M_score or Avg_score)
    putative_templates = sorted(putative_templates.items(), 
                                key=lambda x: x[1][class_variables[2]], reverse=True)
    putative_templates = {x[0] : x[1] for x in putative_templates}
    # Keep only the templates that share the maximum score
    try:
        max_score = list(putative_templates.values())[0][class_variables[2]]
    except IndexError:
        raise Exception('Putative templates list empty.')
    putative_templates = {x : putative_templates[x] for x in putative_templates 
                          if putative_templates[x][class_variables[2]] == max_score}

    # Among the remaining templates, find the one with the best matching peptide
    pos_list = []
    for ID in putative_templates:
        score = score_peptide_alignment(target, getattr(database, class_variables[1])[ID], substitution_matrix='PAM30')
        pos_list.append((score, getattr(database, class_variables[1])[ID].peptide, ID))

    if len(pos_list) == 0:
        raise Exception('Pandora could not find any putative template! Please try to define your own template or contact us for help')
    
    # Sort templates per peptide score and keep the best n
    template_id = [i[-1] for i in sorted(pos_list, key=lambda elem: elem[0], reverse=True)][:best_n_templates]
    scores = sorted(pos_list, key=lambda elem: elem[0], reverse=True)[:best_n_templates]

    templates = [getattr(database, class_variables[1])[tmpl] for tmpl in template_id]
    # keep_IL: True when a selected template is effectively identical to the target
    keep_IL = any(check_target_template(target, tmpl) for tmpl in templates)

    return templates, scores, keep_IL
631

632
def write_ini_script(target, template, alignment_file, output_dir, clip_C_domain=False):
    ''' Writes the MyLoop.py and cmd_modeller_ini.py files. This function takes two template python scripts and fills
        in the required information: Anchor positions for the MyLoop file and structure name + alignment file for the
        cmd_modeller_ini file.

    Args:
        target: Target object
        template: Template object, or list of Template objects
        alignment_file: (string) path to alignment file
        output_dir: (string) path to output directory
        clip_C_domain: (bool) if True, the B2M chain is left out of the renamed segments

    '''

    anch = target.anchors

    if target.MHC_class == 'I':
        # 'with' guarantees both handles are closed even when the loop breaks or a format fails
        with open(output_dir + '/MyLoop.py', 'w') as myloopscript, \
                open(PANDORA.PANDORA_path + '/Pandora/MyLoop_template.py', 'r') as MyL_temp:
            for line in MyL_temp:
                # Include or not B2M depending on clip_C_domain
                if '#RENAME SEGMENTS PLACEHOLDER' in line:
                    # NOTE(review): these writes carry no trailing '\n'; the template file
                    # apparently tolerates this -- confirm against MyLoop_template.py
                    if not clip_C_domain:
                        myloopscript.write("        self.rename_segments(segment_ids=['M', 'B', 'P'], renumber_residues=[1, 1, 1])")
                    else:
                        myloopscript.write("        self.rename_segments(segment_ids=['M', 'P'], renumber_residues=[1, 1])")
                # Only the region between the first and last anchor stays flexible
                elif 'self.residue_range' in line and 'M.selection' in line:
                    myloopscript.write(line % (anch[0]+1, anch[-1]-1))
                # Everything after this marker belongs to the full modelling script, not the ini script
                elif 'SPECIAL_RESTRAINTS_BREAK' in line:
                    break
                elif 'contact_file = open' in line:
                    myloopscript.write(line %target.id)
                else:
                    myloopscript.write(line)

    if target.MHC_class == 'II':
        with open(output_dir + '/MyLoop.py', 'w') as myloopscript, \
                open(PANDORA.PANDORA_path + '/Pandora/MyLoop_template_II.py', 'r') as MyL_temp:
            for line in MyL_temp:
                # For MHC class II the whole peptide is selected
                if 'self.residue_range' in line and 'M.selection' in line:
                    myloopscript.write("        return M.selection(self.residue_range('%i:P', '%i:P'))\n" %(1, len(target.peptide)))

                elif 'SPECIAL_RESTRAINTS_BREAK' in line:
                    break
                elif 'contact_file = open' in line:
                    myloopscript.write(line %target.id)
                else:
                    myloopscript.write(line)

    # NOTE(review): the '\\ ' -> ' ' replacement is applied here but not to the MyLoop paths
    # above -- confirm whether output_dir can actually contain escaped spaces.
    with open(output_dir.replace('\\ ', ' ') + '/cmd_modeller_ini.py', 'w') as modscript, \
            open(PANDORA.PANDORA_path + '/Pandora/cmd_modeller_ini.py', 'r') as cmd_m_temp:
        for line in cmd_m_temp:
            if 'alnfile' in line:
                modscript.write(line % os.path.basename(alignment_file))
            elif 'knowns' in line:
                # One or multiple templates can be given
                if isinstance(template, list):
                    modscript.write(
                        'knowns = (%s), sequence = "%s",\n' % (','.join(['"' + i.id + '"' for i in template]), target.id))
                else:
                    modscript.write(
                        'knowns = (%s), sequence = "%s",\n' % ('"' + template.id + '"', target.id))
            else:
                modscript.write(line)
def write_modeller_script(target, template, alignment_file, output_dir, n_homology_models=1, n_loop_models = 20,
                          loop_refinement='slow', n_jobs=None, helix = False, sheet = False, 
                          restraints_stdev=False, clip_C_domain=False):

    ''' Write the MyLoop.py and cmd_modeller.py scripts that refine the loops of the peptide

    Args:
        target (PANDORA.PMHC.PMHC.Target): Target object
        template (PANDORA.PMHC.PMHC.Template): Template object, or list of Template objects
        alignment_file (str): path to alignment file
        output_dir (str): path to output directory
        n_homology_models (int): number of homology models that are generated per run.
        n_loop_models (int): number of loop models modeller generates per homology model
        n_jobs (int): number of parallel jobs. It is recommended to use at most as many jobs as the number of
            models: more will not add any benefit but might occupy cores unnecessarily.
        loop_refinement (str): Level of loop refinement: very_fast,fast,slow,very_slow,slow_large.
            Defaults to slow
        helix (list): List of the alpha helix start and end-positions as integers. I.e. [3,8] for a helix between
            peptide residue 3 and 8.
        sheet (list): List containing: start position of B-sheet 1, start position of B-sheet 2 and the length of the
            B-sheet in h-bonds. For example: ["O:2:P","N:54:M",2] for a parallel B-sheet; The sheet starts
            at the Oxigen atom of the 2nd residue of chain P and at the Nitrogen of the 54th residue of
            chain M and has a length of 2 H-bonds. Or; ["N:6:P", "O:13:P", -3], with -3 denoting an
            anti-parallel B-sheet with a length of 3 H-bonds.
        restraints_stdev (bool or float): if True, keeps the whole peptide flexible. Increases computational time by
            30-50% but increases accuracy. If float, it is used as standard deviation of modelling restraints.
            Higher = more flexible restraints. Defaults to False. Setting it to True only will set the default
            standard deviation to 0.1.
        clip_C_domain (bool): if True, the B2M chain is left out of the renamed segments.

    '''

    anch = target.anchors
    # restraints_stdev may be a bool (use the default stdev of 0.1) or an explicit float value
    if isinstance(restraints_stdev, float):
        stdev = restraints_stdev
    else:
        stdev = 0.1

    if target.MHC_class == 'I':
        # 'with' guarantees both handles are closed even when a format operation fails
        with open(output_dir.replace('\\ ', ' ') + '/MyLoop.py', 'w') as myloopscript, \
                open(PANDORA.PANDORA_path + '/Pandora/MyLoop_template.py', 'r') as MyL_temp:
            for line in MyL_temp:
                # Include or not B2M depending on clip_C_domain
                if '#RENAME SEGMENTS PLACEHOLDER' in line:
                    if not clip_C_domain:
                        myloopscript.write("        self.rename_segments(segment_ids=['M', 'B', 'P'], renumber_residues=[1, 1, 1])")
                    else:
                        myloopscript.write("        self.rename_segments(segment_ids=['M', 'P'], renumber_residues=[1, 1])")
                # Add flexible region selection range
                elif 'self.residue_range' in line and 'M.selection' in line:
                    if restraints_stdev:
                        myloopscript.write(line %(1, len(target.peptide)))  # keep the whole peptide flexible
                    else:
                        myloopscript.write(line %(anch[0]+1, anch[-1]-1))  # only the inter-anchor region is flexible
                # Add restraints standard deviation (only effective on non-fixed residues)
                elif 'STDEV MARKER' in line:
                    myloopscript.write(line %(stdev))
                # Add contact file name
                elif 'contact_file = open' in line:
                    myloopscript.write(line %(target.id))
                # Add Alpha helix restraints
                elif helix and 'ALPHA-HELIX-MARKER' in line:
                    myloopscript.write(line.replace('# ALPHA-HELIX-MARKER', 'rsr.add(M.secondary_structure.alpha(self.residue_range("%s:P", "%s:P")))' %(helix[0], helix[1])))
                # Add Beta sheet restraints
                elif sheet and 'BETA-SHEET-MARKER' in line:
                    myloopscript.write(line.replace('# BETA-SHEET-MARKER', 'rsr.add(M.secondary_structure.sheet(atoms["%s"], atoms["%s"], sheet_h_bonds=%s))' %(sheet[0], sheet[1], sheet[2])))
                else:
                    myloopscript.write(line)

    if target.MHC_class == 'II':
        with open(output_dir.replace('\\ ', ' ') + '/MyLoop.py', 'w') as myloopscript, \
                open(PANDORA.PANDORA_path + '/Pandora/MyLoop_template_II.py', 'r') as MyL_temp:
            for line in MyL_temp:
                if 'ANCHORS_PLACEHOLDER' in line:
                    # Clamp the first/last anchors so the flanking flexible regions are never empty
                    if anch[0] == 0:
                        anch_1 = 1
                    else:
                        anch_1 = anch[0]
                    if anch[-1] == (len(target.peptide)-1):
                        anch_term = len(target.peptide)
                    else:
                        anch_term = anch[-1]
                    # Write first and last anchors, to keep only the flanking regions flexible
                    if restraints_stdev:
                        myloopscript.write("        return M.selection(self.residue_range('%i:P', '%i:P'))\n" %(1, len(target.peptide)))
                    else:
                        myloopscript.write("        return M.selection(self.residue_range('%i:P', '%i:P'), self.residue_range('%i:P', '%i:P'))\n" % 
                                            (1, anch_1, anch_term, len(target.peptide)))
                elif 'contact_file = open' in line:
                    myloopscript.write(line %(target.id))
                elif 'STDEV MARKER' in line:
                    myloopscript.write(line %(stdev))
                elif helix and 'ALPHA-HELIX-MARKER' in line:
                    myloopscript.write(line.replace('# ALPHA-HELIX-MARKER', 'rsr.add(M.secondary_structure.alpha(self.residue_range("%s:P", "%s:P")))' %(helix[0], helix[1])))
                elif sheet and 'BETA-SHEET-MARKER' in line:
                    myloopscript.write(line.replace('# BETA-SHEET-MARKER', 'rsr.add(M.secondary_structure.sheet(atoms["%s"], atoms["%s"], sheet_h_bonds=%s))' %(sheet[0], sheet[1], sheet[2])))
                else:
                    myloopscript.write(line)

    with open(output_dir.replace('\\ ', ' ') + '/cmd_modeller.py', 'w') as modscript, \
            open(PANDORA.PANDORA_path + '/Pandora/cmd_modeller_template.py', 'r') as cmd_m_temp:
        for line in cmd_m_temp:
            if 'alnfile' in line:
                modscript.write(line %(os.path.basename(alignment_file)))
            elif 'knowns' in line:
                # One or multiple templates can be given
                if isinstance(template, list):
                    modscript.write(
                        'knowns = (%s), sequence = "%s",\n' % (','.join(['"' + i.id + '"' for i in template]), target.id))
                else:
                    modscript.write(
                        'knowns = (%s), sequence = "%s",\n' % ('"' + template.id + '"', target.id))
            elif 'a.ending_model' in line:
                modscript.write(line % (n_homology_models))
            elif 'a.loop.ending_model' in line:
                modscript.write(line % (n_loop_models))
            elif 'a.loop.md_level' in line:
                modscript.write('a.loop.md_level = MA.refine.%s   # Loop model refinement level' %(loop_refinement))
            elif n_jobs is not None:
                # Parallel job: complete the placeholder line with the number of cores
                if 'PARALLEL_JOB_LINE_TO_COMPLETE' in line:
                    modscript.write(line %(str(n_jobs)))
                else:
                    modscript.write(line)  # Write the line as it is
            else:
                # Serial job: drop the lines that are only needed for parallelization
                if 'PARALLEL_JOB_LINE' in line:
                    pass
                else:
                    modscript.write(line)  # Write the line as it is
def run_modeller(output_dir, target, python_script = 'cmd_modeller.py', benchmark = False, pickle_out = True,
                 keep_IL = False, RMSD_atoms = ['C', 'CA', 'N', 'O']):
    ''' Perform the homology modelling.

    Args:
        output_dir: (string) path to output directory
        target: Target object
        python_script:  (string) path to script that performs the modeller modelling. cmd_modeller.py
        benchmark: (bool) Perform L-RMSD calculations? only works if the target id is an existing pdb id
        pickle_out: (bool) Save a .pkl with the results
        keep_IL: (bool) if True, also keep the initial loop (IL) model as one of the results. Set when
            target and template are identical (and while benchmarking).
        RMSD_atoms: (list) atom names for RMSD calculation; kept for interface compatibility,
            not used directly inside this function.

    Returns: (list) of Model objects

    '''
    # Identify current working directory so it can be restored afterwards
    cwd = os.getcwd()

    # Change working directory and run Modeller to perform homology modelling.
    # try/finally guarantees the working directory is restored even if modelling fails.
    os.chdir(output_dir)
    try:
        # NOTE(review): relies on 'python3' being on PATH; stdout goes to modeller.log
        os.popen('python3 %s > modeller.log' %python_script).read()
    finally:
        os.chdir(cwd)

    # Parse .log file: keep the summary lines formatted as "<model file> <molpdf> <dope>"
    logf = []
    with open(output_dir + '/modeller.log') as f:
        for line in f:
            if line.startswith(target.id + '.'):   #target.id
                l = line.split()
                if len(l) == 3: #TODO: make sure the line is reporting the model with its score. Format: model, molpdf, dope.
                    logf.append(tuple(l))

    # If keep_IL is true (happens if the target and template are the same), also use the initial model as one of the
    # results. This will also happen while benchmarking.
    if keep_IL:
        # Also take the Initial Loop model. Take the molpdf from the pdb header.
        il_file = [i for i in os.listdir(output_dir) if i.startswith(target.id + '.IL')][0]
        # Create a fake molpdf/dope score for the IL model: the best molpdf/dope from the real models - 1
        try:
            fake_molpdf = str(min(float(i[1]) for i in logf) - 1)
            fake_dope = str(min(float(i[2]) for i in logf) - 1)
        except ValueError:
            # No real models were parsed. Fall back to string scores (strings, not ints,
            # so the tsv string concatenation below cannot raise TypeError)
            fake_molpdf = '-10000'
            fake_dope = '-10000'
            print('WARNING: ValueError exception raised while assigning fake molpdf and dope to IL model')
        # Append the filename and molpdf to the rest of the data
        logf.append((il_file, fake_molpdf, fake_dope))

    # Sort output by molpdf
    logf.sort(key=lambda tup:float(tup[1]))

    # Write to output file
    with open(output_dir + '/molpdf_DOPE.tsv', 'w') as f:
        for i in logf:
            f.write(i[0] + '\t' + i[1] + '\t' + i[2] + '\n')

    # Create Model object of each theoretical model and add it to results
    results = []
    for i in range(len(logf)):
        try:
            m = Model.Model(target, model_path=output_dir + '/' + logf[i][0], output_dir = output_dir,
                                            molpdf=logf[i][1], dope=logf[i][2])
        except Exception:
            # Best effort: record the failure and keep going with the remaining models
            print('WARNING: Error raised while calling Model.Model() for case %s' %target.id)
            print(traceback.format_exc())
            m = None
        results.append(m)

    # Save results as pickle
    if pickle_out:
        with open("%s/results_%s.pkl" %(output_dir, os.path.basename(os.path.normpath(output_dir))), "wb") as pkl_file:
            pickle.dump(results, pkl_file)

    return results
def blast_mhc_seq(seq, chain='M', blastdb=PANDORA.PANDORA_data + '/BLAST_databases/refseq_blast_db/refseq_blast_db'):
    """Blast an MHC chain sequence against a reference BLAST database.

    Args:
        seq (str): amino acid sequence of the MHC chain.
        chain (str): chain label, used only in error messages. Defaults to 'M'.
        blastdb (str): path to the BLAST database to search against.

    Returns:
        list: (subject ID, percent identity) tuples sorted by identity, descending.

    Raises:
        Exception: if the blastp run produced no output.
    """
    try:
        # The query is fed via bash process substitution '<(echo ...)',
        # which is why /bin/bash is required as the executable below.
        command = (' ').join(['blastp','-db',blastdb, 
                                                 '-query',
                                                 '<(echo %s)' %seq,
                                                 '-outfmt','6'])
        proc = subprocess.Popen(command,  executable='/bin/bash',
                                     shell=True, stdout=subprocess.PIPE)
        blast_result = proc.stdout.read()
        blast_result = blast_result.decode()
    # NOTE(review): Popen itself never raises CalledProcessError (only check_call/check_output/
    # run(check=True) do), so this handler looks like dead code; a failed blastp run instead
    # surfaces as empty output below -- confirm and simplify.
    except subprocess.CalledProcessError as e:
        raise Exception('An error occurred while blasting %s chain seq: %s' %(chain, e.output))
    
    if not blast_result:
        raise Exception('An error occurred while blasting %s chain seq: blast output empty' %(chain))

    # Parse tabular output (-outfmt 6): one tab-separated hit per line
    blast_result = blast_result.split('\n')
    blast_result = [x.replace(';',' ').split('\t') for x in blast_result]
    blast_result = [x for x in blast_result if x != ['']]
    
    # Fill in putative_templates M identity score
    results = []
    for result in blast_result:
        ID = result[1]            # subject sequence ID (column 2 of outfmt 6)
        score = float(result[2])  # percent identity (column 3 of outfmt 6)
        results.append((ID, score))
    results = sorted(results, key=lambda x: x[1], reverse=True)
    
    return results
def align_peptides(seq1, anch1_seq1, anch2_seq1, seq2, anch1_seq2, anch2_seq2):
    '''
    Align two MHC-I peptides making the anchors overlap.
    This function does NOT use an alignment matrix (e.g. BLOSUM, PAM, etc).
    It computes a simple anchor position alignment and inserts gaps in the
    head, middle and tail regions to make the final sequences have the same length.

    Args:
        seq1(str) : sequence of the first peptide.
        anch1_seq1(int) : position of the first anchor of seq1. Position must be given in Python numbering (0-N)
        anch2_seq1(int) : position of the second anchor of seq1. Position must be given in Python numbering (0-N)
        seq2(str) : sequence of the second peptide.
        anch1_seq2(int) : position of the first anchor of seq2. Position must be given in Python numbering (0-N)
        anch2_seq2(int) : position of the second anchor of seq2. Position must be given in Python numbering (0-N)

    Returns:
        tuple: (ali_seq1, ali_seq2), the two gap-padded sequences of equal length.
    '''

    # Inter-anchor (core) lengths
    seq1_core = anch2_seq1 - anch1_seq1
    seq2_core = anch2_seq2 - anch1_seq2
    # Tails: residues from the second anchor onwards of each peptide.
    # Fixed: tail2 was sliced from the FIRST anchor (seq2[anch1_seq2:]), which
    # made the tail lengths incomparable.
    tail1 = [x for x in seq1[anch2_seq1:]]
    tail2 = [x for x in seq2[anch2_seq2:]]

    list1 = [x for x in seq1]
    list2 = [x for x in seq2]
    # Adding gaps in cores (into the middle of the shorter core)
    if seq1_core > seq2_core:
        for x in range(seq1_core - seq2_core):
            list2.insert(int(len(seq2)/2), '-')
    elif seq1_core < seq2_core:
        for x in range(seq2_core - seq1_core):
            list1.insert(int(len(seq1)/2), '-')
    # Adding gaps in heads (leading gaps so the first anchors line up)
    if anch1_seq1 > anch1_seq2:
        for x in range(anch1_seq1 - anch1_seq2):
            list2.insert(0, '-')
    elif anch1_seq1 < anch1_seq2:
        for x in range(anch1_seq2 - anch1_seq1):
            list1.insert(0, '-')
    # Adding gaps in tails
    if len(tail1) > len(tail2):
        for x in range(len(tail1) - len(tail2)):
            list2.insert(-1, '-')
    elif len(tail1) < len(tail2):
        # Fixed: the range was len(tail1) - len(tail2) (negative), so no gap was
        # ever inserted and the two outputs ended up with different lengths.
        for x in range(len(tail2) - len(tail1)):
            list1.insert(-1, '-')

    ali_seq1 = ('').join(list1)
    ali_seq2 = ('').join(list2)
    return ali_seq1, ali_seq2
def _truncate_allele(allele, cuts, available_alleles):
    '''Return the allele name truncated to the longest form found in available_alleles.

    The full name is tried first; then each cut length in order. Matching is a
    substring test against every available allele name. The last cut length is
    applied unconditionally when nothing matches (same fallback as before).

    Args:
        allele(str) : allele name to adapt.
        cuts(tuple) : truncation lengths to try, longest first; last one is the fallback.
        available_alleles : iterable of allele names present in the database.

    Returns:
        str : the (possibly truncated) allele name.
    '''
    keys = list(available_alleles)
    if any(allele in key for key in keys):
        return allele
    for cut in cuts[:-1]:
        if any(allele[:cut] in key for key in keys):
            return allele[:cut]
    return allele[:cuts[-1]]


def allele_name_adapter(MHC_class, ori_alleles, available_alleles):
    '''
    Cuts the given allele names to make them consistent with the alleles in available_alleles.

    Args:
        MHC_class(str) : 'I' or 'II'.
        ori_alleles(list) : Allele names.
        available_alleles : iterable of allele names present in the dataset.

    Returns:
        alleles(list) : List of adapted (cut) allele names
    '''
    alleles = deepcopy(ori_alleles)

    if MHC_class == 'I':
        # (species prefix, truncation lengths to try in order). The lengths reproduce
        # the previous per-species if/elif ladders; the last length is the fallback.
        truncation_rules = (
            ('HLA',   (8, 6, 4)),    # Human
            ('H2',    (4, 3)),       # Mouse
            ('RT1',   (5, 4)),       # Rat
            ('BoLA',  (10, 7, 5)),   # Bovine
            ('SLA',   (9, 6, 4)),    # Suine
            ('MH1-B', (8, 6)),       # Chicken
            ('MH1-N', (9, 6)),       # Chicken
            ('BF2',   (6, 4)),       # Chicken
            ('Mamu',  (13, 9, 5)),   # Monkey
            ('Eqca',  (10, 7, 5)),   # Horse
        )
        for a in range(len(alleles)):
            for prefix, cuts in truncation_rules:
                if alleles[a].startswith(prefix):
                    alleles[a] = _truncate_allele(alleles[a], cuts, available_alleles)
                    break
            # Alleles from unlisted species are left unchanged, as before.

    elif MHC_class == 'II':
        for a in range(len(alleles)):
            if alleles[a].startswith('HLA'):      # Human
                # Decompose e.g. 'HLA-DRB1*01:02' into prefix/gene/chain/subgene/group.
                # Raw strings avoid the invalid '\*' escape the old code relied on.
                prefix = alleles[a].split('-')[0]
                locus = re.split(r'-|\*', alleles[a])[1]
                gene = locus[:2]
                chain = locus[2]
                subgene = locus[3:]
                group = re.split(r':|\*', alleles[a])[1]

                if any(alleles[a] in key for key in list(available_alleles)):
                    pass
                elif any(prefix+'-'+gene+chain+subgene+'*'+group in key for key in list(available_alleles)):
                    print('WARNING: The provided allele subgroup has not been found. PANDORA will treat this case as %s' %(prefix+'-'+gene+chain+subgene+'*'+group))
                    alleles[a] = prefix+'-'+gene+chain+subgene+'*'+group
                elif any(prefix+'-'+gene+chain+subgene in key for key in list(available_alleles)):
                    alleles[a] = prefix+'-'+gene+chain+subgene
                    print('WARNING: The provided allele group has not been found. PANDORA will treat this case as %s' %(prefix+'-'+gene+chain+subgene))
                else:
                    alleles[a] = prefix+'-'+gene+chain

            else:
                # Other species might be implemented later
                pass
    return alleles
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc