• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ensembl / ensembl-variation / #612095329

24 Oct 2023 08:58AM UTC coverage: 83.243%. First build
#612095329

Pull #1048

travis-ci

Pull Request #1048: HGVS refseq - edit alleles (112)

25 of 25 new or added lines in 2 files covered. (100.0%)

22623 of 27177 relevant lines covered (83.24%)

1427.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.79
/modules/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm
1
=head1 LICENSE
2

3
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4
Copyright [2016-2025] EMBL-European Bioinformatics Institute
5

6
Licensed under the Apache License, Version 2.0 (the "License");
7
you may not use this file except in compliance with the License.
8
You may obtain a copy of the License at
9

10
     http://www.apache.org/licenses/LICENSE-2.0
11

12
Unless required by applicable law or agreed to in writing, software
13
distributed under the License is distributed on an "AS IS" BASIS,
14
WITHOUT WARRANTIES OR CONDITIONS OF ANY kind, either express or implied.
15
See the License for the specific language governing permissions and
16
limitations under the License.
17

18
=cut
19

20

21
=head1 CONTACT
22

23
 Please email comments or questions to the public Ensembl
24
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25

26
 Questions may also be sent to the Ensembl help desk at
27
 <http://www.ensembl.org/Help/Contact>.
28

29
=cut
30

31
=head1 NAME
32

33
Bio::EnsEMBL::Variation::TranscriptVariationAllele
34

35
=head1 SYNOPSIS
36

37
    use Bio::EnsEMBL::Variation::TranscriptVariationAllele;
38
    
39
    my $tva = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new(
40
        -transcript_variation   => $tv,
41
        -variation_feature_seq  => 'A',
42
        -is_reference           => 0,
43
    );
44

45
    print "sequence with respect to the transcript: ", $tva->feature_seq, "\n";
46
    print "sequence with respect to the variation feature: ", $tva->variation_feature_seq, "\n";
47
    print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $tva->get_all_OverlapConsequences }), "\n";
48
    print "amino acid change: ", $tva->pep_allele_string, "\n";
49
    print "resulting codon: ", $tva->codon, "\n";
50
    print "reference codon: ", $tva->transcript_variation->get_reference_TranscriptVariationAllele->codon, "\n";
51
    print "PolyPhen prediction: ", $tva->polyphen_prediction, "\n";
52
    print "SIFT prediction: ", $tva->sift_prediction, "\n";
53

54
=head1 DESCRIPTION
55

56
A TranscriptVariationAllele object represents a single allele of a TranscriptVariation.
57
It provides methods that are specific to the sequence of the allele, such as codon,
58
peptide etc. Methods that depend only on position (e.g. CDS start) will be found in 
59
the associated TranscriptVariation. Ordinarily you will not create these objects 
60
yourself, but instead you would create a TranscriptVariation object which will then 
61
construct TranscriptVariationAlleles based on the allele string of the associated
62
VariationFeature. 
63

64
Note that any methods that are not specific to Transcripts will be found in the 
65
VariationFeatureOverlapAllele superclass.
66

67
=cut
68

69
package Bio::EnsEMBL::Variation::TranscriptVariationAllele;
70

71
use strict;
31✔
72
use warnings;
31✔
73

74
use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw($AA_LOOKUP);
31✔
75
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
31✔
76
use Bio::EnsEMBL::Variation::Utils::Sequence qw(hgvs_variant_notation format_hgvs_string get_3prime_seq_offset);
31✔
77
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
31✔
78
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap within_cds within_intron stop_lost start_lost frameshift stop_retained);
31✔
79

80
use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele);
31✔
81

82

83
our $DEBUG = 0;
84
our $NO_TRANSFER = 0;
85

86
sub new_fast {
87
    my ($class, $hashref, $strong) = @_;
103✔
88
    
89
    # swap a transcript_variation argument for a variation_feature_overlap one
90
    if ($hashref->{transcript_variation}) {
103✔
91
        $hashref->{variation_feature_overlap} = delete $hashref->{transcript_variation};
86✔
92
    }
93

94
    # and call the superclass
95

96
    return $class->SUPER::new_fast($hashref, $strong);
103✔
97
}
98

99
=head2 _return_3prime
100

101
  Description: Shifts an insertion/deletion as far as possible in the 3' direction, relative
102
               to the transcript.
103
               Called in TranscriptVariation.pm for original mapping and in 
104
               TranscriptVariationAllele.pm if required for HGVS
105
  Exceptions : none
106

107
=cut
108

109
sub _return_3prime {
110
    
111
  ## Will create a "shift_hash", containing info on precisely how the variant should be shifted when required
112
  my $self = shift;
445✔
113
  my $hgvs_only = shift;
445✔
114
  
115
  ## Return if we have already calculated a shifting hash for this allele
116
  return if defined($self->{shift_hash}) || ($self->is_reference && !$hgvs_only);
445✔
117
  
118
  my $tv = $self->transcript_variation;
441✔
119
  my $vf = $tv->base_variation_feature;
441✔
120
  my $var_class = $vf->var_class;
441✔
121
  
122
  ## Don't even attempt shifting if it's not an insertion or deletion
123
  return unless ($var_class eq 'insertion' || $var_class eq 'deletion' );
441✔
124

125
  ## If the associated database adaptor has switched HGVS shifting off, don't shift anything
126
  return if (defined($vf->adaptor) && defined($vf->adaptor->db)) && ($vf->adaptor->db->shift_hgvs_variants_3prime() == 0);
199✔
127
  
128
  my $vf_allele_string = $vf->allele_string;
188✔
129
  return if $vf_allele_string =~ /INS|DEL|HGMD|COSM|CNV/i;
188✔
130

131
  my $tr = $tv->transcript; 
187✔
132
  $self->initialise_unshifted_values;
187✔
133
  
134
  ## Checks to see if a simliar shift hash already exists for this variation feature.
135
  ## If not, a genomic shift is performed
136
  my $hash = $self->check_tva_shifting_hashes;
187✔
137
  $self->{shift_hash} = $hash if(defined($hash));
187✔
138
  
139
  ## Performs a shift in either the 5' or 3' direction depending on the strand of the transcript
140
  if (!defined($self->{shift_hash}))
187✔
141
  {
142
    $self->_genomic_shift($tr->strand);
134✔
143
    $hash = $self->check_tva_shifting_hashes;
134✔
144
    $self->{shift_hash} = $hash if(defined($hash))
134✔
145
  }
146

147
  return unless (defined($tv->cdna_start) && defined($tv->cdna_end) && defined($tv->cds_start) && defined($tv->cds_end)) 
187✔
148
  || (defined($tv->cdna_start_unshifted) && defined($tv->cdna_end_unshifted) && defined($tv->cds_start_unshifted) && defined($tv->cds_end_unshifted));
149
  
150
  return if ($tr->stable_id =~ /ENS/);
163✔
151
  my @attribs = @{$tr->get_all_Attributes()};
×
152
  
153
  ## Checks to see if the underlying sequence has been edited
154
  my @edit_attrs = grep {$_->code =~ /^_rna_edit/ && !$self->is_polyA($_)} @attribs;
×
155
  
156
  ## Return unless the RefSeq transcript has edited sequence - we need to check flanking sequences
157
  return unless scalar(@edit_attrs);
×
158
  
159
  my $hgvs_notation;
×
160

161
  ## Get all shift hashes generated for the associated variation feature
162
  ## If the flanking sequences are the same for (shift_length + 1) bases, we can just copy
163
  ## that shift hash rather than calculating another one
164
  my @preshifted_hashes = @{$self->base_variation_feature->{tva_shift_hashes}};
×
165
  if(scalar(@preshifted_hashes))
×
166
  {
167
    my ($slice_start2, $slice_end2, $slice ) = $self->_var2transcript_slice_coords($tr, $tv, $vf);
×
168

169
    if(defined($slice))
×
170
    {
171
      ## Sort through each shift hash with a corresponding transcript slice 
172
      ## Get the flanking sequences from the slice object and compare to previous hashes
173
      foreach my $shifted_obj (@preshifted_hashes)
×
174
      { 
175
        my $start = $var_class eq 'insertion' ? $tv->cdna_end_unshifted : $tv->cdna_start_unshifted;
×
176
        my $end = $var_class eq 'insertion' ? $tv->cdna_start_unshifted : $tv->cdna_end_unshifted;
×
177

178
        my $seq_to_search = defined($tr->{_variation_effect_feature_cache}->{spliced_seq}) ? $tr->{_variation_effect_feature_cache}->{spliced_seq} : $tr->seq->seq;
×
179
        my $whole_seq = substr($seq_to_search, $start - $shifted_obj->{shift_length} - 2, $end - $start + 1 + (2 * ($shifted_obj->{shift_length} + 1)));
×
180
        reverse_comp(\$whole_seq) if $tr->strand != 1;
×
181
        
182
        if(($shifted_obj->{type} eq 'ins' && (length($whole_seq) != ((2 * ($shifted_obj->{shift_length} + 1))))) || 
×
183
          ($shifted_obj->{type} eq 'del' && (length($whole_seq) != ((2 * ($shifted_obj->{shift_length} + 1)) + length($shifted_obj->{allele_string})))) )
184
        {
185
          ## This happens when an insertion/deletion gets too close to the transcript boundary and shifting in either direction at transcript level becomes tricky.
186
          $self->{shift_hash} = $shifted_obj;
×
187
          return;
×
188
        }
189
        
190
        my $pre_substr = substr($whole_seq, 0 , 1 + $shifted_obj->{shift_length});
×
191
        my $post_substr = substr($whole_seq, , -1 - $shifted_obj->{shift_length});
×
192
        if ($pre_substr eq $shifted_obj->{five_prime_flanking_seq} && $post_substr eq $shifted_obj->{three_prime_flanking_seq})
×
193
        {
194
          ## If both flanking sequences match, just copy over the shift hash
195
          $self->{shift_hash} = $shifted_obj;
×
196
          return;
×
197
        }
198
        
199
      }
200
    }
201
    else { ## This vf does not lie within that feature
202
      $self->{shift_hash} = $vf->{shift_hash} if (defined($vf->{shift_hash}) && $tr->strand == 1);
×
203
      $self->{shift_hash} = $vf->{shift_hash_reverse} if (defined($vf->{shift_hash_reverse}) && $tr->strand == -1);
×
204

205
      return;
×
206
    }
207
  }
208
  
209
  ## If we have no matching shift hashes already calculated, then we'll have to generate it
210
  
211
  my $edited_transcript_seq = defined($tr->{_variation_effect_feature_cache}->{spliced_seq}) ? $tr->{_variation_effect_feature_cache}->{spliced_seq} : $tr->seq->seq;
×
212
  my $start = $var_class eq 'insertion' ? $tv->cdna_end_unshifted : $tv->cdna_start_unshifted;
×
213
  my $end = $var_class eq 'insertion' ? $tv->cdna_start_unshifted : $tv->cdna_end_unshifted;
×
214

215
  ## To prevent having to work with too large a sequence, we get transcript sequence, get sequence +/- 1000bp from variant location, and check how far it can shift.
216
  ## Could be tidied up into a 'Shift 50 bases and if it's longer than that then take a larger slice'
217
  my $area_to_search = 1000;
×
218
  
219
  ## Gets flanking sequences around the variant to test if shifting is possible
220
  my $search_start = ($start - $area_to_search - 1) < 0 ? 0 : ($start - $area_to_search - 1);
×
221
  my $search_end = ($end + $area_to_search > length($edited_transcript_seq)) ? length($edited_transcript_seq) : ($end + $area_to_search);
×
222
  my $seqs = substr($edited_transcript_seq, $search_start, $search_end - $search_start);
×
223
  my $pre_seq = substr($seqs, 0, $start - $search_start - 1); 
×
224
  my $post_seq = substr($seqs, 0 - ($search_end - $end));
×
225
  
226
  my $unshifted_allele_string = $self->allele_string;
×
227
  my @allele_string = split('/', $unshifted_allele_string);
×
228
  my $hgvs_output_string = $allele_string[1];
×
229
  
230
  
231
  ## isolate correct sequence to attempt to shift
232
  my $seq_to_check;
×
233
  my $type;
×
234
  if ($var_class eq 'deletion')
×
235
  {
236
    $seq_to_check = $allele_string[0];
×
237
    $type = 'del';
×
238
  }
239
  elsif ($var_class eq 'insertion') 
240
  {
241
    $seq_to_check = $allele_string[1];
×
242
    $type = 'ins';
×
243
  }
244
  
245
  my $shift_length;
×
246
  my $strand = $tr->strand;
×
247

248
  # vf can be on different strand than transcript
249
  my $vf_strand = $self->variation_feature->strand;
×
250
  reverse_comp(\$seq_to_check) if $vf_strand != $strand;
251
  
252
  ## Actually performs the shift, and provides raw data in order to create shifting hash
×
253
  ($shift_length, $seq_to_check, $hgvs_output_string, $start, $end) = $self->perform_shift($seq_to_check, $post_seq, $pre_seq, $start, $end, $hgvs_output_string, (-1 * ($strand -1))/2, $strand); 
254
  
×
255
  ## Creates shift_hash to attach to VF and TVA objects for 
256
  $self->create_shift_hash($seq_to_check, $post_seq, $pre_seq, $start, $end, $hgvs_output_string, $type, $shift_length, $strand, 0);
257

258
  return;
259
}
260

261
=head2 check_tva_shifting_hashes
262

263
  Description: Checks to see if a matching shifting hash already exists for another
264
               tva object attached to the same variation feature
265
  Returntype : hash
266
  Status     : At Risk
267

321✔
268
=cut
321✔
269

321✔
270
sub check_tva_shifting_hashes {
321✔
271
  my $self = shift;
272
  my $preshifted_hashs = $self->base_variation_feature->{tva_shift_hashes};  
187✔
273
  my @shift_hashes = grep {$_->{unshifted_allele_string} eq $self->allele_string  && $self->transcript->strand eq $_->{strand}} @{$preshifted_hashs} if defined($preshifted_hashs);
274
  if (scalar(@shift_hashes) == 1)
134✔
275
  {
276
    return $shift_hashes[0];
277
  }
278
  return undef;
279
}
280

281
=head2 perform_shift
282

283
  Description: Calculates the maximum possible shift in the 3' direction, provides the 
284
               shift length, shifted allele string, and new start/end coordinates
285
               Requires flanking sequences, allele string, strand and feature location
286
  Returntype : 5 scalars - shift length, shifted allele string, hgvs allele string, start position, end position
287
  Status     : At Risk
288

289
=cut
134✔
290

291
sub perform_shift {
134✔
292
  ## Performs the shifting calculation
134✔
293
  my ($self, $seq_to_check, $post_seq, $pre_seq, $var_start, $var_end, $hgvs_output_string, $reverse, $seq_strand) = @_;
294
  ## get length of pattern to check 
295
  my $indel_length = (length $seq_to_check);
134✔
296
  my $shift_length = 0;
134✔
297

298
  # hgvs_output_string can be on different strand than seq_to_check
134✔
299
  my $vf_strand = $self->variation_feature->strand;
300
  my $hgvs_reverse = ($vf_strand != $seq_strand);
174✔
301
  
174✔
302
  ## Sets up the for loop, ensuring that the correct bases are compared depending on the strand
174✔
303
  my $loop_limiter = $reverse ? (length($pre_seq) - $indel_length) + 1 : (length($post_seq) - $indel_length);
304
  $loop_limiter = length($post_seq) if $loop_limiter < 0;
174✔
305
  
306
  for (my $n = $reverse; $n <= $loop_limiter; $n++ ){
74✔
307
    ## check each position in deletion/ following seq for match
74✔
308
    my $check_next_del;
74✔
309
    my $check_next_pre;
310
    my $hgvs_next_del;
311
    
100✔
312
    if ($reverse)
100✔
313
    {
100✔
314
      $check_next_del  = substr($seq_to_check, length($seq_to_check) -1, 1);
315
      $check_next_pre = substr($pre_seq, length($pre_seq) - $n, 1);
174✔
316
      $hgvs_next_del  = $hgvs_reverse ? substr($hgvs_output_string, 0, 1) : substr($hgvs_output_string, length($hgvs_output_string) -1, 1);
317
    }
318
    else{
40✔
319
      $check_next_del  = substr($seq_to_check, 0, 1);
320
      $check_next_pre = substr($post_seq, $n, 1);
321
      $hgvs_next_del  = $hgvs_reverse ? substr($hgvs_output_string, length($hgvs_output_string) -1, 1) : substr($hgvs_output_string, 0, 1); 
40✔
322
    }
323
    if($check_next_del eq $check_next_pre){
29✔
324
      
29✔
325
      ## move position of deletion along
29✔
326
      $shift_length++;
29✔
327
      
328
      ## Reforms the sequences to check and the HGVS output strings for the next iteration
329
      if($reverse)
330
      {
11✔
331
        $seq_to_check = substr($seq_to_check, 0, length($seq_to_check) -1);
11✔
332
        $hgvs_output_string = $hgvs_reverse ? substr($hgvs_output_string,1) : substr($hgvs_output_string, 0, length($hgvs_output_string) -1);
11✔
333
        $seq_to_check = $check_next_del . $seq_to_check;  
11✔
334
        $hgvs_output_string = $hgvs_reverse ? $hgvs_output_string . $hgvs_next_del : $hgvs_next_del . $hgvs_output_string;
11✔
335
      }
11✔
336
      else
337
      {
338
        $seq_to_check = substr($seq_to_check,1);
339
        $hgvs_output_string = $hgvs_reverse ? substr($hgvs_output_string, 0, length($hgvs_output_string) -1) : substr($hgvs_output_string,1);
134✔
340
        $seq_to_check .= $check_next_del;
341
        $hgvs_output_string = $hgvs_reverse ? $hgvs_next_del . $hgvs_output_string : $hgvs_output_string . $hgvs_next_del;
342
        $var_start++;
343
        $var_end++;
134✔
344
      }
345
    }
346
    else{
347
      last;            
348
    }
349
  }
350
  
351
  return $shift_length, $seq_to_check, $hgvs_output_string, $var_start, $var_end;
352
}
353

354
=head2 create_shift_hash
355

356
  Description: Generates a hash to attach to the TVA object to store shifting info.
357
               Can contain shifting info for both directions for genes with 
358
               transcripts on both strands
359
  Returntype : hash
134✔
360
  Status     : Stable
134✔
361

134✔
362
=cut
134✔
363

364

134✔
365
sub create_shift_hash
134✔
366
{
134✔
367
  my ($self, $seq_to_check, $post_seq, $pre_seq, $var_start, $var_end, $hgvs_output_string, $type, $shift_length, $strand, $genomic) = @_;
368
  my $vf = $self->variation_feature;
369
  my $five_prime_flanking_seq = substr($pre_seq, -1 - $shift_length);
370
  my $three_prime_flanking_seq = substr($post_seq, 0, $shift_length + 1);
371
  
372
  my @allele_string = split('/', $self->allele_string);
373
  reverse_comp(\$seq_to_check) if $vf->strand <0; 
374
  my %shift_hash = (
375
    "genomic" => $genomic,
376
    "strand" => $strand,
377
    "shifted_allele_string"  => $seq_to_check,
378
    "unshifted_allele_string" => $self->allele_string, 
379
    "shift_length"  => $shift_length,
380
    "start" => $var_start,
381
    "end" => $var_end,
382
    "type" => $type,
383
    "unshifted_start" => $vf->seq_region_start,
384
    "unshifted_end" => $vf->seq_region_end,
385
    "hgvs_allele_string" => $hgvs_output_string,
134✔
386
    "ref_orig_allele_string" => $allele_string[0],
134✔
387
    "alt_orig_allele_string" => $allele_string[1],
134✔
388
    "_hgvs_offset" => $shift_length,
134✔
389
    "five_prime_flanking_seq" => $five_prime_flanking_seq,
134✔
390
    "three_prime_flanking_seq" => $three_prime_flanking_seq,
391
    "allele_string" => $self->allele_string, 
392
  );
134✔
393
    $self->{shift_hash} = \%shift_hash unless $genomic;
394
    $vf->{shift_hash} = \%shift_hash if ($strand == 1 && $genomic);
395
    $vf->{shift_hash_reverse} = \%shift_hash if ($strand == -1 && $genomic);
396
    $vf->{tva_shift_hashes} = [] unless defined($vf->{tva_shift_hashes});
397
    push @{$vf->{tva_shift_hashes}}, \%shift_hash;
398
    
399
    
400
    return %shift_hash;
401
}
402

403
=head2 _genomic_shift
404

405
  Description: Performs the shifting at genomic level - all that's required for EnsEMBL 
406
               transcripts. Shifts 3' on either strand
134✔
407
  Status     : Stable
134✔
408

134✔
409
=cut
134✔
410

134✔
411
sub _genomic_shift
412
{
134✔
413
  ## Does the initial shift at the genomic level, can be on either strand
414
  my $self = shift;
134✔
415
  my $strand = shift;
134✔
416
  my $tv = $self->transcript_variation;
417
  my $vf = $tv->variation_feature;
418
  my $var_class = $vf->var_class;
419

134✔
420
  return unless ($var_class eq 'insertion' || $var_class eq 'deletion' );
134✔
421
  
134✔
422
  my $slice_to_shrink = $vf->slice;
423
  my ($slice_start, $slice_end, $var_start, $var_end) = ($slice_to_shrink->start, $slice_to_shrink->end, $vf->seq_region_start, $vf->seq_region_end );
134✔
424
  
134✔
425
  ## Gets chr slice, gets sequence +/- 1000bp from variant location, and checks how far it can shift.
426
  ## Could be tidied up into a 'Shift 50 bases and if it's longer than that then take a larger slice'
427
  my $area_to_search = 1000;
134✔
428
  my $orig_start = $var_start;
134✔
429
  my $orig_end = $var_end;
134✔
430
  
431
  my $new_slice = $slice_to_shrink->expand(0 - ($var_start - $slice_start - $area_to_search), 0 - ($slice_end - $var_end - $area_to_search));
134✔
432
  $new_slice = $new_slice->constrain_to_seq_region();
134✔
433
  
134✔
434
  ## Gets flanking sequences around the variant to test if shifting is possible
134✔
435
  my $seqs = $new_slice->seq;
436
  my $pre_seq = substr($seqs, 0, $area_to_search); 
437
  my $post_seq = substr($seqs, 0 - $area_to_search);
134✔
438
  
134✔
439
  my $unshifted_allele_string = $self->allele_string;
134✔
440
  my @allele_string = split('/', $unshifted_allele_string);
441
  $allele_string[1] = '-' if (($self->is_reference) && (scalar(@allele_string) == 1));
56✔
442
  my $hgvs_output_string = $allele_string[1];
56✔
443
  
444
  ## isolate correct sequence to attempt to shift
445
  my $seq_to_check;
446
  my $type;
78✔
447
  if ($var_class eq 'deletion')
78✔
448
  {
449
    $seq_to_check = $allele_string[0];
450
    $type = 'del';
134✔
451
  }
134✔
452
  elsif ($var_class eq 'insertion') 
453
  {
454
    $seq_to_check = $allele_string[1];
134✔
455
    $type = 'ins';
456
  }
457
  
134✔
458
  my $shift_length;
459
  reverse_comp(\$seq_to_check) if $self->variation_feature->strand <0; 
460
  
461
  ## Actually performs the shift, and provides raw data in order to create shifting hash
462
  ($shift_length, $seq_to_check, $hgvs_output_string, $var_start, $var_end) = $self->perform_shift($seq_to_check, $post_seq, $pre_seq, $var_start, $var_end, $hgvs_output_string, (-1 * ($strand -1))/2, 1); 
463
  
464
  ## Creates shift_hash to attach to VF and TVA objects for 
465
  $self->create_shift_hash($seq_to_check, $post_seq, $pre_seq, $var_start, $var_end, $hgvs_output_string, $type, $shift_length, $strand, 1);
466
}
467

468
=head2 look_for_slice_start
206✔
469

470
  Description: Finds slice_start value if it can't be found in the _return_3prime method
206✔
471
  Status     : At Risk
206✔
472

206✔
473
=cut
474

206✔
475
sub look_for_slice_start {
476
  my $self = shift;
206✔
477
  
478
  my $tv = $self->transcript_variation;
479
  my $vf = $tv->base_variation_feature;
206✔
480
  my $tr = $tv->transcript;
206✔
481
  
206✔
482
  my ($slice_start, $slice_end, $slice ) = $self->_var2transcript_slice_coords($tr, $tv, $vf);
483
  ## set new HGVS string
206✔
484
  if($slice_start)
485
  {
486
    ## add cache of seq/ pos required by c, n and p 
487
    $self->{_slice_start} = $slice_start;
488
    $self->{_slice_end}   = $slice_end;
489
    $self->{_slice}       = $slice;
490
  }
491
  return; 
492
}
493

494
=head2 clear_shifting_variables
495

496
  Description: Clears shifting variables to allow for the correct values to be 
2✔
497
               used by the OutputFactory and future analysis (i.e. Plugins)
498
  Status     : At Risk
2✔
499

2✔
500
=cut
501

2✔
502
sub clear_shifting_variables {
2✔
503
  
2✔
504
  my $self = shift;
2✔
505
  
2✔
506
  my $tv = $self->transcript_variation;
2✔
507
  my $tr ||= $tv->transcript;
2✔
508

2✔
509
  delete($tv->{cds_end});
2✔
510
  delete($tv->{cds_start});
511
  delete($tv->{_cds_coords});
2✔
512
  delete($tv->{translation_start});
513
  delete($tv->{translation_end});
1✔
514
  delete($tv->{_translation_coords});
1✔
515
  delete($tv->{cdna_start});
1✔
516
  delete($tv->{cdna_end});
1✔
517
  delete($tv->{_cdna_coords});
1✔
518
  
1✔
519
  if(defined($self->{shift_hash}))
520
  {
521
    $tv->cds_start(undef, $tr->strand * $self->{shift_hash}->{shift_length});
522
    $tv->cds_end(undef, $tr->strand * $self->{shift_hash}->{shift_length});
523
    $tv->cdna_start(undef, $tr->strand * $self->{shift_hash}->{shift_length});
524
    $tv->cdna_end(undef, $tr->strand * $self->{shift_hash}->{shift_length});
525
    $tv->translation_start(undef, $tr->strand * $self->{shift_hash}->{shift_length});
526
    $tv->translation_end(undef, $tr->strand * $self->{shift_hash}->{shift_length});
527
  }
528
}
529

530

531
=head2 initialise_unshifted_values
532

533
  Description: Populates the unshifted values at the beginning before any shifting
187✔
534
               is done
535
  Status     : Stable
187✔
536

537
=cut
187✔
538

539
sub initialise_unshifted_values {
85✔
540
  
85✔
541
  my $self = shift;
542
  
187✔
543
  my $tv = $self->transcript_variation;
544
  
84✔
545
  unless(defined($tv->{cds_end_unshifted}))
84✔
546
  {  
547
    $tv->cds_start_unshifted();
187✔
548
    $tv->cds_end_unshifted();
549
  }
85✔
550
  unless(defined($tv->{cdna_end_unshifted}))
85✔
551
  {  
552
    $tv->cdna_start_unshifted();
553
    $tv->cdna_end_unshifted();
554
  }
555
  unless(defined($tv->{translation_end_unshifted}))
556
  {  
557
    $tv->translation_start_unshifted();
558
    $tv->translation_end_unshifted();
559
  }
560
}
561

562

563
=head2 transcript_variation
564

565
  Description: Get/set the associated TranscriptVariation
3,677✔
566
  Returntype : Bio::EnsEMBL::Variation::TranscriptVariation
3,677✔
567
  Exceptions : throws if the argument is the wrong type
3,677✔
568
  Status     : Stable
569

570
=cut
571

572
sub transcript_variation {
573
    my ($self, $tv) = @_;
574
    assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation') if $Bio::EnsEMBL::Utils::Scalar::ASSERTIONS && $tv;
575
    return $self->variation_feature_overlap($tv);
576
}
577

578
=head2 variation_feature
579

580
  Description: Get the associated VariationFeature
1,167✔
581
  Returntype : Bio::EnsEMBL::Variation::VariationFeature
1,167✔
582
  Exceptions : none
583
  Status     : Stable
584

585
=cut
586

587
sub variation_feature {
588
    my $self = shift;
589
    return $self->transcript_variation->variation_feature;
590
}
591

592
=head2 affects_peptide
593

594
  Description: Check if this changes the resultant peptide sequence
595
  Returntype : boolean
10✔
596
  Exceptions : None
10✔
597
  Caller     : general
598
  Status     : At Risk
599

600
=cut
601

602
sub affects_peptide {
603
  my $self = shift;
604
  return scalar grep { $_->SO_term =~ /stop|missense|frameshift|inframe|initiator/ } @{$self->get_all_OverlapConsequences}; 
605
}
606

607
=head2 pep_allele_string
608

609
  Description: Return a '/' delimited string of the reference peptide and the 
610
               peptide resulting from this allele, or a single peptide if this
611
               allele does not change the peptide (e.g. because it is synonymous)
168✔
612
  Returntype : string or undef if this allele is not in the CDS
613
  Exceptions : none
168✔
614
  Status     : Stable
615

168✔
616
=cut
617

168✔
618
sub pep_allele_string {
619
    my ($self) = @_;
168✔
620

621
    my $pep = $self->peptide;
168✔
622
    
623
    return undef unless $pep;
624
    
625
    my $ref_pep = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide;
626

627
    return undef unless $ref_pep;
628
    
629
    return $ref_pep ne $pep ? $ref_pep.'/'.$pep : $pep;
630
}
631

632
=head2 codon_allele_string
633

634
  Description: Return a '/' delimited string of the reference codon and the 
635
               codon resulting from this allele 
1✔
636
  Returntype : string or undef if this allele is not in the CDS
637
  Exceptions : none
1✔
638
  Status     : Stable
639

1✔
640
=cut
641

1✔
642
sub codon_allele_string {
643
    my ($self) = @_;
1✔
644
    
645
    my $codon = $self->codon;
646
    
647
    return undef unless $codon;
648
    
649
    my $ref_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->codon;
650
    
651
    return $ref_codon.'/'.$codon;
652
}
653

654
=head2 display_codon_allele_string
655

656
  Description: Return a '/' delimited string of the reference display_codon and the 
657
               display_codon resulting from this allele. The display_codon identifies
658
               the nucleotides affected by this variant in UPPER CASE and other 
659
               nucleotides in lower case
129✔
660
  Returntype : string or undef if this allele is not in the CDS
661
  Exceptions : none
129✔
662
  Status     : At Risk
663

129✔
664
=cut
665

129✔
666
sub display_codon_allele_string {
129✔
667
    my ($self) = @_;
668
    
129✔
669
    my $display_codon = $self->display_codon;
670
    
129✔
671
    return undef unless $display_codon;
672
        
129✔
673
    my $ref_tva = $self->transcript_variation->get_reference_TranscriptVariationAllele;
674
    $ref_tva->{shift_hash} = $self->{shift_hash};
675
    
676
    my $ref_display_codon = $ref_tva->display_codon;
677

678
    return undef unless $ref_display_codon;
679
    
680
    return $ref_display_codon.'/'.$display_codon;
681
}
682

683
=head2 peptide
684

685
  Description: Return the amino acid sequence that this allele is predicted to result in
2,930✔
686
  Returntype : string or undef if this allele is not in the CDS or is a frameshift
687
  Exceptions : none
2,930✔
688
  Status     : Stable
689

2,930✔
690
=cut
691

1,018✔
692
sub peptide {
693
  my ($self, $peptide) = @_;
1,018✔
694

695
  $self->{peptide} = $peptide if $peptide;
1,007✔
696

697
  unless(exists($self->{peptide})) {
698

699
    $self->{peptide} = undef;
793✔
700

701
    return $self->{peptide} unless $self->seq_is_unambiguous_dna;
775✔
702

703
    if(my $codon = $self->codon) {
704

775✔
705
      # the codon method can set the peptide in some circumstances 
706
      # so check here before we try an (expensive) translation
707
      return $self->{peptide} if $self->{peptide};
775✔
708

775✔
709
      my $tv = $self->base_variation_feature_overlap;
553✔
710

711
      # for mithocondrial dna we need to to use a different codon table
712
      my $codon_table = $tv->_codon_table;
713

714
      # check the cache
715
      my $pep_cache = $main::_VEP_CACHE->{pep}->{$codon_table} ||= {};
716
      if(!($self->{is_reference} && scalar @{$tv->_seq_edits}) && ($self->{peptide} = $pep_cache->{$codon})) {
222✔
717
        return $self->{peptide};
222✔
718
      }
719
      
222✔
720
      # translate the codon sequence to establish the peptide allele
721
      
222✔
722
      # allow for partial codons - split the sequence into whole and partial
209✔
723
      # e.g. AAAGG split into AAA and GG            
724
      my $whole_codon   = substr($codon, 0, int(length($codon) / 3) * 3);
725
      my $partial_codon = substr($codon, int(length($codon) / 3) * 3);
726

727
      my $pep = '';
728
      
209✔
729
      if($whole_codon) {          
730
          my $codon_seq = Bio::Seq->new(
731
            -seq        => $whole_codon,
732
            -moltype    => 'dna',
222✔
733
            -alphabet   => 'dna',
734
            );
222✔
735
          
89✔
736
          $pep .= $codon_seq->translate(undef, undef, undef, $codon_table)->seq;
737
        }
89✔
738

739
      # apply any seq edits?
740
      my $have_edits = 0;
13✔
741

13✔
742
      if($self->{is_reference}) {
743
        my $seq_edits = $tv->_seq_edits;
744
        
13✔
745
        if(scalar @$seq_edits) {
3✔
746

3✔
747
          # get TV coords, switch if necessary
3✔
748
          my ($tv_start, $tv_end) = ($tv->translation_start, $tv->translation_end);
749
          ($tv_start, $tv_end) = ($tv_end, $tv_start) if $tv_start > $tv_end;
750
          
3✔
751
          # get all overlapping seqEdits
752
          SE: foreach my $se(grep {overlap($tv_start, $tv_end, $_->start, $_->end)} @$seq_edits) {
753
            my ($se_start, $se_end, $alt) = ($se->start, $se->end, $se->alt_seq);
754
            my $se_alt_seq_length = length($alt);
755
            $have_edits = 1;
3✔
756
            
×
757
            # loop over each overlapping pos
758
            foreach my $tv_pos(grep {overlap($_, $_, $se_start, $se_end)} ($tv_start..$tv_end)) {
759

760
              # in some cases, the sequence edit can shorten the protein
3✔
761
              # this means our TV can fall outside the range of the edited protein
762
              # therefore for safety jump out
763
              if($tv_pos - $se_start >= $se_alt_seq_length) {
764
                return $self->{peptide} = undef;
765
              }
766

222✔
767
              # apply edit, adjusting for string position
27✔
768
              substr($pep, $tv_pos - $tv_start, 1) = substr($alt, $tv_pos - $se_start, 1);
769
            }
770
          }
222✔
771
        }
772
      }
222✔
773
      
774
      if($partial_codon && $pep ne '*') {
222✔
775
        $pep .= 'X';
776
      }
777

778
      $pep ||= '-';
2,348✔
779

780
      $pep_cache->{$codon} = $pep if length($codon) <= 3 && !$have_edits;
781

782
      $self->{peptide} = $pep;
783
    }
784
  }
785

786
  return $self->{peptide};
787
}
788

789
=head2 codon
790

791
  Description: Return the codon sequence that this allele is predicted to result in
2,098✔
792
  Returntype : string or undef if this allele is not in the CDS or is a frameshift
2,098✔
793
  Exceptions : none
794
  Status     : Stable
2,098✔
795

796
=cut
1,026✔
797

798
sub codon {
1,026✔
799
  my ($self, $codon) = @_;
800
  $self->{codon} = $codon if defined $codon;
1,026✔
801

1,026✔
802
  unless(exists($self->{codon})) {
803

1,026✔
804
    $self->{codon} = undef;
805
  
1,026✔
806
    my $tv = $self->base_variation_feature_overlap;
807
    
1,026✔
808
    my $shifting_offset = 0;
219✔
809
    my $tr = $tv->transcript;
810
  
811
    $shifting_offset = (defined($self->{shift_hash}) && defined($self->{shift_hash}->{shift_length})) ? $self->{shift_hash}->{shift_length} : 0;
812

807✔
813
    my ($tv_tr_start, $tv_tr_end) = ($tv->translation_start, $tv->translation_end);
807✔
814

815
    unless($tv_tr_start && $tv_tr_end && $self->seq_is_dna) {
807✔
816
      return $self->{codon};
807✔
817
    }
807✔
818

819
    # try to calculate the codon sequence
807✔
820
    $self->shift_feature_seqs unless $shifting_offset == 0;
×
821
    my $seq = $self->feature_seq;
×
822
    $seq = '' if $seq eq '-';
×
823

824
    my $ref = $tv->get_reference_TranscriptVariationAllele;
×
825

×
826
    # Mismatches between refseq transcripts and the reference genome are tracked in transcript attributes
827
    my @edit_attrs = grep {$_->code =~ /^_rna_edit/} @{$tr->get_all_Attributes()};
828

×
829
    # check if the refseq ref allele is the same as the alt allele
×
830
    # this is an invalid variant - throw warning
831
    if(!$self->{is_reference} && scalar @edit_attrs > 0) {
832
      my $refseq_ref = $edit_attrs[0]->value;
833
      $refseq_ref =~ s/\d+\s\d+\s//;
807✔
834
      if ($refseq_ref eq $seq) {
835
        $self->{invalid_alleles} = 1;
836
      }
837
    }
807✔
838

807✔
839
    # calculate necessary coords and lengths
807✔
840
    
807✔
841
    my $codon_cds_start = $tv_tr_start * 3 - 2;
842
    my $codon_cds_end   = $tv_tr_end * 3;
12✔
843
    my $codon_len       = $codon_cds_end - $codon_cds_start + 1;
12✔
844
    unless($shifting_offset == 0)
12✔
845
    {
846
      delete($tv->{cds_end});
807✔
847
      delete($tv->{cds_start});
805✔
848
      delete($tv->{_cds_coords});
805✔
849
    }
805✔
850
    return $self->{codon} if !defined($tv->cds_end(undef, $tr->strand * $shifting_offset)) || !defined($tv->cds_start(undef, $tr->strand * $shifting_offset));
851
    my $vf_nt_len       = $tv->cds_end(undef, $tr->strand * $shifting_offset) - $tv->cds_start(undef, $tr->strand * $shifting_offset) + 1;
10✔
852
    my $allele_len      = $self->seq_length;
10✔
853
    unless($shifting_offset == 0)
10✔
854
    {
855
      delete($tv->{cds_end});
805✔
856
      delete($tv->{cds_start});
805✔
857
      delete($tv->{_cds_coords});
858
    }
859
    my $cds;
860

861
    # if (abs($allele_len - $vf_nt_len) % 3)
862
    # This is frameshift variation
863
    # The resulting codon/peptide changed needs to be calculated
864
   
865
    ## Bioperl Seq object
866
    my $cds_obj = $self->_get_alternate_cds();
111✔
867
    return undef unless defined($cds_obj);
111✔
868
    $cds = ( $self->{is_reference} ? $tv->_translateable_seq() : $cds_obj->seq() );
111✔
869

870
    # modifies the $cds at the specific positions
871
    # this is necessary for RefSeq transcripts that have edited alleles saved in @edit_attrs
872
    if($self->{is_reference} && scalar @edit_attrs > 0) {
873
      substr($cds, $tv->cds_start(undef, $tr->strand * $shifting_offset) -1, $vf_nt_len) = $seq;
694✔
874
    }
875

694✔
876
    # and extract the codon sequence
877
    my $codon = ( $self->{is_reference} ? substr($cds, $codon_cds_start-1, $codon_len ) : substr($cds, $codon_cds_start-1, $codon_len + ($allele_len - $vf_nt_len)));
878
    if (length($codon) < 1) {
879
      $self->{codon}   = '-';
805✔
880
      $self->{peptide} = '-';
881
    }
805✔
882
    else {
26✔
883
       $self->{codon} = $codon;
26✔
884
    }
885
  }
886

779✔
887
  return $self->{codon};
888
}
889

890
=head2 display_codon
1,877✔
891

892
  Description: Return the codon sequence that this allele is predicted to result in
893
               with the affected nucleotides identified in UPPER CASE and other 
894
               nucleotides in lower case
895
  Returntype : string or undef if this allele is not in the CDS or is a frameshift
896
  Exceptions : none
897
  Status     : Stable
898

899
=cut
900

901
sub display_codon {
902
  my $self = shift;
903

904
  unless(exists($self->{_display_codon})) {
905

637✔
906
    # initialise so it doesn't get called again
907
    $self->{_display_codon} = undef;
637✔
908

909
    if(my $codon = $self->codon) {
910

573✔
911
      my $display_codon = lc $self->codon;
912

573✔
913
      if(my $codon_pos = $self->transcript_variation->codon_position) {
914

353✔
915
        # if this allele is an indel then just return all lowercase
916
        if ($self->feature_seq ne '-') {
353✔
917
            
918
          # codon_position is 1-based, while substr assumes the string starts at 0          
919
          my $pos = $codon_pos - 1;
353✔
920

921
          my $len = length $self->feature_seq;
922

302✔
923
          substr($display_codon, $pos, $len) = uc substr($display_codon, $pos, $len);
924
        }
302✔
925
      }
926

302✔
927
      $self->{_display_codon} = $display_codon;
928
    }
929
  }
930

353✔
931
  return $self->{_display_codon};
932
}
933

934
=head2 polyphen_prediction
637✔
935

936
  Description: Return the qualitative PolyPhen-2 prediction for the effect of this allele.
937
               (Note that we currently only have PolyPhen predictions for variants that 
938
               result in single amino acid substitutions in human)
939
  Returntype : string (one of 'probably damaging', 'possibly damaging', 'benign', 'unknown')
940
               if this is a missense change and a prediction is available, undef
941
               otherwise
942
  Exceptions : none
943
  Status     : At Risk
944

945
=cut
946

947
sub polyphen_prediction {
948
    my ($self, $classifier, $polyphen_prediction) = @_;
949
    
950
    $classifier ||= 'humvar';
951
    
19✔
952
    my $analysis = "polyphen_${classifier}";
953
    
19✔
954
    $self->{$analysis}->{prediction} = $polyphen_prediction if $polyphen_prediction;
955
    
19✔
956
    unless (defined $self->{$analysis}->{prediction}) {
957
        my ($prediction, $score) = $self->_protein_function_prediction($analysis);
19✔
958
        $self->{$analysis}->{score} = $score;
959
        $self->{$analysis}->{prediction} = $prediction;
19✔
960
    }
7✔
961
    
7✔
962
    return $self->{$analysis}->{prediction};
7✔
963
}
964

965
=head2 polyphen_score
19✔
966

967
  Description: Return the PolyPhen-2 probability that this allele is deleterious (Note that we 
968
               currently only have PolyPhen predictions for variants that result in single 
969
               amino acid substitutions in human)
970
  Returntype : float between 0 and 1 if this is a missense change and a prediction is 
971
               available, undef otherwise
972
  Exceptions : none
973
  Status     : At Risk
974

975
=cut
976

977
sub polyphen_score {
978
    my ($self, $classifier, $polyphen_score) = @_;
979
    
980
    $classifier ||= 'humvar';
981

7✔
982
    my $analysis = "polyphen_${classifier}";
983
    
7✔
984
    $self->{$analysis}->{score} = $polyphen_score if defined $polyphen_score;
985

7✔
986
    unless (defined $self->{$analysis}->{score}) {
987
        my ($prediction, $score) = $self->_protein_function_prediction($analysis);
7✔
988
        $self->{$analysis}->{score} = $score;
989
        $self->{$analysis}->{prediction} = $prediction;
7✔
990
    }
×
991

×
992
    return $self->{$analysis}->{score};
×
993
}
994

995
=head2 sift_prediction
7✔
996

997
  Description: Return the qualitative SIFT prediction for the effect of this allele.
998
               (Note that we currently only have SIFT predictions for variants that 
999
               result in single amino acid substitutions in human)
1000
  Returntype : string (one of 'tolerated', 'deleterious') if this is a missense 
1001
               change and a prediction is available, undef otherwise
1002
  Exceptions : none
1003
  Status     : At Risk
1004

1005
=cut
1006

1007
sub sift_prediction {
1008
    my ($self, $sift_prediction) = @_;
1009
    
1010
    $self->{sift_prediction} = $sift_prediction if $sift_prediction;
1011
    
23✔
1012
    unless (defined $self->{sift_prediction}) {
1013
        my ($prediction, $score) = $self->_protein_function_prediction('sift');
23✔
1014
        $self->{sift_score} = $score;
1015
        $self->{sift_prediction} = $prediction unless $self->{sift_prediction};
23✔
1016
    }
8✔
1017
    
8✔
1018
    return $self->{sift_prediction};
8✔
1019
}
1020

1021
=head2 sift_score
23✔
1022

1023
  Description: Return the SIFT score for this allele (Note that we currently only have SIFT 
1024
               predictions for variants that result in single amino acid substitutions in human)
1025
  Returntype : float between 0 and 1 if this is a missense change and a prediction is 
1026
               available, undef otherwise
1027
  Exceptions : none
1028
  Status     : At Risk
1029

1030
=cut
1031

1032
sub sift_score {
1033
    my ($self, $sift_score) = @_;
1034

1035
    $self->{sift_score} = $sift_score if defined $sift_score;
1036

8✔
1037
    unless (defined $self->{sift_score}) {
1038
        my ($prediction, $score) = $self->_protein_function_prediction('sift');
8✔
1039
        $self->{sift_score} = $score;
1040
        $self->{sift_prediction} = $prediction;
8✔
1041
    }
×
1042

×
1043
    return $self->{sift_score};
×
1044
}
1045

1046
=head2 cadd_prediction
8✔
1047

1048
  Description: Return the qualitative CADD prediction for the effect of this allele.
1049
               (Note that we currently only have predictions for variants that 
1050
               result in single amino acid substitutions in human)
1051
  Returntype : string (one of 'likely benign', 'likely deleterious') if this is a missense 
1052
               change and a prediction is available, undef otherwise. Predictions
1053
               are assigned based on CADD PHRED scores. CADD PHRED scores greater or
1054
               equal to 15 are considered likely deleterious.  
1055
  Exceptions : none
1056
  Status     : At Risk
1057

1058
=cut
1059

1060
sub cadd_prediction {
1061
  my ($self, $cadd_prediction) = @_;
1062
  return $self->_prediction('cadd_prediction', $cadd_prediction);
1063
}
1064

1✔
1065
=head2 dbnsfp_revel_prediction
1✔
1066

1067
  Description: Return the qualitative REVEL prediction for the effect of this allele.
1068
               (Note that we currently only have predictions for variants that 
1069
               result in single amino acid substitutions in human)
1070
  Returntype : string (one of 'likely_disease_causing', 'likely_not_disease_causing')
1071
               if this is a missense change and a prediction is available, undef otherwise.
1072
               We chose 0.5 as the threshold to assign the predictions. From the REVEL paper:
1073
               For example, 75.4% of disease mutations but only 10.9% of neutral variants
1074
               have a REVEL score above 0.5, corresponding to a sensitivity of 0.754 and
1075
               specificity of 0.891. 
1076
  Exceptions : none
1077
  Status     : At Risk
1078

1079
=cut
1080

1081
sub dbnsfp_revel_prediction {
1082
  my ($self, $dbnsfp_revel_prediction) = @_;
1083
  return $self->_prediction('dbnsfp_revel_prediction', $dbnsfp_revel_prediction);
1084
}
1085

1✔
1086
=head2 dbnsfp_meta_lr_prediction
1✔
1087

1088
  Description: Return the qualitative MetaLR prediction for the effect of this allele.
1089
               (Note that we currently only have predictions for variants that 
1090
               result in single amino acid substitutions in human)
1091
  Returntype : string (one of 'tolerated', 'damaging').
1092
               The score cutoff between "D" and "T" is 0.5.
1093
  Exceptions : none
1094
  Status     : At Risk
1095

1096
=cut
1097

1098
sub dbnsfp_meta_lr_prediction {
1099
  my ($self, $dbnsfp_meta_lr_prediction) = @_;
1100
  return $self->_prediction('dbnsfp_meta_lr_prediction', $dbnsfp_meta_lr_prediction);
1101
}
1102

1✔
1103
=head2 dbnsfp_mutation_assessor_prediction
1✔
1104

1105
  Description: Return the qualitative MutationAssessor prediction for the effect of this allele.
1106
               (Note that we currently only have predictions for variants that 
1107
               result in single amino acid substitutions in human)
1108
  Returntype : string (one of 'high', 'medium', 'low', 'neutral').
1109
  Exceptions : none
1110
  Status     : At Risk
1111

1112
=cut
1113

1114
sub dbnsfp_mutation_assessor_prediction {
1115
  my ($self, $dbnsfp_mutation_assessor_prediction) = @_;
1116
  return $self->_prediction('dbnsfp_mutation_assessor_prediction', $dbnsfp_mutation_assessor_prediction);
1117
}
1118

1✔
1119
=head2 _prediction
1✔
1120

1121
  Description: Return prediction for specified score type.
1122
  Returntype : float 
1123
  Exceptions : none
1124
  Status     : At Risk
1125

1126
=cut
1127

1128
sub _prediction {
1129
  my ($self, $prediction_type, $prediction) = @_;
1130
  $self->{$prediction_type} = $prediction if $prediction;
1131

1132
  unless (defined $self->{$prediction_type}) {
4✔
1133
    my $analysis = $prediction_type;
4✔
1134
    $analysis =~ s/_prediction//;
1135
    my ($prediction, $score) = $self->_protein_function_prediction($analysis);
4✔
1136
    my $score_type = $prediction_type;
4✔
1137
    $score_type =~ s/_prediction/_score/;
4✔
1138
    $self->{$score_type} = $score;
4✔
1139
    $self->{$prediction_type} = $prediction;
4✔
1140
  }
4✔
1141
  return $self->{$prediction_type};
4✔
1142
}
4✔
1143

1144
=head2 cadd_score
4✔
1145

1146
  Description: Return the CADD PHRED score for this allele.
1147
  Returntype : float if this is a missense change and a prediction is available, undef otherwise
1148
  Exceptions : none
1149
  Status     : At Risk
1150

1151
=cut
1152

1153
sub cadd_score {
1154
  my ($self, $cadd_score) = @_;
1155
  return $self->_score('cadd_score');
1156
}
1157

1✔
1158
=head2 dbnsfp_revel_score
1✔
1159

1160
  Description: Return the REVEL score for this allele. The score is retrieved from dbNSFP. (We only
1161
               have predictions for variants that result in single amino acid substitutions in human)
1162
  Returntype : float between 0 and 1 if this is a missense change and a prediction is 
1163
               available, undef otherwise
1164
  Exceptions : none
1165
  Status     : At Risk
1166

1167
=cut
1168

1169
sub dbnsfp_revel_score {
1170
  my ($self, $dbnsfp_revel_score) = @_;
1171
  return $self->_score('dbnsfp_revel_score');
1172
}
1173

1✔
1174
=head2 dbnsfp_meta_lr_score
1✔
1175

1176
  Description: Return the MetaLR score for this allele. The score is retrieved from dbNSFP. (We only
1177
               have predictions for variants that result in single amino acid substitutions in human)
1178
  Returntype : float if this is a missense change and a prediction is 
1179
               available, undef otherwise
1180
  Exceptions : none
1181
  Status     : At Risk
1182

1183
=cut
1184

1185
sub dbnsfp_meta_lr_score {
1186
  my ($self, $dbnsfp_meta_lr_score) = @_;
1187
  return $self->_score('dbnsfp_meta_lr_score', $dbnsfp_meta_lr_score);
1188
}
1189

1✔
1190
=head2 dbnsfp_mutation_assessor_score
1✔
1191

1192
  Description: Return the MutationAssessor score for this allele. The score is retrieved from dbNSFP. (We only
1193
               have predictions for variants that result in single amino acid substitutions in human)
1194
  Returntype : float if this is a missense change and a prediction is 
1195
               available, undef otherwise
1196
  Exceptions : none
1197
  Status     : At Risk
1198

1199
=cut
1200

1201
sub dbnsfp_mutation_assessor_score {
1202
  my ($self, $dbnsfp_mutation_assessor_score) = @_;
1203
  return $self->_score('dbnsfp_mutation_assessor_score', $dbnsfp_mutation_assessor_score);
1204
}
1205

1✔
1206

1✔
1207
=head2 _score
1208

1209
  Description: Return score for specified score type.
1210
  Returntype : float 
1211
  Exceptions : none
1212
  Status     : At Risk
1213

1214
=cut
1215

1216
sub _score {
1217
  my ($self, $score_type, $score) = @_; 
1218
  $self->{$score_type} = $score if defined $score;
1219

1220
  unless (defined $self->{$score_type}) {
4✔
1221
      my $analysis = $score_type;
4✔
1222
      $analysis =~ s/_score//;
1223
      my ($prediction, $score) = $self->_protein_function_prediction($analysis);
4✔
1224
      my $prediction_type = $score_type;
×
1225
      $prediction_type =~ s/_score/_prediction/;
×
1226
      $self->{$score_type} = $score;
×
1227
      $self->{$prediction_type} = $prediction;
×
1228
  }
×
1229
  return $self->{$score_type};
×
1230
}
×
1231

1232
sub _protein_function_prediction {
4✔
1233
    my ($self, $analysis) = @_;
1234

1235
    # we can only get results for variants that cause a single amino acid substitution, 
1236
    # so check the peptide allele string first
19✔
1237

1238
    if ($self->pep_allele_string && $self->pep_allele_string =~ /^[A-Z]\/[A-Z]$/ && defined $AA_LOOKUP->{$self->peptide}) {
1239
        if (my $matrix = $self->transcript_variation->_protein_function_predictions($analysis)) {
1240
          
1241
            # temporary fix - check $matrix is not an empty hashref
19✔
1242
            if(ref($matrix) && ref($matrix) eq 'Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix') {
19✔
1243
            
1244
                my ($prediction, $score) = $matrix->get_prediction(
1245
                    $self->transcript_variation->translation_start,
19✔
1246
                    $self->peptide,
1247
                );
19✔
1248

1249
                return wantarray ? ($prediction, $score) : $prediction;
1250
            }
1251
        }
1252
    }
19✔
1253
    
1254
    return undef;
1255
}
1256

1257
=head2 shift_feature_seqs
×
1258

1259
  Description: Updates the feature_seq and variation_feature_seq values associated with this TVA to be shifted
1260
  Returntype : string 
1261
  Exceptions : none
1262
  Status     : At Risk
1263

1264
=cut
1265

1266
sub shift_feature_seqs {
1267
    my $self = shift;
1268
    if(defined($self->{shift_hash}) && !$self->{shifted_feature_seqs})
1269
    {
1270
      my $vf_seq = $self->{variation_feature_seq};
17✔
1271
      my $f_seq = $self->{feature_seq};
17✔
1272
      
1273
      my $shift_length = $self->{shift_hash}->{shift_length} ||= 0;
7✔
1274
      $shift_length = $self->seq_length - $shift_length if $self->transcript->strand == -1;
7✔
1275
      for (my $n = 0; $n < $shift_length; $n++ ){
1276
        ## check each position in deletion/ following seq for match
7✔
1277
        $vf_seq = substr($vf_seq, 1) . substr($vf_seq, 0, 1) if defined($vf_seq);
7✔
1278
        $f_seq = substr($f_seq, 1) . substr($f_seq, 0, 1) if defined($f_seq);
7✔
1279
      } 
1280
      $self->variation_feature_seq($vf_seq);
2✔
1281
      $self->feature_seq($f_seq);
2✔
1282
      $self->{shifted_feature_seqs} = 1;
1283
    }   
7✔
1284
}
7✔
1285

7✔
1286

1287
=head2 hgvs_genomic
1288

1289
  Description: Return a string representing the genomic-level effect of this allele in HGVS format
1290
  Returntype : string 
1291
  Exceptions : none
1292
  Status     : At Risk
1293

1294
=cut
1295

1296
sub hgvs_genomic {
1297
    return _hgvs_generic(@_,'genomic');
1298
}
1299

1300

6✔
1301
=head2 hgvs_transcript
1302

1303
  Description: Return a string representing the CDS-level effect of this allele in HGVS format
1304
  Returntype : string or undef if this allele is not in the CDS
1305
  Exceptions : none
1306
  Status     : At Risk
1307

1308
=cut
1309
    
1310
    
1311
sub hgvs_transcript {
1312
  my $self = shift;
1313
  my $notation = shift;
1314
  my $no_shift = shift;
1315
  
327✔
1316
  ##### set if string supplied
327✔
1317
  $self->{hgvs_transcript} = $notation   if defined $notation;
327✔
1318

1319
  ##### return if held 
1320
  return $self->{hgvs_transcript}        if defined $self->{hgvs_transcript};
327✔
1321

1322
  ## This is cached to allow long form HGVS to be created
1323
  ## Set as undefined here to avoid re-calling if hgvs_transcript annotation not possible
327✔
1324
  $self->{hgvs_t_ref} = undef;
1325

1326
  my $tv = $self->base_variation_feature_overlap;
1327

206✔
1328
  ### don't attempt to recalculate if field is NULL from DB
1329
  return $self->{hgvs_transcript}        if exists $self->{hgvs_transcript} && defined($tv->dbID);
206✔
1330

1331

1332
  ### don't try to handle odd characters
206✔
1333
  return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig;
1334

1335
  ### no result for reference allele
1336
  return undef if $self->is_reference == 1;
206✔
1337

1338
  ### else evaluate
1339
  my $tr = $tv->transcript;
206✔
1340
  my $tr_stable_id = $tr->stable_id;
1341
  my $vf = $tv->base_variation_feature;
1342
    
206✔
1343
  ### get reference sequence strand
206✔
1344
  my $refseq_strand = $tr->strand();
206✔
1345

1346
  my $var_name = $vf->variation_name();
1347

206✔
1348
  if($DEBUG ==1){    
1349
          print "\nHGVS transcript: Checking ";
206✔
1350
          print " var_name $var_name " if defined $var_name ;
1351
          print " refseq strand => $refseq_strand"  if defined $refseq_strand;
206✔
1352
          print " seq name : " . $tr_stable_id  
×
1353
              if defined $tr_stable_id ;
×
1354
          print " var strand " . $vf->strand()  
×
1355
              if defined $vf->strand();
×
1356
          print " vf st " . $vf->strand()   
1357
                  if defined $vf->strand()  ;
×
1358
          print " seqname: " . $vf->seqname()  ." seq: " . $self->variation_feature_seq  
1359
              if defined  $vf->seqname();
×
1360
          print "\n";
1361
  }
×
1362

1363
  my $hgvs_notation ; ### store components of HGVS string in hash
×
1364

1365
  my $variation_feature_sequence;
1366
  my $adaptor_shifting_flag = 1;
206✔
1367
  ## Check previous shift_hgvs_variants_3prime flag and act accordingly
1368
  if (defined($vf->adaptor) && defined($vf->adaptor->db)) {
206✔
1369
    $adaptor_shifting_flag = $vf->adaptor->db->shift_hgvs_variants_3prime();
206✔
1370
  }
1371
  elsif(defined($Bio::EnsEMBL::Variation::DBSQL::DBAdaptor::DEFAULT_SHIFT_HGVS_VARIANTS_3PRIME)){
206✔
1372
    $adaptor_shifting_flag = $Bio::EnsEMBL::Variation::DBSQL::DBAdaptor::DEFAULT_SHIFT_HGVS_VARIANTS_3PRIME;
196✔
1373
  }
1374
  
1375
  my $hash_already_defined = defined($self->{shift_hash}); 
1✔
1376
  ## Perform HGVS shift even if no_shift is on
1377
  $self->_return_3prime(1) unless ($adaptor_shifting_flag == 0);
1378

206✔
1379
  $variation_feature_sequence = $self->variation_feature_seq();
1380
  $variation_feature_sequence = $self->{shift_hash}->{hgvs_allele_string} if defined($self->{shift_hash}) && $vf->var_class() eq 'insertion' && ($adaptor_shifting_flag != 0);
206✔
1381
 
1382
  my $offset_to_add = defined($self->{shift_hash}) ? $self->{shift_hash}->{_hgvs_offset} : 0;# + ($no_shift ? 0 : (0 - $self->{_hgvs_offset}) );
206✔
1383
  $self->{_hgvs_offset} = $offset_to_add;
206✔
1384
  ## delete the shifting hash if we generated it for HGVS calculations
1385
  delete($self->{shift_hash}) unless $hash_already_defined;
206✔
1386
 
206✔
1387
  ## return if a new transcript_variation_allele is not available - variation outside transcript
1388
  return undef unless defined $self->base_variation_feature_overlap;
206✔
1389
  $self->look_for_slice_start unless (defined  $self->{_slice_start});
1390
  
1391
  unless (defined  $self->{_slice_start} ){
206✔
1392
          print "Exiting hgvs_transcript: no slice start position for $var_name in trans" . $tr_stable_id . "\n " if $DEBUG == 1 ;
206✔
1393
          return undef;
1394
  }
206✔
1395
  ## this may be different to the input one for insertions/deletions
×
1396
    print "vfs: $variation_feature_sequence &  $self->{_slice_start} -> $self->{_slice_end}\n" if $DEBUG ==1;
×
1397

1398
  my $lookup_order = 1;
1399
  if($variation_feature_sequence && $vf->strand != $refseq_strand) {
206✔
1400
    reverse_comp(\$variation_feature_sequence);
206✔
1401
    $lookup_order = -1 if $adaptor_shifting_flag == 0;
53✔
1402
  };
53✔
1403
  ## delete consequences if we have an offset. This is only in here for when we want HGVS to shift but not consequences.
1404
  ## TODO add no_shift flag test
1405
  delete($self->{_predicate_cache}) if $self->transcript_variation->{shifted} && $offset_to_add != 0; 
1406
  print "sending alt: $variation_feature_sequence &  $self->{_slice_start} -> $self->{_slice_end} for formatting\n" if $DEBUG ==1;
1407

×
1408
  return undef if (($self->{_slice}->end - $self->{_slice}->start + 1) < ($self->{_slice_end} + $offset_to_add));
1409
  #return undef if (length($self->{_slice}->seq()) < ($self->{_slice_end} + $offset_to_add));
1410
  $hgvs_notation = hgvs_variant_notation(
1411
    $variation_feature_sequence,    ### alt_allele,
1412
    $self->{_slice}->seq(),                             ### using this to extract ref allele
1413
    $self->{_slice_start} + $offset_to_add,
206✔
1414
    $self->{_slice_end} + $offset_to_add,
206✔
1415
    "",
1416
    "",
206✔
1417
    $var_name,
1418
    $lookup_order
1419
  );
206✔
1420

206✔
1421
  ### This should not happen
1422
  unless($hgvs_notation->{'type'}){
1423
    #warn "Error - not continuing; no HGVS annotation\n";
1424
    return undef;
1425
  } 
1426

1427
  ## check for the same bases in ref and alt strings before or after the variant
1428
  $hgvs_notation = _clip_alleles($hgvs_notation) unless $hgvs_notation->{'type'} eq 'dup';
1429

1430
  print "hgvs transcript type : " . $hgvs_notation->{'type'} . "\n" if $DEBUG == 1;    
1431
  print "Got type: " . $hgvs_notation->{'type'} ." $hgvs_notation->{'ref'} -> $hgvs_notation->{'alt'}\n" if $DEBUG == 1;
1432

206✔
1433
  ### create reference name - transcript name & seq version
1434
  my $stable_id = $tr_stable_id;
×
1435
  $stable_id .= "." . $tr->version() 
1436
     unless (!defined $tr->version() || $stable_id =~ /\.\d+$/ || $stable_id =~ /LRG/); ## no version required for LRG's
1437
  $hgvs_notation->{'ref_name'} = $stable_id;
1438

206✔
1439
  ### get position relative to transcript features [use HGVS coords not variation feature coords due to dups]
1440
  # avoid doing this twice if start and end are the same
206✔
1441
  my $same_pos = $hgvs_notation->{start} == $hgvs_notation->{end};
206✔
1442
  
1443
  ## Mismatches between refseq transcripts and the reference genome are tracked in transcript attributes
1444
  my @edit_attrs = grep {$_->code =~ /^_rna_edit/} @{$tr->get_all_Attributes()};
206✔
1445

206✔
1446
  if(scalar @edit_attrs > 0) {
1447
    my $ref = $tv->get_reference_TranscriptVariationAllele;
206✔
1448
    $hgvs_notation->{ref} = $ref->feature_seq; # ref allele relative to the feature (transcript)
1449
    $hgvs_notation->{alt} = $self->feature_seq; # alt allele relative to the feature (transcript)
1450
  }
1451

206✔
1452
  my $misalignment_offset = 0;
1453
  $misalignment_offset = $self->get_misalignment_offset(\@edit_attrs) if (scalar(@edit_attrs) && (substr($tr->stable_id, 0,3) eq 'NM_' || substr($tr->stable_id, 0,3) eq 'XM_'));
1454
  
206✔
1455
  if ($vf->var_class eq 'SNP' && defined($self->{pre_consequence_predicates}) && $self->{pre_consequence_predicates}->{exon} && defined($tv->cds_start) && defined($tv->cds_end)) {
206✔
1456
    $hgvs_notation->{start} = $tv->cds_start;
1457
    $hgvs_notation->{end}   = $hgvs_notation->{start};
206✔
1458
  }
1459
  else{
206✔
1460
    $hgvs_notation->{start} = $self->_get_cDNA_position( $hgvs_notation->{start} + $misalignment_offset);
1✔
1461
    $hgvs_notation->{end}   = $same_pos ? $hgvs_notation->{start} : $self->_get_cDNA_position( $hgvs_notation->{end} + $misalignment_offset );
1✔
1462
  }
1463
  return undef unless defined  $hgvs_notation->{start}  && defined  $hgvs_notation->{end} ;
1464

1✔
1465
  # Make sure that start is always less than end
1✔
1466
  my ($exon_start_coord, $intron_start_offset) = $hgvs_notation->{start} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/;
1✔
1467
  my ($exon_end_coord,   $intron_end_offset)   = $same_pos ? ($exon_start_coord, $intron_start_offset) : $hgvs_notation->{end} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/;
1468
  $intron_start_offset ||= 0;
1✔
1469
  $intron_end_offset   ||= 0;
×
1470
  print "pre pos sort : $hgvs_notation->{start},$hgvs_notation->{end}\n" if $DEBUG ==1;
×
1471
  ($hgvs_notation->{start},$hgvs_notation->{end}) = ($hgvs_notation->{end},$hgvs_notation->{start}) if (
×
1472
    (($exon_start_coord  > $exon_end_coord) || ($exon_start_coord  == $exon_end_coord && $intron_start_offset > $intron_end_offset) ) &&
1473
    $hgvs_notation->{end} !~/\*/
1474
  );
1✔
1475
  print "post pos sort: $hgvs_notation->{start},$hgvs_notation->{end}\n" if $DEBUG ==1;
1✔
1476

1477
  # save these to be able to report exon coordinates and intron distances
1478
  $self->{_hgvs_exon_start_coordinate} = $exon_start_coord;
1479
  $self->{_hgvs_intron_start_offset} = $intron_start_offset;
1480
  $self->{_hgvs_exon_end_coordinate} = $exon_end_coord;
206✔
1481
  $self->{_hgvs_intron_end_offset} = $intron_end_offset;
206✔
1482

1483

206✔
1484
  if($tr->cdna_coding_start()){
13✔
1485
    $hgvs_notation->{'numbering'} = "c";  ### set 'c' if transcript is coding 
13✔
1486
  }    
1487
  else{
1488
    $hgvs_notation->{'numbering'} = "n";  ### set 'n' if transcript non-coding 
193✔
1489
  }
193✔
1490

1491
  ### generic formatting 
206✔
1492
  print "pre-format $hgvs_notation->{alt}\n" if $DEBUG ==1;
1493

1494
  $self->{hgvs_transcript} = format_hgvs_string( $hgvs_notation);
206✔
1495
  if($DEBUG ==1){ print "HGVS notation: " . $self->{hgvs_transcript} . " \n"; }
206✔
1496

206✔
1497
  ## save the HGVS style reference sequence in case the other long form of HGVS is required
206✔
1498
  $self->{hgvs_t_ref} = ($hgvs_notation->{type} eq 'dup') ? $hgvs_notation->{alt} :  $hgvs_notation->{ref};
206✔
1499

206✔
1500
  return $self->{hgvs_transcript}; 
1501
}
1502

1503
=head2 is_polyA
206✔
1504

1505
  Description: Tests RefSeq transcript attribute to see if it's a poly-A tail
1506
  Returntype : boolean value
206✔
1507
  Status     : Stable
206✔
1508

206✔
1509
=cut
206✔
1510

1511
sub is_polyA {
1512
  my $self = shift;
206✔
1513
  my $attr = shift;
192✔
1514
  my @spl_value = split(/ /, $attr->{value});
1515
  if((scalar(@spl_value) eq 3) && (length($self->transcript->seq->seq) <= 1 + (length($spl_value[2]) + $spl_value[0])))
1516
  {
14✔
1517
      return 1;
1518
  }
1519
  return 0;
1520
}
206✔
1521

1522
=head2 get_misalignment_offset
206✔
1523

206✔
1524
  Description: Calculate total offset created by RefSeq bam alignment 
1525
  Returntype : scalar value
1526
  Status     : Stable
206✔
1527

1528
=cut
206✔
1529

1530
sub get_misalignment_offset {
1531
  my $self = shift;
1532
  my $attrs = shift;
1533
  my $mlength = 0;
1534
  
1535
  ## For each refseq edit transcript attribute given, isolate the insertions and deletions found 
1536
  ## before the given variant and sum their lengths
1537
  foreach my $attr (@{$attrs})
1538
  {
1539
    next if defined($attr->{description}) && $attr->description =~ /op=X/;
1540
    ## Find out whether insertion or deletion, and get length  
×
1541
    my @split_val = split(/ /,$attr->{value});
×
1542
    next if (scalar(@split_val) eq 3) && ($split_val[1] - $split_val[0] + 1) eq length($split_val[2]);
×
1543
    my $type = (scalar(@split_val) eq 3) ? 'ins' : 'del';
×
1544
    my $var_location_start = $self->transcript_variation->cdna_start;
1545
    $var_location_start = 0 unless defined($var_location_start);
×
1546
    next if $var_location_start < $split_val[0];
1547

×
1548
    if ($type eq 'del')
1549
    {
1550
      $mlength += -1 - ($split_val[1] - $split_val[0]);
1551
    }
1552
    else{
1553
      $mlength += length($split_val[2]);
1554
    }
1555
  }
1556
   
1557
  $self->{refseq_misalignment_offset} = $mlength;
1558
  return $mlength;
1559
  
2✔
1560
}
2✔
1561

2✔
1562

1563
=head2 hgvs_transcript_reference
1564

1565
  Description: Return a string representing the reference sequence as could be used in HGVS notation
2✔
1566
               Useful for deletions where the recommended format formerly used the reference sequence
1567
               but no longer does.
2✔
1568
  Returntype : string or undef if no reference allele is available.
1569
  Exceptions : none
2✔
1570
  Status     : At Risk
2✔
1571

2✔
1572
=cut
2✔
1573

2✔
1574
sub hgvs_transcript_reference {
2✔
1575

1576
  my $self = shift;
2✔
1577

1578
  $self->hgvs_transcript() unless exists $self->{hgvs_t_ref};
×
1579
  return $self->{hgvs_t_ref};
1580

1581
}
2✔
1582

1583

1584
=head2 hgvs_protein
1585

2✔
1586
  Description: Return a string representing the protein-level effect of this allele in HGVS format
2✔
1587
  Returntype : string or undef if this allele is not in the CDS 
1588
  Exceptions : none
1589
  Status     : At Risk
1590
  
1591
=cut
1592

1593
sub hgvs_protein {
1594
  my $self     = shift;
1595
  my $notation = shift;
1596
  my $prediction_format = shift;
1597
  my $convert_to_three_letter = shift;
1598
  my $hgvs_notation; 
1599

1600
  if($DEBUG == 1){
1601
    print "\nStarting hgvs_protein with "; 
1602
    print " var: " . $self->transcript_variation->variation_feature->variation_name() 
1603
      if defined $self->transcript_variation->variation_feature->variation_name() ;
1604
    print   " trans: " . $self->transcript_variation->transcript->display_id() 
46✔
1605
      if defined $self->transcript_variation->transcript->display_id() ;
1606
    print "\n";
46✔
1607
  }
46✔
1608

1609
  ### set if string supplied
1610
  $self->{hgvs_protein} = $notation  if defined $notation;
1611

1612

1613
  ### return if set
1614
  return $self->{hgvs_protein}       if defined $self->{hgvs_protein} ;
1615
  
1616
  ### don't attempt to recalculate if field is NULL from DB
1617
  return $self->{hgvs_protein}       if exists $self->{hgvs_protein} && defined($self->transcript_variation->dbID);
1618
  
1619
  ### don't try to handle odd characters
1620
  return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig;
1621

1622
  ### no HGVS annotation for reference allele
136✔
1623
  return undef if $self->is_reference();
136✔
1624
  print "checking pos with hgvs prot\n" if $DEBUG ==1;
136✔
1625

136✔
1626
  ## HGVS requires the variant be located in as 3' position as possible
136✔
1627

1628
  ## return if a new transcript_variation_allele is not available - variation outside transcript
136✔
1629
  my $tv = $self->base_variation_feature_overlap;
×
1630
  return undef unless defined $tv;
×
1631

1632
  my $vf = $tv->base_variation_feature;
×
1633
  my $tr          = $tv->transcript;
1634

×
1635
  my $adaptor_shifting_flag = 1;
1636
  ## Check previous shift_hgvs_variants_3prime flag and act accordingly
1637
  if (defined($vf->adaptor) && defined($vf->adaptor->db)) {
1638
    $adaptor_shifting_flag = $vf->adaptor->db->shift_hgvs_variants_3prime();
136✔
1639
  }
1640
  elsif(defined($Bio::EnsEMBL::Variation::DBSQL::DBAdaptor::DEFAULT_SHIFT_HGVS_VARIANTS_3PRIME)){
1641
    $adaptor_shifting_flag = $Bio::EnsEMBL::Variation::DBSQL::DBAdaptor::DEFAULT_SHIFT_HGVS_VARIANTS_3PRIME;
1642
  }
136✔
1643
  
1644
  ## Check to see if the shift_hash is already defined, allowing us to remove it from associated $tva objects when we only want to shift HGVS
1645
  my $hash_already_defined = defined($self->{shift_hash});
135✔
1646
  ## Perform HGVS shift even if no_shift is on - only prevent shifting if shift_hgvs_variants_3prime() has been switched off.
1647
  $self->_return_3prime(1) unless ($adaptor_shifting_flag == 0);
1648

135✔
1649

1650
  my $pre         = $self->_pre_consequence_predicates;
1651
  my $shifting_offset = 0;
135✔
1652
  if($tr->strand() > 0) {
135✔
1653
    $shifting_offset = (defined($self->{shift_hash}) && defined($self->{shift_hash}->{shift_length})) ? $self->{shift_hash}->{shift_length} : 0;
1654
  }
1655
  elsif($tr->strand < 0) {
1656
    $shifting_offset = (defined($self->{shift_hash}) && defined($self->{shift_hash}->{shift_length})) ? 0 - $self->{shift_hash}->{shift_length} : 0;
1657
  }
135✔
1658

135✔
1659
  ### no HGVS protein annotation for variants outside translated region 
1660
  if(defined($self->{shift_hash}) && defined($self->{shift_hash}->{shift_length}) && $self->{shift_hash}->{shift_length} != 0)
135✔
1661
  {
135✔
1662
    delete($tv->{translation_start});
1663
    delete($tv->{translation_end});
135✔
1664
    delete($tv->{_translation_coords});
1665
  }
135✔
1666
  unless (
126✔
1667
    ($pre->{coding}) &&
1668
    $tv->translation_start(undef, $shifting_offset) && 
1669
    $tv->translation_end(undef, $shifting_offset)
×
1670
  ){
1671
    print "Exiting hgvs_protein - variant " . $vf->variation_name() . "not within translation\n"  if $DEBUG == 1;
1672
    delete($self->{shift_hash}) unless $hash_already_defined;
1673
    return undef;
135✔
1674
  }
1675
       
135✔
1676
  print "proceeding with hgvs prot\n" if $DEBUG == 1;
1677
  print "Checking translation start: " . $tv->translation_start() ."\n" if $DEBUG == 1;
1678

135✔
1679
  ## checks complete - start building term
135✔
1680

135✔
1681
  ### get reference sequence
70✔
1682
  ### this is a temporary fix
1683
  if($tr->stable_id =~ /^ENS|^LRG/ || !defined($tr->analysis) || (defined($tr->analysis) && !defined($tr->analysis->db()))){
1684
    $hgvs_notation->{ref_name} = $tr->translation->display_id();
65✔
1685
  }
1686
  else{
1687
    ### get RefSeq identifiers
1688
    my @entries = grep {$_->{dbname} eq 'GenBank'} @{$tr->translation->get_all_DBEntries};
135✔
1689
    if(scalar @entries == 1){
1690
      $hgvs_notation->{ref_name} = $entries[0]->{primary_id};
6✔
1691
    }
6✔
1692
  }
6✔
1693

1694
  # Add seq version unless LRG 
135✔
1695
  $hgvs_notation->{ref_name} .= "." . $tr->translation->version() 
1696
    unless (!defined $tr->translation->version() || $hgvs_notation->{ref_name}=~ /\.\d+$/ || $hgvs_notation->{ref_name} =~ /LRG/ || $self->{remove_hgvsp_version});
1697

1698
  $hgvs_notation->{'numbering'} = 'p';
1699

2✔
1700
  ### get default reference location [changed later in some cases eg. duplication]
2✔
1701
  $hgvs_notation->{start}   = $tv->translation_start();
2✔
1702
  $hgvs_notation->{end}     = $tv->translation_end();  
1703

1704
  my $ref = $tv->get_reference_TranscriptVariationAllele;
133✔
1705

133✔
1706
  ## Incase the user wants shifted HGVS but not shifted consequences, we run the shifting method  
1707
  my $ref_hash_already_defined = defined($ref->{shift_hash});
1708
  $ref->_return_3prime(1) unless $ref_hash_already_defined;
1709
  ## get default reference & alt peptides  [changed later to hgvs format]
1710
  if(defined($self->{shift_hash}) && defined($self->{shift_hash}->{shift_length})  && $self->{shift_hash}->{shift_length} != 0) {
1711
    delete($self->{peptide});
133✔
1712
    delete($self->{codon});
133✔
1713
    delete($self->{feature_seq});
1714
    $self->shift_feature_seqs();
1715

1716
    if(defined($ref->{shift_hash})) {
×
1717
      delete($ref->{peptide});
×
1718
      delete($ref->{codon});
×
1719
      delete($ref->{feature_seq});
1720

1721
      $ref->{variation_feature_seq} = $self->{shift_hash}->{ref_orig_allele_string};
1722
      $ref->{variation_feature_seq} = $self->{shift_hash}->{shifted_allele_string} if $vf->var_class eq 'deletion';
1723
      $ref->{shifted_feature_seqs} = 1;
133✔
1724
    }
1725
  }
1726
  
133✔
1727
  $hgvs_notation->{alt} = $self->peptide;
1728

1729
  $hgvs_notation->{ref} = $ref->peptide; 
133✔
1730
  
133✔
1731
  ## delete the shifting hash if we generated it for HGVS calculations
1732
  delete($self->{shift_hash}) unless $hash_already_defined;
133✔
1733
  delete($ref->{shift_hash}) unless $ref_hash_already_defined;
1734

1735
  return undef unless $hgvs_notation->{ref};   
133✔
1736
  print "Got protein peps: $hgvs_notation->{ref} =>  $hgvs_notation->{alt} (" . $self->codon() .")\n" if $DEBUG ==1;
133✔
1737

1738
  if(defined $hgvs_notation->{alt} && defined $hgvs_notation->{ref} &&
133✔
1739
    ($hgvs_notation->{alt} ne  $hgvs_notation->{ref})){
5✔
1740
    $hgvs_notation = _clip_alleles( $hgvs_notation);
5✔
1741
  }
5✔
1742

5✔
1743
  #### define type - types are different for protein numbering
1744
  $hgvs_notation  = $self->_get_hgvs_protein_type($hgvs_notation);
5✔
1745
  return undef unless defined $hgvs_notation->{type}; 
5✔
1746

5✔
1747
  $convert_to_three_letter = 1 unless defined $convert_to_three_letter;
5✔
1748
  ##### Convert ref & alt peptides taking into account HGVS rules
1749
  $hgvs_notation = $self->_get_hgvs_peptides($hgvs_notation, $convert_to_three_letter);
5✔
1750

5✔
1751
  unless($hgvs_notation) {
5✔
1752
    $self->{hgvs_protein} = undef;
1753
    return undef;
1754
  }
1755

133✔
1756
  ##### String formatting
1757
  return $self->_get_hgvs_protein_format($hgvs_notation, $prediction_format, $convert_to_three_letter);
133✔
1758
}
1759

1760

133✔
1761
=head2 hgvs_offset
133✔
1762

1763
  Description: Return the number of bases the variant was shifted 3'
133✔
1764
               to defined the HGVS transcript annotation 
133✔
1765
  Returntype : int or undef if HGVS has not been calculated or shift not applied
1766
  Exceptions : none
133✔
1767
  Status     : At risk
1768

112✔
1769
=cut
1770

1771
sub hgvs_offset {
1772
  my $self = shift;
133✔
1773
  #_hgvs_offset can usually be taken directly from the shift hash, however in situations where we remove the shift_hash from the $tva object after calculating HGVS then we can access it from $self->{_hgvs_offset}
133✔
1774
  return defined($self->{shift_hash}) ? $self->{shift_hash}->{_hgvs_offset} : $self->{_hgvs_offset};
1775
}
1776

133✔
1777
=head2 hgvs_exon_start_coordinate
133✔
1778

×
1779
  Description: Return the HGVS exon start coordinate
×
1780
  Returntype : int or undef if HGVS has not been calculated
1781
  Exceptions : none
1782
  Status     : At risk
1783

133✔
1784
=cut
1785

1786
sub hgvs_exon_start_coordinate {
1787
  my $self = shift;
1788
  return $self->{_hgvs_exon_start_coordinate};
1789
}
1790

1791
=head2 hgvs_intron_start_offset
1792

1793
  Description: Return the HGVS intron start offset
1794
  Returntype : int or undef if HGVS has not been calculated
1795
  Exceptions : none
1796
  Status     : At risk
1797

1798
=cut
17✔
1799

1800
sub hgvs_intron_start_offset {
17✔
1801
  my $self = shift;
1802
  return $self->{_hgvs_intron_start_offset};
1803
}
1804

1805
=head2 hgvs_exon_end_coordinate
1806

1807
  Description: Return the HGVS exon end coordinate
1808
  Returntype : int or undef if HGVS has not been calculated
1809
  Exceptions : none
1810
  Status     : At risk
1811

1812
=cut
1813

1✔
1814
sub hgvs_exon_end_coordinate {
1✔
1815
  my $self = shift;
1816
  return $self->{_hgvs_exon_end_coordinate};
1817
}
1818

1819
=head2 hgvs_intron_end_offset
1820

1821
  Description: Return the HGVS intron end offset
1822
  Returntype : int or undef if HGVS has not been calculated
1823
  Exceptions : none
1824
  Status     : At risk
1825

1826
=cut
1827

1✔
1828
sub hgvs_intron_end_offset {
1✔
1829
  my $self = shift;
1830
  return $self->{_hgvs_intron_end_offset};
1831
}
1832

1833
### HGVS: format protein string
1834
sub _get_hgvs_protein_format {
1835
  my $self          = shift;
1836
  my $hgvs_notation = shift;
1837
  my $prediction_format = shift;
1838
  my $convert_to_three_letter = shift;
1839

1840
  ### all start with refseq name & numbering type
1841
  $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . ".";
1✔
1842

1✔
1843
  ### add paranthesis if asked to report in predicted format
1844
  $hgvs_notation->{'hgvs'} .= "(" if $prediction_format;
1845

1846
  ### New (v 15.11) way to describe synonymous changes
1847
  if( $hgvs_notation->{ref} eq $hgvs_notation->{alt} && $hgvs_notation->{type} ne "fs" && $hgvs_notation->{type} ne "ins"){
1848
    $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "=";
1849
  }
1850

1851
  ### handle stop_lost seperately regardless of cause by del/delins => p.TerposAA1extnum_AA_to_stop
1852
  elsif(stop_lost($self) && ($hgvs_notation->{type} eq "del" || $hgvs_notation->{type} eq ">" )) {
1853
    ### if deletion of stop add extTer and number of new aa to alt
1854

1855
    $hgvs_notation->{alt} = $convert_to_three_letter ? substr($hgvs_notation->{alt}, 0, 3) : substr($hgvs_notation->{alt}, 0, 1);
1✔
1856
    print "stop loss check req for $hgvs_notation->{type}\n" if $DEBUG ==1;
1✔
1857

1858
    my $aa_til_stop =  $self->_stop_loss_extra_AA($hgvs_notation->{start}-1 );
1859
    ### use ? to show new stop not predicted
1860
    $aa_til_stop = "?" unless defined $aa_til_stop ;
1861

133✔
1862
    $hgvs_notation->{alt} .=  "extTer" . $aa_til_stop;
133✔
1863

133✔
1864
    # 2020-07-14
1865
    if($convert_to_three_letter && length($hgvs_notation->{ref}) >3 && $hgvs_notation->{type} eq "del" ) {
1866
      my $ref_pep_first = $convert_to_three_letter ? substr($hgvs_notation->{ref}, 0, 3) : substr($hgvs_notation->{ref}, 0, 1);
133✔
1867
      my $ref_pep_last  = $convert_to_three_letter ? substr($hgvs_notation->{ref}, -3, 3) : substr($hgvs_notation->{ref}, -1, 1);
1868
      $hgvs_notation->{'hgvs'} .=  $ref_pep_first . $hgvs_notation->{start} .  "_" .  $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ;
1869
    }
133✔
1870
    else{
1871
      $hgvs_notation->{'hgvs'} .=  $hgvs_notation->{ref} . $hgvs_notation->{start} .  $hgvs_notation->{alt} ;
1872
    }
133✔
1873
  }
21✔
1874

1875
  elsif( $hgvs_notation->{type} eq "dup"){
1876
    if($hgvs_notation->{start} < $hgvs_notation->{end}){
1877
      ### list only first and last peptides in long duplicated string
1878
      my $ref_pep_first = $convert_to_three_letter ? substr($hgvs_notation->{alt}, 0, 3) : substr($hgvs_notation->{ref}, 0, 1);
1879
      my $ref_pep_last  = $convert_to_three_letter ? substr($hgvs_notation->{alt}, -3, 3) : substr($hgvs_notation->{ref}, -1, 1);
1880
      $hgvs_notation->{'hgvs'} .=  $ref_pep_first . $hgvs_notation->{start} .  "_" .  $ref_pep_last . $hgvs_notation->{end} ."dup";
8✔
1881
    }
8✔
1882
    else{
1883
      $hgvs_notation->{'hgvs'} .=  $hgvs_notation->{alt} . $hgvs_notation->{start} .  "dup" ;
8✔
1884
    }
1885

8✔
1886
    print "formating a dup  $hgvs_notation->{hgvs} \n" if $DEBUG==1;
1887
  }
8✔
1888

1889
  elsif($hgvs_notation->{type} eq ">"){
1890
    #### substitution
8✔
1891
    $hgvs_notation->{'hgvs'}  .=   $hgvs_notation->{ref}. $hgvs_notation->{start} .  $hgvs_notation->{alt};
×
1892
  }    
×
1893

×
1894
  elsif( $hgvs_notation->{type} eq "delins" || $hgvs_notation->{type} eq "ins" ){
1895

1896
    ## don't report other peptides after a stop is found
8✔
1897
    $hgvs_notation->{alt} =~ s/Ter\w+/Ter/ ;
1898

1899
    #### list first and last aa in reference only
1900
    my $ref_pep_first = $convert_to_three_letter ? substr($hgvs_notation->{ref}, 0, 3) : substr($hgvs_notation->{ref}, 0, 1);
1901
    my $ref_pep_last;
1902
    if(substr($hgvs_notation->{ref}, -1, 1) eq "X"){
1903
      $ref_pep_last ="Ter";
×
1904
    }
1905
    else{
×
1906
      $ref_pep_last  = $convert_to_three_letter ? substr($hgvs_notation->{ref}, -3, 3) : substr($hgvs_notation->{ref}, -1, 1);
×
1907
    }
×
1908

1909
    if($hgvs_notation->{ref} =~ /X$/) {
1910
      ### For stops & add extX & distance to next stop to alt pep
×
1911
      my $aa_til_stop =  $self->_stop_loss_extra_AA( $hgvs_notation->{start}-1, "loss");
1912
      if(defined  $aa_til_stop){  
1913
         $hgvs_notation->{alt} .="extTer" . $aa_til_stop;
×
1914
      }
1915
    }
1916

1917

1918
    if($hgvs_notation->{start} == $hgvs_notation->{end} && $hgvs_notation->{type} eq "delins"){       
34✔
1919
       $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start}  . $hgvs_notation->{type} . $hgvs_notation->{alt} ;
1920
    }
1921
    else{
1922
      ### correct ordering if needed
1923
      if($hgvs_notation->{start} > $hgvs_notation->{end}){
1924
        ( $hgvs_notation->{start}, $hgvs_notation->{end}) = ($hgvs_notation->{end}, $hgvs_notation->{start} );
26✔
1925
      }
1926

1927
      $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start}  . "_"  .  $ref_pep_last . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ;
26✔
1928
    }
26✔
1929
  }     
26✔
1930
  
×
1931
  elsif($hgvs_notation->{type} eq "fs"){
1932

1933
    if(defined $hgvs_notation->{alt} && $hgvs_notation->{alt} eq "Ter"){ ## stop gained
26✔
1934
      ## describe as substitution if stop occurs immediately
1935
      $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start}  .  $hgvs_notation->{alt} ;
1936
    }
26✔
1937
    else{ 
1938
      ## not immediate stop - count aa until next stop
×
1939
      my $aa_til_stop =  $self->_stop_loss_extra_AA( $hgvs_notation->{start}-1, "fs");
×
1940

×
1941
      ### use ? to show new stop not predicted
1942
      $aa_til_stop = "?" unless defined $aa_til_stop; 
1943

1944
      $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ."fsTer$aa_til_stop";     
1945
    }
26✔
1946
  }
4✔
1947

1948
  elsif( $hgvs_notation->{type} eq "del"){
1949
    $hgvs_notation->{alt} =  "del"; 
1950
    if( length($hgvs_notation->{ref}) >3 ){
22✔
1951
      my $ref_pep_first = $convert_to_three_letter ? substr($hgvs_notation->{ref}, 0, 3) : substr($hgvs_notation->{ref}, 0, 1);
16✔
1952
      my $ref_pep_last  = $convert_to_three_letter ? substr($hgvs_notation->{ref}, -3, 3) : substr($hgvs_notation->{ref}, -1, 1);
1953
      $hgvs_notation->{'hgvs'} .=  $ref_pep_first . $hgvs_notation->{start} .  "_" .  $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ;
1954
    }
22✔
1955
    else{
1956
      $hgvs_notation->{'hgvs'} .=  $hgvs_notation->{ref} . $hgvs_notation->{start} .  $hgvs_notation->{alt} ;
1957
    }       
1958
  }
1959

1960
  elsif($hgvs_notation->{start} ne $hgvs_notation->{end} ){
28✔
1961
    $hgvs_notation->{'hgvs'}  .=  $hgvs_notation->{ref} . $hgvs_notation->{start}  . "_" .  $hgvs_notation->{alt} . $hgvs_notation->{end} ;
1962
  }
8✔
1963

1964
  else{
1965
  #### default to substitution    
1966
    $hgvs_notation->{'hgvs'}  .=   $hgvs_notation->{ref}. $hgvs_notation->{start} .  $hgvs_notation->{alt};
20✔
1967
  }
1968

1969
  ### add paranthesis if asked to report in predicted format
20✔
1970
  $hgvs_notation->{'hgvs'} .= ")" if $prediction_format;
1971
  
20✔
1972
  if($DEBUG==1){ print "Returning protein format: $hgvs_notation->{'hgvs'}\n";}
1973
  return $hgvs_notation->{'hgvs'};
1974
}
1975

1976
### HGVS: get type of variation event in protein terms
12✔
1977
sub _get_hgvs_protein_type {
12✔
1978
  my $self = shift;
1✔
1979
  my $hgvs_notation = shift;
1✔
1980

1✔
1981
  if($DEBUG==1){ print "starting get_hgvs_protein_type \n";}
1982

1983
  if( frameshift($self) ){
11✔
1984
    $hgvs_notation->{type} = "fs";
1985
    return $hgvs_notation;
1986
  }
1987

1988
  if( defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} ){
×
1989
  ### Run type checks on peptides if available
1990
    $hgvs_notation->{ref} =~ s/\*/X/;
1991
    $hgvs_notation->{alt} =~ s/\*/X/;
1992

1993
    if($hgvs_notation->{ref} eq "-" || $hgvs_notation->{ref} eq "") {
4✔
1994
      $hgvs_notation->{type} = "ins";
1995
    }
1996
    elsif($hgvs_notation->{alt} eq "" || $hgvs_notation->{alt} eq "-") {
1997
      $hgvs_notation->{type} = "del";
133✔
1998
    }
1999
    elsif( length($hgvs_notation->{ref}) ==1 && length($hgvs_notation->{alt}) ==1 ) {
133✔
2000
      $hgvs_notation->{type} = ">";
133✔
2001
    }
2002
    elsif(
2003
      ((length($hgvs_notation->{alt}) >0 && length($hgvs_notation->{ref}) >0) &&
2004
      (length($hgvs_notation->{alt}) ne length($hgvs_notation->{ref})) )  ||
2005
      (length($hgvs_notation->{alt}) >1 && length($hgvs_notation->{ref}) >1)     ## not a substitution if >1 aa switched
133✔
2006
    ) {
133✔
2007
      $hgvs_notation->{type} = "delins";
2008
    }
133✔
2009
    else{
2010
      $hgvs_notation->{type} = ">";
133✔
2011
    }
28✔
2012
  }
28✔
2013
  else {
2014
    ### Cannot define type from peptides - check at DNA level
2015
    ### get allele length from dna seq & cds length
105✔
2016
    my ($ref_length, $alt_length ) = $self->_get_allele_length(); 
2017

105✔
2018
    if($alt_length >1  ){
105✔
2019
      if($hgvs_notation->{start} == ($hgvs_notation->{end} + 1) ){
2020
        ### convention for insertions - end one less than start
105✔
2021
        $hgvs_notation->{type} = "ins";
16✔
2022
      }
2023
      elsif( $hgvs_notation->{start} != $hgvs_notation->{end}  ){
2024
        $hgvs_notation->{type} = "delins";
14✔
2025
      }
2026
      else{
2027
        $hgvs_notation->{type} = ">";
65✔
2028
      } 
2029
    }
2030
    elsif($ref_length >1  ){
2031
      $hgvs_notation->{type} = "del"; 
2032
    }
2033
 
2034
    else{
10✔
2035
      #print STDERR "DEBUG ".$self->variation_feature->start."\n";
2036
      #warn "Cannot define protein variant type [$ref_length  - $alt_length]\n";
2037
    }
×
2038
  }
2039

2040
  return $hgvs_notation;
2041
}
2042

2043
### HGVS: get reference & alternative peptide 
×
2044
sub _get_hgvs_peptides {
2045
  my $self          = shift;
×
2046
  my $hgvs_notation = shift;
×
2047
  my $convert_to_three_letter = shift;
2048

×
2049
  if($hgvs_notation->{type} eq "fs"){
2050
    ### ensembl alt/ref peptides not the same as HGVS alt/ref - look up seperately
2051
    $hgvs_notation = $self->_get_fs_peptides($hgvs_notation); 
×
2052
    return undef unless defined $hgvs_notation->{type};   
2053
  }
2054
  elsif($hgvs_notation->{type} eq "ins" ){
×
2055

2056
    ### Check if bases directly after insertion match inserted sequence
2057
    $hgvs_notation = $self->_check_peptides_post_var($hgvs_notation);
2058

×
2059
    ### Check that inserted bases do not duplicate 3' reference sequence [set to type = dup and return if so]
2060
    $hgvs_notation = $self->_check_for_peptide_duplication($hgvs_notation) unless $hgvs_notation->{alt} =~/\*/;
2061
    return ($hgvs_notation) if $hgvs_notation->{type} eq "dup";              
2062

2063
    ### HGVS ref are peptides flanking insertion
2064
    my $min;
2065
    if($hgvs_notation->{start} < $hgvs_notation->{end}){
2066
        $min = $hgvs_notation->{start};
2067
    }
105✔
2068
    else{ $min = $hgvs_notation->{end};}
2069

2070
    $hgvs_notation->{ref} = $self->_get_surrounding_peptides(
2071
      $min, 
2072
      $hgvs_notation->{original_ref},
133✔
2073
      2
133✔
2074
    );
133✔
2075

2076
    return undef unless $hgvs_notation->{ref};
133✔
2077
  }
2078
  elsif($hgvs_notation->{type} eq "del" ){
28✔
2079
    ##check if bases directly after deletion match the deletion
28✔
2080
    $hgvs_notation = $self->_check_peptides_post_var($hgvs_notation);
2081
  }
2082

2083
  ### Convert peptide to 3 letter code as used in HGVS
2084
  if ( $convert_to_three_letter ){
16✔
2085
      $hgvs_notation->{ref}  = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref',  -alphabet => 'protein')) || "" unless ($hgvs_notation->{ref} eq "-");
2086
      $hgvs_notation->{alt}  = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref',  -alphabet => 'protein')) || "" unless ($hgvs_notation->{alt} eq "-");
2087
  }
16✔
2088
  $hgvs_notation->{alt} = "del" if $hgvs_notation->{alt} eq "-"; 
16✔
2089

2090
  ### handle special cases
2091
  if( start_lost($self) ){
16✔
2092
    #### handle initiator loss -> probably no translation => alt allele is '?'
16✔
2093
    $hgvs_notation->{alt}  = "?";    
×
2094
    $hgvs_notation->{type} = "";
2095
  }
16✔
2096

2097
  elsif(  $hgvs_notation->{type} eq "del"){ 
16✔
2098
    if( $hgvs_notation->{ref} =~/\w+/){
2099
      $hgvs_notation->{alt} = "del";
2100
    }
2101
    else{
2102
      $hgvs_notation = $self->_get_del_peptides($hgvs_notation);
2103
    }
16✔
2104
  } 
2105
  elsif($hgvs_notation->{type} eq "fs"){
2106
    ### only quote first ref peptide for frameshift
2107
    $hgvs_notation->{ref} = substr($hgvs_notation->{ref},0,3);
14✔
2108
  }
2109

2110
  ### 2012-08-31  - Ter now recommended in HGVS
2111
  if(defined $hgvs_notation->{ref}){ $hgvs_notation->{ref} =~ s/Xaa/Ter/g; }
133✔
2112
  if(defined $hgvs_notation->{alt}){ $hgvs_notation->{alt} =~ s/Xaa/Ter/g; }
133✔
2113

133✔
2114
  return ($hgvs_notation);           
133✔
2115
}
2116

133✔
2117
### HGVS:  remove common peptides/nucleotides from alt and ref strings & shift start/end accordingly 
2118
sub _clip_alleles {
2119
    
133✔
2120
  my $hgvs_notation = shift;
2121
    
4✔
2122
  my $check_alt   = $hgvs_notation->{alt};
4✔
2123
  my $check_ref   = $hgvs_notation->{ref};
2124
  my $check_start = $hgvs_notation->{start};
2125
  my $check_end   = $hgvs_notation->{end};
2126

14✔
2127
  ## cache this - if stop needed later
14✔
2128
  $hgvs_notation->{original_ref} = $hgvs_notation->{ref};
2129

2130
  ## store identical trimmed seq 
×
2131
  my $preseq = "";
2132
  print "can we clip :  $check_ref &  $check_alt\n" if $DEBUG ==1;
2133
  ### strip same bases from start of string
2134
  for (my $p =0; $p <length ($hgvs_notation->{ref}); $p++){
2135
    my $check_next_ref = substr( $check_ref, 0, 1);
28✔
2136
    my $check_next_alt = substr( $check_alt, 0, 1);
2137
    
2138
    ### stop re-created by variant - no protein change
2139
    if( defined $hgvs_notation->{'numbering'} &&
133✔
2140
        $hgvs_notation->{'numbering'} eq 'p' &&
133✔
2141
        $check_next_ref eq  "*" && $check_next_alt eq "*"){
2142
      $hgvs_notation->{type} = "=";
2143

133✔
2144
      return($hgvs_notation);
2145
    }
2146
    
2147
    if($check_next_ref eq  $check_next_alt){
2148
      $check_start++;
2149
      $check_ref  = substr( $check_ref, 1);
288✔
2150
      $check_alt  = substr( $check_alt, 1);
2151
      $preseq    .= $check_next_ref;
288✔
2152
    }
288✔
2153
    else{
288✔
2154
      last;
288✔
2155
    }
2156
  }
2157

288✔
2158
  my $len = length ($check_ref);
2159
  #### strip same bases from end of string
2160

288✔
2161
  for (my $q =0; $q < $len; $q++) {
288✔
2162
    my $check_next_ref = substr( $check_ref, -1, 1);
2163
    my $check_next_alt = substr( $check_alt, -1, 1);
288✔
2164
    if($check_next_ref eq  $check_next_alt){
258✔
2165
      chop $check_ref;
258✔
2166
      chop $check_alt;
2167
      $check_end--;
258✔
2168
    }
2169
    else{
2170
      last;
×
2171
    }
2172
  }
×
2173
  
2174
  ## ammend positions & ref/alt
2175
  $hgvs_notation->{alt}   = $check_alt;
258✔
2176
  $hgvs_notation->{ref}   = $check_ref;
14✔
2177
  $hgvs_notation->{start} = $check_start;
14✔
2178
  $hgvs_notation->{end}   = $check_end;
14✔
2179
  $hgvs_notation->{preseq} =   $preseq ;
14✔
2180

2181
  ### check if clipping suggests a type change 
2182
  
244✔
2183
  ## no protein change - use transcript level annotation 
2184
  if( defined $hgvs_notation->{'numbering'} &&
2185
        $hgvs_notation->{'numbering'} eq 'p'&&
2186
        $check_ref eq $check_alt) {
288✔
2187
      $hgvs_notation->{type} = "=";
2188
  }   
2189
  
288✔
2190
  ## re-set as > not delins
251✔
2191
  elsif( $check_ref ne "-" && 
251✔
2192
        length ($check_ref) == 1 && 
251✔
2193
        length ($check_alt) == 1 && 
9✔
2194
        $hgvs_notation->{alt} ne $hgvs_notation->{ref}) {
9✔
2195
      
9✔
2196
      $hgvs_notation->{type} = ">";
2197
  }
2198
  
242✔
2199
  
2200
  ### re-set as ins/dup not delins 
2201
  elsif(length ($check_ref) == 0 && length ($check_alt) >= 1){
2202
      ### re-set as dup not delins (ignore for protein as it is checked later on)
2203
      my $prev_str = substr($preseq, length($preseq) - length($check_alt));
288✔
2204
      if( defined $hgvs_notation->{'numbering'} &&
288✔
2205
          $hgvs_notation->{'numbering'} ne 'p' &&
288✔
2206
          $check_alt eq $prev_str) {
288✔
2207
            $hgvs_notation->{type} = "dup";
288✔
2208
            $hgvs_notation->{start} -= length($check_alt);
2209
      }
2210
    
2211
      ### re-set as ins not delins
2212
      else {
288✔
2213
        $hgvs_notation->{type} ="ins";
×
2214
      }
2215
  }
2216
  
2217
  ### re-set as del not delins  
2218
  elsif(length ($check_ref) >=1 && length ($check_alt) == 0){
2219
    $hgvs_notation->{type}  = "del" ;      
2220
  }
2221

2222
  print "clipped :  $check_ref &  $check_alt\n" if $DEBUG ==1;
159✔
2223

2224
  return $hgvs_notation;
2225
}
2226
    
2227

2228

2229

46✔
2230

46✔
2231
#### HGVS: check allele lengths to look for frameshifts
×
2232
sub _get_allele_length {
×
2233
  my $self       = shift;
2234
  my $ref_length = 0;
2235
  my $alt_length = 0;
2236

2237
  my $al_string = $self->allele_string();
46✔
2238
  my $ref_allele = (split/\//, $al_string)[0];
2239
  $ref_allele =~ s/\-//;
2240
  $ref_length = length $ref_allele;
2241

2242
  my $alt_allele = $self->variation_feature_seq();
2243
  $alt_allele =~ s/\-//;
43✔
2244
  $alt_length = length $alt_allele;
2245

2246
  return ($ref_length, $alt_length );  
288✔
2247
}
2248

288✔
2249
### HGVS: list first different peptide [may not be first changed codon]
2250
sub _get_fs_peptides {
2251
  my $self    = shift;
2252
  my $hgvs_notation = shift;
2253

2254
  ### get CDS with alt variant
2255
  my $alt_cds = $self->_get_alternate_cds();
2256
  return undef unless defined($alt_cds);
2257
  unless ($alt_cds->seq =~ /([ACGT-])+/) {
×
2258
    warn('Alternate CDS not found on seq region ' . $self->variation_feature->seq_region_name . '. Are you missing a synonyms file?');
×
2259
    return undef;
×
2260
  }
2261

×
2262
  #### get new translation
×
2263
  my $alt_trans = $alt_cds->translate()->seq();
×
2264

×
2265
  ### get changed end (currently in single letter AA coding)    
2266
  my $ref_trans  = $self->transcript_variation->_peptide;
×
2267

×
2268
  $ref_trans    .= "*";   ## appending ref stop for checking purposes 
×
2269
  
2270
  $hgvs_notation->{start} = $self->transcript_variation->translation_start() ;
×
2271

2272
  if( $hgvs_notation->{start} > length $alt_trans){ ## deletion of stop, no further AA in alt seq 
2273
    $hgvs_notation->{alt}  = "del";
2274
    $hgvs_notation->{type} = "del";
2275
    return $hgvs_notation;
28✔
2276
  }
28✔
2277

2278
  while ($hgvs_notation->{start} <= length $alt_trans){
2279
    ### frame shift may result in the same AA#
28✔
2280

28✔
2281
    $hgvs_notation->{ref} = substr($ref_trans, $hgvs_notation->{start}-1, 1);
28✔
2282
    $hgvs_notation->{alt} = substr($alt_trans, $hgvs_notation->{start}-1, 1);
×
2283

×
2284
    if($hgvs_notation->{ref} eq "*" && $hgvs_notation->{alt} eq "*"){
2285
      ### variation at stop codon, but maintains stop codon - set to synonymous
2286
      $hgvs_notation->{type} = "=";
2287
      return ($hgvs_notation);
28✔
2288
    }
2289

2290
    last if $hgvs_notation->{ref} ne $hgvs_notation->{alt};
28✔
2291
    $hgvs_notation->{start}++;
2292
  }
28✔
2293

2294
  return ($hgvs_notation);
28✔
2295
}
2296

28✔
2297
#### HGVS: if variant is an insertion, ref pep is initially "-", so seek AA before and after insertion
×
2298
sub _get_surrounding_peptides {
×
2299
  my $self    = shift;
×
2300
  my $ref_pos = shift; 
2301
  my $original_ref = shift;
2302
  my $length  = shift;
28✔
2303

2304
  my $ref_trans  = $self->transcript_variation->_peptide();
2305
  $ref_trans .= $original_ref
28✔
2306
    if defined $original_ref && $original_ref =~ /^\*/;
28✔
2307

2308
  ## can't find peptide after the end
28✔
2309
  return if length($ref_trans) <=  $ref_pos ;
2310

×
2311
  my $ref_string;
×
2312
  if(defined $length) {
2313
    $ref_string = substr($ref_trans, $ref_pos-1, $length);
2314
  }
28✔
2315
  else{
×
2316
    $ref_string = substr($ref_trans, $ref_pos-1 );
2317
  }
2318

28✔
2319
  return ($ref_string);
2320
}
2321

2322

2323
#### HGVS: alternate CDS needed to check for new stop when variant disrupts 'reference' stop
46✔
2324
sub _get_alternate_cds {
46✔
2325
    
46✔
2326
  my $self = shift;
46✔
2327

2328
  ### get reference sequence
46✔
2329
  my $reference_cds_seq = $self->transcript_variation->_translateable_seq();
46✔
2330
  
2331
  my $tv = $self->transcript_variation;
2332
  my $vf = $tv->variation_feature;
2333
  my $tr = $tv->transcript;
46✔
2334
  my $shifting_offset = (defined($self->{shift_hash}) && defined($self->{shift_hash}->{shift_length})) ? $self->{shift_hash}->{shift_length} : 0;
2335
  
33✔
2336
  return undef unless defined($tv->cds_start(undef, $tr->strand * $shifting_offset)) && defined($tv->cds_end(undef, $tr->strand * $shifting_offset));
33✔
2337

16✔
2338
  ### get sequences upstream and downstream of variant
2339
  my $upstream_seq   =  substr($reference_cds_seq, 0, ($tv->cds_start(undef, $tr->strand * $shifting_offset) -1) );
2340
  my $downstream_seq =  substr($reference_cds_seq, ($tv->cds_end(undef, $tr->strand * $shifting_offset)) );
17✔
2341
  return undef unless defined($downstream_seq) && defined($upstream_seq);
2342
  
2343
  ### fix alternate allele if deletion or on opposite strand
33✔
2344
  my $alt_allele  = $self->variation_feature_seq();
2345
  $alt_allele  =~ s/\-//;
2346
  if($alt_allele && $vf->strand() != $tr->strand()){    
2347
    reverse_comp(\$alt_allele) ;
2348
  }
2349

2350
  ### build alternate seq
167✔
2351
  my $alternate_seq  = $upstream_seq . $alt_allele . $downstream_seq ;
2352
  $alternate_seq  = $self->_trim_incomplete_codon($alternate_seq );
2353

167✔
2354
  ### create seq obj with alternative allele in the CDS sequence
2355
  my $alt_cds =Bio::PrimarySeq->new(-seq => $alternate_seq,  -id => 'alt_cds', -alphabet => 'dna');
167✔
2356

167✔
2357
  ### append UTR if available as stop may be disrupted
167✔
2358
  my $utr = $self->transcript_variation->_three_prime_utr();
167✔
2359

2360
  if (defined $utr) {
167✔
2361
  ### append the UTR to the alternative CDS 
2362
    $alt_cds->seq($alt_cds->seq() . $utr->seq()); 
2363
  }
167✔
2364
  else{
167✔
2365
   ##warn "No UTR available for alternate CDS\n";
167✔
2366
  }
2367

2368
  return $alt_cds;
167✔
2369
}
167✔
2370

167✔
2371
### HGVS: if inserted string is identical to 3' reference sequence, describe as duplication
11✔
2372
sub _check_for_peptide_duplication {    
2373
  my $self = shift;
2374
  my $hgvs_notation = shift;
2375

167✔
2376
  ##### get reference sequence
167✔
2377
  my $reference_cds_seq = $self->transcript_variation->_translateable_seq();
2378

2379
  my $reference_cds = Bio::PrimarySeq->new(-seq => $reference_cds_seq,  -id => 'alt_cds', -alphabet => 'dna');
167✔
2380
  my $reference_trans = $reference_cds->translate()->seq();
2381

2382
  ##### get sequence upstream of variant - use hgvs start; may have been shifted
167✔
2383
  my $upstream_trans  = substr($reference_trans, 0, ($hgvs_notation->{'start'} -1) );
2384
  print "Checking for peptide duplication: $hgvs_notation->{alt} vs $upstream_trans  $hgvs_notation->{preseq} \n" if $DEBUG ==1;
167✔
2385

2386
  $upstream_trans  .= $hgvs_notation->{preseq} if defined $hgvs_notation->{preseq}; ## add back on anything previously chopped off ref allele
162✔
2387

2388
  ## Test whether alt peptide matches the reference sequence just before the variant
2389
  my $test_new_start = $hgvs_notation->{'start'} - length($hgvs_notation->{'alt'}) -1 ;
2390

2391
  if( (length($upstream_trans) >=  $test_new_start + length($hgvs_notation->{'alt'}) ) && $test_new_start  >=0){
2392
    my $test_seq       =  substr($upstream_trans, $test_new_start, length($hgvs_notation->{'alt'}));
167✔
2393

2394
    if ( $test_new_start >= 0 && $test_seq eq $hgvs_notation->{alt}) {           
2395
      $hgvs_notation->{type}   = 'dup';
2396
      $hgvs_notation->{end}    = $hgvs_notation->{start} -1;
2397
      $hgvs_notation->{start} -= length($hgvs_notation->{alt});
12✔
2398

12✔
2399
      ## convert to 3 letter code
2400
      $hgvs_notation->{alt}  = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref',  -alphabet => 'protein')) || "";
2401
    }
12✔
2402
  }
2403
  return $hgvs_notation;
12✔
2404
}
12✔
2405

2406
#### HGVS: if a stop is lost, seek the next in transcript & count number of extra AA's
2407
sub _stop_loss_extra_AA{
12✔
2408

12✔
2409
  my $self        = shift;
2410
  my $ref_var_pos = shift;  ### first effected AA - supply for frameshifts
12✔
2411
  my $test        = shift;
2412

2413
  return undef unless $ref_var_pos;
12✔
2414

2415
  my $extra_aa;
12✔
2416

12✔
2417
  ### get the sequence with variant added
2418
  my $alt_cds   = $self->_get_alternate_cds();
12✔
2419
  return undef unless defined($alt_cds);
×
2420
  
×
2421
  ### get new translation
×
2422
  my $alt_trans = $alt_cds->translate();
2423

2424
  my $ref_temp  =  $self->transcript_variation->_peptide();
×
2425
  my $ref_len = length($ref_temp);
2426
  
2427
  if($DEBUG==1){ 
12✔
2428
    print "alt translated:\n" . $alt_trans->seq() . "\n";
2429
    print "ref translated:\n$ref_temp\n";;
2430
  }
2431
  
2432
  #### Find the number of residues that are translated until a termination codon is encountered
2433
  if ($alt_trans->seq() =~ m/\*/) {
28✔
2434
    if($DEBUG==1){print "Got $+[0] aa before stop, var event at $ref_var_pos \n";}
28✔
2435
  
28✔
2436
    if(defined $test && $test eq "fs" ){
2437
      ### frame shift - count from first AA effected by variant to stop
28✔
2438
      $extra_aa = $+[0] - $ref_var_pos;
2439
      if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before fs stop [ $+[0] - peptide ref_start: $ref_var_pos )]\n";}
28✔
2440
    }
2441
  
2442
    else{
28✔
2443
      $extra_aa = $+[0]  - 1 - $ref_len;
28✔
2444
      if($DEBUG==1){ print "Stop change (non-fs): found $extra_aa amino acids before next stop [ $+[0] - 1 -normal stop $ref_len)]\n";}        
2445
    }
2446
  }
28✔
2447
  
2448
  # A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon
28✔
2449
  if(defined $extra_aa && $extra_aa >0){ 
28✔
2450
    return $extra_aa ;
2451
  }
28✔
2452
  else{ 
×
2453
    #warn "No stop found in alternate sequence\n";
×
2454
    return undef;
2455
  }
2456
}
2457

28✔
2458
## doing this to stop final incomplete codons being guessed
26✔
2459
sub _trim_incomplete_codon{
2460
  my $self = shift;
26✔
2461
  my $seq  = shift;
2462

20✔
2463
  return undef unless $seq;
20✔
2464

2465
  my $full_length = length $seq;
2466
  my $keep_length = $full_length -  $full_length % 3;
2467
  return $seq if $full_length = $keep_length ;
6✔
2468

6✔
2469
  return substr($seq, 0, $keep_length);
2470
}
2471

2472

2473
## This is used for rare in-frame deletions removing an intron and part of both surrounding exons
28✔
2474
sub _get_del_peptides{
26✔
2475

2476
  my $self    = shift;
2477
  my $hgvs_notation = shift;
2478

2✔
2479
  ### get CDS with alt variant
2480
  my $alt_cds = $self->_get_alternate_cds();
2481
  return undef unless defined($alt_cds);
2482

2483
  #### get new translation
2484
  my $start = $self->transcript_variation->translation_start() - 1;
167✔
2485
  my $alt = substr($alt_cds->translate()->seq(), $start );
167✔
2486
  $hgvs_notation->{alt} = (split/\*/, $alt)[0];
2487

167✔
2488
  ### get changed end (currently in single letter AA coding)    
2489
  $hgvs_notation->{ref}  = substr($self->transcript->translate()->seq(), $start );
167✔
2490

167✔
2491
  $hgvs_notation->{start} = $self->transcript_variation->translation_start() ;
167✔
2492

2493
  $hgvs_notation = _clip_alleles($hgvs_notation);
×
2494

2495
  ## switch to 3 letter AA coding
2496
  $hgvs_notation->{alt}  = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref',  -alphabet => 'protein')) || "";
2497
  $hgvs_notation->{ref}  = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref',  -alphabet => 'protein')) || "";
2498
  return $hgvs_notation;
2499
}
2500

×
2501
## HGVS counts from first different peptide, 
×
2502
## so check the sequence post variant and increment accordingly
2503
sub _check_peptides_post_var{
2504
  my $self          = shift;
×
2505
  my $hgvs_notation = shift;
×
2506

2507
  ## check peptides after deletion 
2508
  my $post_pos = $hgvs_notation->{end}+1;
×
2509
  my $post_seq = $self->_get_surrounding_peptides(
×
2510
    $post_pos,
×
2511
    $hgvs_notation->{original_ref}
2512
  );
2513

×
2514
  ## if a stop is deleted and no sequence is available beyond to check, return
2515
  return $hgvs_notation unless defined $post_seq;
×
2516

2517
  $hgvs_notation = _shift_3prime($hgvs_notation, $post_seq);
×
2518
  return $hgvs_notation;
2519
}
2520

×
2521
## HGVS aligns changes 3' (alt string may look different at transcript level to genomic level)
×
2522
## AAC[TG]TAT => AACT[GT]AT
×
2523
## TTA[GGG]GGTTTA =>TTAGG[GGG]TTTA
2524

2525
sub _shift_3prime{
2526

2527
  my $hgvs_notation = shift;
2528
  my $post_seq      = shift;
30✔
2529

30✔
2530
  my $seq_to_check;
2531
  if( $hgvs_notation->{type} eq 'ins'){
2532
    $seq_to_check = $hgvs_notation->{alt};
30✔
2533
  }
30✔
2534
  elsif ($hgvs_notation->{type} eq 'del'){
2535
    $seq_to_check = $hgvs_notation->{ref};
2536
  }
2537
  else{
2538
    return $hgvs_notation;
2539
  }
30✔
2540

2541
  ## return if nothing to check
17✔
2542
  return $hgvs_notation unless defined $post_seq && defined $seq_to_check;
17✔
2543

2544
  ## get length of pattern to check 
2545
  my $deleted_length = (length $seq_to_check);
2546
  
2547
  # warn "Checking $seq_to_check v $post_seq\n";
2548
  ## move along sequence after deletion looking for match to start of deletion
2549
  for (my $n = 0; $n<= (length($post_seq) - $deleted_length); $n++ ){
2550

2551
    ## check each position in deletion/ following seq for match
17✔
2552
    my $check_next_del  = substr( $seq_to_check, 0, 1);
17✔
2553
    my $check_next_post = substr( $post_seq, $n, 1);
2554

17✔
2555
    if($check_next_del eq $check_next_post){
17✔
2556

16✔
2557
      ## move position of deletion along
2558
      $hgvs_notation->{start}++;
2559
      $hgvs_notation->{end}++;
1✔
2560
          
2561
      ## modify deleted sequence - remove start & append to end
2562
      $seq_to_check = substr($seq_to_check,1);
×
2563
      $seq_to_check .= $check_next_del;
2564
    }
2565
    else{
2566
      last;            
17✔
2567
    }
2568
  }
2569
  ## set new HGVS string
17✔
2570
  $hgvs_notation->{alt} = $seq_to_check if $hgvs_notation->{type} eq 'ins';
2571
  $hgvs_notation->{ref} = $seq_to_check if $hgvs_notation->{type} eq 'del';
2572

2573
  return $hgvs_notation;
17✔
2574
}
2575
=head
2576
# We haven't implemented support for these methods yet
17✔
2577

17✔
2578
sub hgvs_rna {
2579
    return _hgvs_generic(@_,'rna');
17✔
2580
}
2581

2582
sub hgvs_mitochondrial {
×
2583
    return _hgvs_generic(@_,'mitochondrial');
×
2584
}
2585

2586
=cut
×
2587

×
2588
sub _hgvs_generic {
2589
  my $self = shift;
2590
  my $reference = pop;
17✔
2591
  my $notation = shift;
2592

2593
  #The rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these
2594
  return undef if ($reference =~ m/rna|mitochondrial/);
17✔
2595
  
17✔
2596
  my $sub = qq{hgvs_$reference};
2597
  
17✔
2598
  $self->{$sub} = $notation if defined $notation;
2599
  
2600
  unless ($self->{$sub}) {
2601

2602
    # Use the transcript this VF is on as the reference feature
2603
    my $reference_feature = $self->feature;
2604
    # If we want genomic coordinates, the reference_feature should actually be the slice for the underlying seq_region
2605
    $reference_feature = $reference_feature->slice->seq_region_Slice if ($reference eq 'genomic');
2606

2607
    # Calculate the HGVS notation on-the-fly and pass it to the TranscriptVariation in order to distribute the result to the other alleles
2608
    my $tv = $self->base_variation_feature_overlap;
2609
    my $vf = $self->base_variation_feature;
2610

2611
    $tv->$sub($vf->get_all_hgvs_notations($reference_feature,substr($reference,0,1),undef,undef,$tv));
2612
  }
2613
  
6✔
2614
  return $self->{$sub};
6✔
2615
}
6✔
2616

2617

2618
### HGVS: move variant to transcript slice
6✔
2619
sub _var2transcript_slice_coords{
2620
  my ($self, $tr, $tv, $vf) = @_;
6✔
2621

2622
  $tv ||= $self->base_variation_feature_overlap;
6✔
2623
  $tr ||= $tv->transcript;
2624
  $vf ||= $tv->base_variation_feature;
6✔
2625
  
2626
  # what we want is VF coords relative to transcript feature slice
2627
  my ($tr_start, $tr_end) = ($tr->start, $tr->end);
3✔
2628
  my ($vf_start, $vf_end);
2629

3✔
2630
  # same slice, easy and we don't need to use transfer
2631
  if($NO_TRANSFER || $tr->slice eq $vf->slice) {
2632

3✔
2633
    # different transform depending on transcript strand
3✔
2634
    if($tr->strand < 1) {
2635

3✔
2636
      # note we switch start/end here
2637
      # this also works in the case of insertions thankfully
2638
      ($vf_start, $vf_end) = map {($tr_end - $_) + 1} ($vf->end, $vf->start) unless $vf->{shifted_flag};
6✔
2639
      ($vf_start, $vf_end) = map {($tr_end - $_) + 1} ($vf->{unshifted_end}, $vf->{unshifted_start}) if $vf->{shifted_flag};
2640
    }
2641
    else {
2642
      ($vf_start, $vf_end) = map {($_ - $tr_start) + 1} ($vf->start, $vf->end) unless $vf->{shifted_flag};
2643
      ($vf_start, $vf_end) = map {($_ - $tr_start) + 1} ($vf->{unshifted_start}, $vf->{unshifted_end}) if $vf->{shifted_flag};
2644
    }
206✔
2645
  }
2646

206✔
2647
  # different slices used to fetch features
206✔
2648
  # have to use transfer for safety
206✔
2649
  else {
2650
    my $tr_vf = $vf->transfer($self->_transcript_feature_Slice($tr));
2651
    return undef unless $tr_vf;
206✔
2652
    ($vf_start, $vf_end) = ($tr_vf->start, $tr_vf->end);
206✔
2653
  }
2654

2655
  # Return undef if this VariationFeature does not fall within the supplied feature.
206✔
2656
  return undef if (
2657
    $vf_start  < 1 || 
2658
    $vf_end    < 1 || 
205✔
2659
    $vf_start  > ($tr_end - $tr_start + 1) || 
2660
    $vf_end    > ($tr_end - $tr_start + 1)
2661
  ); 
2662
  
97✔
2663
  return( $vf_start , $vf_end, $self->_transcript_feature_Slice($tr));
97✔
2664
}
2665

2666

108✔
2667

108✔
2668
### HGVS: get variant position in transcript 
2669

2670
# intron: 
2671
# If the position is in an intron, the boundary position of the closest exon and 
2672
# a + or - offset into the intron is returned.
2673
# Ordered by genome forward not 5' -> 3'
2674

1✔
2675
# upstream:
1✔
2676
# If the position is 5' of the start codon, it is reported relative to the start codon 
1✔
2677
# (-1 being the last nucleotide before the 'A' of ATG).
2678

2679
#downstream:
2680
# If the position is 3' pf the stop codon, it is reported with a '*' prefix and the offset 
2681
# from the start codon (*1 being the first nucleotide after the last position of the stop codon)
206✔
2682

2683
sub _get_cDNA_position {
2684

2685
  my $self     = shift;
2686
  my $position = shift; ### start or end of variation
2687

206✔
2688
  my $tv         = $self->base_variation_feature_overlap;
2689
  my $transcript = $tv->transcript();
2690
  my $strand     = $transcript->strand();    
2691

2692
  #### TranscriptVariation start/stop coord relative to transcript 
2693
  #### Switch to chromosome coordinates taking into account strand
2694
  $position = ( $strand > 0 ? 
2695
    ( $transcript->start() + $position - 1 )  :   
2696
    ( $transcript->end()   - $position + 1));
2697

2698
  # Get all exons sorted in positional order
2699
  my $exons = $tv->_sorted_exons();
2700

2701
  my $n_exons = scalar(@$exons);
2702

2703
  my $cdna_position;
2704
  # Loop over the exons and get the coordinates of the variation in exon+intron notation
2705
  for (my $i=0; $i<$n_exons; $i++) {
2706

2707
    my $exon = $exons->[$i];
2708
    my ($exon_start, $exon_end) = ($exon->{start}, $exon->{end});
2709

257✔
2710
    # Skip if the start point is beyond this exon
257✔
2711
    next if ($position > $exon_end);
2712

257✔
2713

257✔
2714
    # EXONIC:  If the start coordinate is within this exon
257✔
2715
    if ($position >= $exon_start) {
2716
      # Get the cDNA start coordinate of the exon and add the number of nucleotides from the exon boundary to the variation
2717
      # If the transcript is in the opposite direction, count from the end instead
2718
      $cdna_position = $self->_exon_cdna_start($exon, $transcript) + (
257✔
2719
        $strand > 0 ? 
2720
        ( $position - $exon_start ) : 
2721
        ( $exon_end - $position ) 
2722
      );
2723
      last;  #### last exon checked
257✔
2724
    }
2725

257✔
2726
    ## INTRONIC
2727
    # Else the start coordinate is between this exon and the previous one, determine which one is closest and get coordinates relative to that one
257✔
2728
    else {
2729

257✔
2730
      my $prev_exon = $exons->[$i-1];
2731

1,358✔
2732
      my $updist   = ($position - $prev_exon->{end});
1,358✔
2733
      $updist =~ s/\-//; ## avoid problems with incomplete transcripts
2734
      my $downdist = ($exon_start - $position);
2735
      $downdist =~ s/\-//; ## avoid problems with incomplete transcripts
1,358✔
2736

2737
      # If the distance to the upstream exon is the shortest, or equal and in the positive orientation, use that
2738
      if ($updist < $downdist || ($updist == $downdist && $strand >= 0)) {
2739
        
257✔
2740
        # If the orientation is reversed, we should use the cDNA start and a '-' offset
2741
        $cdna_position = (
2742
          $strand >= 0 ? 
210✔
2743
          $self->_exon_cdna_end($prev_exon, $transcript).'+' : 
2744
          $self->_exon_cdna_start($prev_exon, $transcript).'-'
2745
        ).$updist;
2746
      }
2747
      # Else if downstream is shortest...
210✔
2748
      else {
2749
        # If the orientation is reversed, we should use the cDNA end and a '+' offset
2750
        $cdna_position = (
2751
          $strand >= 0 ?
2752
          $self->_exon_cdna_start($exon, $transcript).'-' : 
2753
          $self->_exon_cdna_end($exon, $transcript).'+'
2754
        ).$downdist;
47✔
2755
      }
2756

47✔
2757
      last; ## last exon checked
47✔
2758
    }
47✔
2759
  }
47✔
2760

2761
  ## this should not happen; potential refseq oddness
2762
  return undef unless $cdna_position; 
47✔
2763
 
2764
  # Shift the position to make it relative to the start & stop codons
2765
  my $start_codon  =  $transcript->cdna_coding_start();
25✔
2766
  my $stop_codon   =  $transcript->cdna_coding_end();
2767

2768
  # Disassemble the cDNA coordinate into the exon and intron parts
2769
  ### just built this now taking it appart again
2770
  my ($cdna_coord, $intron_offset) = $cdna_position =~ m/([0-9]+)([\+\-][0-9]+)?/;
2771
  
2772

2773
  # Start by correcting for the stop codon
2774
  if (defined($stop_codon) ){
22✔
2775

2776
    if($cdna_coord > $stop_codon) {
2777
      # Get the offset from the stop codon
2778
      $cdna_coord -= $stop_codon;
2779
      # Prepend a * to indicate the position is in the 3' UTR
2780
      $cdna_coord = '*' . $cdna_coord;
2781
    }
47✔
2782
    elsif ( $cdna_coord eq $stop_codon && defined $intron_offset) {
2783
      $intron_offset =~ s/\+//g;
2784
      $cdna_coord ='' ;
2785

2786
      # Prepend a * to indicate the position is in the 3' UTR
257✔
2787
      $cdna_coord = '*' . $cdna_coord;
2788
    }
2789
  }
257✔
2790
  if (defined($start_codon) && $cdna_coord  !~/\*/) {
257✔
2791
    
2792
    # If the position is beyond the start codon, add 1 to get the correct offset
2793
    $cdna_coord += ($cdna_coord >= $start_codon);
2794
    # Subtract the position of the start codon
257✔
2795
    $cdna_coord -= $start_codon;
2796
  }
2797
  else{
2798
          print "Checking non-coding transcript\n" if $DEBUG==1;
257✔
2799
  }
2800

242✔
2801
  # Re-assemble the cDNA position  [ return exon num & offset & direction for intron eg. 142+363] 
2802
  $cdna_position = $cdna_coord . (defined($intron_offset) ? $intron_offset : '');
11✔
2803

2804
  return $cdna_position;
11✔
2805
}
2806

2807
# $exon->cdna_start doesn't cache
×
2808
# so use our own method that does
×
2809
sub _exon_cdna_start {
2810
  my ($self, $exon, $tr) = @_;
2811

×
2812
  my $tr_stable_id = $tr->stable_id;
2813
  my $fc = $exon->{_variation_effect_feature_cache}->{$tr_stable_id} ||= {};
2814

257✔
2815
  if(!exists($fc->{_cdna_start})) {
2816
    $fc->{_cdna_start} = $exon->cdna_start($tr);
2817
  }
231✔
2818

2819
  return $fc->{_cdna_start};
231✔
2820
}
2821

2822
sub _exon_cdna_end {
26✔
2823
  my ($self, $exon, $tr) = @_;
2824

2825
  my $tr_stable_id = $tr->stable_id;
2826
  my $fc = $exon->{_variation_effect_feature_cache}->{$tr_stable_id} ||= {};
257✔
2827

2828
  if(!exists($fc->{_cdna_end})) {
257✔
2829
    $fc->{_cdna_end} = $exon->cdna_end($tr);
2830
  }
2831

2832
  return $fc->{_cdna_end};
2833
}
2834

228✔
2835
# same for $transcript->feature_Slice
2836
# need to be careful here in case the transcript has moved slice
228✔
2837
# you never know!
228✔
2838
sub _transcript_feature_Slice {
2839
  my ($self, $tr) = @_;
228✔
2840

170✔
2841
  my $fc = $tr->{_variation_effect_feature_cache} ||= {};
2842

2843
  # check that we haven't moved slice
228✔
2844
  my $curr_slice_ref = sprintf('%s', $tr->slice());
2845
  my $prev_slice_ref = $fc->{slice_ref};
2846

2847
  if(
29✔
2848
    !exists($fc->{feature_Slice}) ||
2849
    $fc->{slice_ref} && $fc->{slice_ref} ne $curr_slice_ref
29✔
2850
  ) {
29✔
2851

2852
    # log the reference of this slice
29✔
2853
    $fc->{slice_ref} = $curr_slice_ref;
25✔
2854
    $fc->{feature_Slice} = $tr->feature_Slice();
2855
  }
2856

29✔
2857
  return $fc->{feature_Slice};
2858
}
2859

2860

2861
1;
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc