• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ensembl / ensembl-variation / 5104

pending completion
5104

push

travis-ci-com

web-flow
Merge pull request #922 from nakib103/update_gerp

Update gerp in VCF config

22251 of 26761 relevant lines covered (83.15%)

1446.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

6.62
/modules/Bio/EnsEMBL/Variation/Pipeline/TranscriptEffect.pm
1
=head1 LICENSE
2

3
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4
Copyright [2016-2022] EMBL-European Bioinformatics Institute
5

6
Licensed under the Apache License, Version 2.0 (the "License");
7
you may not use this file except in compliance with the License.
8
You may obtain a copy of the License at
9

10
   http://www.apache.org/licenses/LICENSE-2.0
11

12
Unless required by applicable law or agreed to in writing, software
13
distributed under the License is distributed on an "AS IS" BASIS,
14
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
See the License for the specific language governing permissions and
16
limitations under the License.
17

18
=cut
19

20

21
=head1 CONTACT
22

23
 Please email comments or questions to the public Ensembl
24
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25

26
 Questions may also be sent to the Ensembl help desk at
27
 <http://www.ensembl.org/Help/Contact>.
28

29
=cut
30

31
package Bio::EnsEMBL::Variation::Pipeline::TranscriptEffect;
32

33
use strict;
1✔
34
use warnings;
1✔
35

36
use Bio::EnsEMBL::Variation::TranscriptVariation;
1✔
37
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap);
1✔
38
use Bio::EnsEMBL::Variation::Utils::FastaSequence qw(setup_fasta);
1✔
39

40
use FileHandle;
1✔
41
use ImportUtils qw(load);
1✔
42
use Fcntl qw(:flock SEEK_END);
1✔
43

44
use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess);
1✔
45

46
my $DEBUG   = 0;
47

48
sub run {
  my $self = shift;

  ## Pipeline parameters controlling how consequences are computed and stored.
  my $disambiguate_sn_alleles = $self->param('disambiguate_single_nucleotide_alleles');
  my $mtmp                    = $self->param('mtmp_table');
  my $max_distance            = $self->param('max_distance');
  my $by_transcript           = ($self->param('analysis') eq 'by_transcript') ? 1 : 0;
  my $prevent_shifting        = $self->param('prevent_shifting');

  my $variations_to_include;
  # if (my $vars = $self->param('variations_to_include')) {
  #   # turn the list of variation names into a hash to speed up checking
  #   $variations_to_include = { map { $_ => 1 } @$vars };
  # }

  # clear the registry here
  # this hopefully prevents any sequence caching issues
  # overhanging from previous jobs executed in the same hive process
  Bio::EnsEMBL::Registry->clear();

  my $core_dba = $self->get_species_adaptor('core');
  $core_dba->dbc->reconnect_when_lost(1);

  my $var_dba = $self->get_species_adaptor('variation');
  $var_dba->dbc->reconnect_when_lost(1);

  my $dbc = $var_dba->dbc();

  my $sa = $core_dba->get_SliceAdaptor;

  my $tva = $var_dba->get_TranscriptVariationAdaptor;
  $tva->db->include_failed_variations(1);

  if ((my $fasta = $self->param('fasta')) && !$Bio::EnsEMBL::Slice::_fasta_redefined && !$Bio::EnsEMBL::Slice::fasta_db) {
    # we need to find the assembly version to tell it about PARs
    my ($highest_cs) = @{$core_dba->get_CoordSystemAdaptor->fetch_all()};
    my $assembly = $highest_cs->version();
    setup_fasta(-FASTA => $fasta, -ASSEMBLY => $assembly);
  }
  else {
    # set seq cache size higher
    # this prevents repeated DB lookups for sequence for HGVS e.g. in long introns
    # to do this we fetch a sequence adaptor
    my $seq_ad = $core_dba->get_SequenceAdaptor;
    # then reset its cache
    # the first param passed to this method is the "chunk power"
    # essentially the length in bp of cached sequence will be 2 ^ chunk_power
    $seq_ad->_init_seq_instance($seq_ad->chunk_power + 2);
  }

  my $slice;
  my $stable_id;
  my @transcripts = ();

  if ($by_transcript) {
    # one job per transcript: fetch it and a padded slice around it
    my $transcript_stable_id = $self->param('transcript_stable_id');
    $stable_id = $transcript_stable_id;
    my $ta = $core_dba->get_TranscriptAdaptor;
    my $transcript = $ta->fetch_by_stable_id($transcript_stable_id)
      or die "failed to fetch transcript for stable id: $transcript_stable_id";
    $slice = $sa->fetch_by_transcript_stable_id(
      $transcript_stable_id,
      $max_distance
    ) or die "failed to get slice around transcript: $transcript_stable_id";
    # call seq here to help cache
    $slice->seq();
    $transcript = $transcript->transfer($slice);
    push @transcripts, $transcript;
  } else {
    # one job per gene: process all of its transcripts
    my $gene_stable_id = $self->param('gene_stable_id');
    $stable_id = $gene_stable_id;
    my $ga = $core_dba->get_GeneAdaptor;
    my $gene = $ga->fetch_by_stable_id($gene_stable_id)
      or die "failed to fetch gene for stable id: $gene_stable_id";
    $slice = $sa->fetch_by_gene_stable_id(
      $gene_stable_id,
      $max_distance
    ) or die "failed to get slice around gene: $gene_stable_id";
    # call seq here to help cache
    $slice->seq();
    $gene = $gene->transfer($slice);
    push @transcripts, @{ $gene->get_all_Transcripts };
  }

  my $vfa = $var_dba->get_VariationFeatureAdaptor();
  my @vfs = (
    @{ $vfa->fetch_all_by_Slice_SO_terms($slice) },
    @{ $vfa->fetch_all_somatic_by_Slice_SO_terms($slice) }
  );
  my $web_index_files_dir = $self->get_files_dir($stable_id, 'web_index');

  # get a fh for the hgvs file too
  my $hgvs_fh = FileHandle->new();
  $hgvs_fh->open(">" . $web_index_files_dir . "/$stable_id\_variation_hgvs.txt") or die "Cannot open dump file " . $web_index_files_dir . "/$stable_id\_variation_hgvs.txt: $!";

  my $hgvs_by_var = {};
  my $var_count = 0;

  my $files = $self->get_dump_files($stable_id, $tva);

  # transcripts of these biotypes are excluded from the MTMP table
  my %biotypes_to_skip = (
    'lncRNA' => 1,
    'processed_pseudogene' => 1,
    'unprocessed_pseudogene' => 1,
  );

  # Species check is loop-invariant, so compute it once here rather than
  # once per variation feature (was recomputed inside the inner loop).
  my $species  = $self->param('species');
  my $is_human = (defined $species && $species =~ /homo_sapiens|human/) ? 1 : 0;

  for my $transcript (@transcripts) {

    my $biotype = $transcript->biotype;
    my $is_mane = $transcript->is_mane();
    # NB: deliberately shadows the outer gene/transcript $stable_id — the
    # delete-by-feature SQL below must use the transcript's own stable id
    my $stable_id = $transcript->stable_id;
    my @vf_ids;

    for my $vf (@vfs) {

      if (defined $variations_to_include) {
        next unless $variations_to_include->{$vf->variation_name};
      }

      next unless overlap($vf->start, $vf->end, $transcript->start - $max_distance, $transcript->end + $max_distance);

      my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new(
        -transcript     => $transcript,
        -variation_feature  => $vf,
        -adaptor      => $tva,
        -disambiguate_single_nucleotide_alleles => $disambiguate_sn_alleles,
        -no_transfer    => 1,
        -no_shift        => $prevent_shifting,
      );

      # if the variation has no effect on the transcript $tv will be undef
      if ($tv) {

        next if (!scalar(@{ $tv->consequence_type }) && ($tv->distance_to_transcript > $max_distance));

        push @vf_ids, $vf->dbID();

        # store now or save to store later? Uncomment out the behaviour you want
        # save to store later uses more memory but means you don't have to sort human TV after the run

        ## BEHAVIOUR 1: store to DB now
        ## comment: slow as MySQL does INSERT DELAYED
        # $tva->store($tv, $mtmp);
        ## end block

        ## BEHAVIOUR 2: store to memory
        ## comment: seems like MySQL doesn't work very quickly using table locks required and also still does one INSERT per row
        # push @write_data, @{$tva->_get_write_data($tv)};
        ## end block

        ## BEHAVIOUR 3: store to tmp file
        ## comment: seems to be the fastest as it uses LOAD DATA which inherently locks anyway and does only one MySQL command per gene
        my $data = $tva->_get_write_data($tv);
        my $tv_fh = $files->{transcript_variation}->{fh};
        print $tv_fh join("\t", map {defined($_) ? $_ : '\N'} @$_)."\n" for @$data;

        if ($mtmp) {

          my $mtmp_data = $tva->_get_mtmp_write_data_from_tv_write_data($data);
          my $mtmp_fh = $files->{MTMP_transcript_variation}->{fh};

          # MANE is only available for human: for human write only MANE
          # transcripts; for all other species write everything
          my $write_mtmp = ($is_human && $is_mane) || !$is_human ? 1 : 0;

          unless ($biotypes_to_skip{$biotype} || !$write_mtmp) {
            print $mtmp_fh join("\t", map {defined($_) ? $_ : '\N'} @$_)."\n" for @$mtmp_data;
          }
        }
        ## end block

        ## populate tables for website index building
        my $var_id = $vf->get_Variation_dbID();

        for my $allele (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) {

          next unless defined $allele->hgvs_transcript();
          my $hgvs_transcript = (split /\:/, $allele->hgvs_transcript())[1];
          $hgvs_by_var->{$var_id}->{$hgvs_transcript} = 1;

          next unless defined $allele->hgvs_protein();
          my $hgvs_protein = (split /\:/, $allele->hgvs_protein())[1];
          $hgvs_by_var->{$var_id}->{$hgvs_protein} = 1 if defined $hgvs_protein && $hgvs_protein =~ /^p/; ## don't store synonymous
        }

        # dump out these hashes periodically to stop memory exploding
        # will lead to more duplicates in the output file but better that than job failing
        if (++$var_count > 100000) {
          $self->dump_hgvs_var($hgvs_by_var, $hgvs_fh);
          $var_count = 0;
          $hgvs_by_var = {};
        }
      }
    }

    # Delete rows superseded by this run when doing an incremental update.
    # Guard against an undef param before the -e file test (avoids an
    # "uninitialized value" warning when update_diff is not set).
    if ($self->param('update_diff') && -e $self->param('update_diff')) {
      my @chunk_delete;

      # delete in chunks of 500 ids to keep the IN () lists bounded
      push @chunk_delete, [ splice @vf_ids, 0, 500 ] while @vf_ids;

      for my $chunk (@chunk_delete) {
        # fixed: was `next if $joined_vf_ids == ""` — a numeric comparison
        # against a string, which warns and only skips by accident
        next unless @$chunk;
        my $joined_vf_ids = join(',', @$chunk);
        $dbc->do(qq{
                  DELETE FROM  transcript_variation
                  WHERE   variation_feature_id IN ($joined_vf_ids)
                  AND     feature_stable_id = "$stable_id"
        });

        $dbc->do(qq{
                  DELETE FROM  MTMP_transcript_variation
                  WHERE   variation_feature_id IN ($joined_vf_ids)
                  AND     feature_stable_id = "$stable_id"
        }) if ($mtmp);
      }
    }
  }

  ## uncomment this if using BEHAVIOUR 2 above
  # if(@write_data) {
  #   $var_dba->dbc->do("LOCK TABLES transcript_variation WRITE");
  #   $tva->_store_write_data(\@write_data, 1);
  #   $var_dba->dbc->do("UNLOCK TABLES");
  #
  #   if($mtmp) {
  #     $var_dba->dbc->do("LOCK TABLES MTMP_transcript_variation WRITE");
  #     $tva->_store_mtmp_write_data($tva->_get_mtmp_write_data_from_tv_write_data(\@write_data), 1) if $mtmp;
  #     $var_dba->dbc->do("UNLOCK TABLES");
  #   }
  # }
  ## end block

  ## uncomment this if using BEHAVIOUR 3 above
  ## LOADING HAS BEEN MOVED TO FinishTranscriptEffect.pm if run in by_transcript analysis mode
  my $tmpdir = $self->get_files_dir($stable_id, 'transcript_effect');
  foreach my $table (keys %$files) {
    $files->{$table}->{fh}->close();
    if (!$by_transcript) {
      $ImportUtils::TMP_DIR  = $tmpdir;
      # NOTE(review): the hash key here was garbled in the source; `filename`
      # matches the key populated by get_dump_files — confirm against upstream
      $ImportUtils::TMP_FILE = $files->{$table}->{filename};
      load($var_dba->dbc, ($table, @{$files->{$table}->{cols}}));
    }
  }
  ## end block

  # dump HGVS stubs to file for web index
  $self->dump_hgvs_var($hgvs_by_var, $hgvs_fh);

  $hgvs_fh->close();

  print STDERR "All done\n" if $DEBUG;

  return;
}
308

309
# No dataflow is passed downstream: run() writes its results to dump files
# and the database directly, so this hive hook is intentionally a no-op.
sub write_output {
  my ($self) = @_;
  return;
}
313

314
# Write accumulated HGVS notations to the web-index file handle.
# $hgvs_by_var maps variation dbID => { hgvs_string => 1, ... }; each
# (dbID, hgvs) pair is emitted as one tab-separated line. Does nothing
# when the hash ref is false/undef.
sub dump_hgvs_var {
  my ($self, $hgvs_by_var, $fh) = @_;

  return unless $hgvs_by_var;

  foreach my $var_id (keys %$hgvs_by_var) {
    foreach my $hgvs (keys %{ $hgvs_by_var->{$var_id} }) {
      print $fh "$var_id\t$hgvs\n";
    }
  }
}
323

324
# Create the per-gene/per-transcript dump files used by run()'s BEHAVIOUR 3
# (LOAD DATA) path.
#
# Arguments:
#   $stable_id - gene or transcript stable id, used in the file names
#   $tva       - TranscriptVariationAdaptor, provides the column lists
#
# Returns a hash ref keyed by table name (transcript_variation,
# MTMP_transcript_variation); each entry holds:
#   cols     - array ref of column names for ImportUtils::load
#   filename - bare dump file name
#   filepath - full path of the dump file in the tmp dir
#   fh       - an open write FileHandle for the dump file
sub get_dump_files {
  my ($self, $stable_id, $tva) = @_;

  # initialise a hash of files
  my $files = {
    transcript_variation      => { 'cols' => [$tva->_write_columns],      },
    MTMP_transcript_variation => { 'cols' => [$tva->_mtmp_write_columns], },
  };

  my $tmpdir = $self->get_files_dir($stable_id, 'transcript_effect');

  # create file handles
  for my $table (keys %$files) {
    my $hash = $files->{$table};
    # NOTE(review): this hash key was garbled in the source; `filename` is
    # implied by the filepath sprintf below and by ImportUtils::TMP_FILE usage
    # in run() — confirm against upstream
    $hash->{filename} = sprintf('%s_%s.txt', $stable_id, $table);
    $hash->{filepath} = sprintf('%s/%s', $tmpdir, $hash->{filename});

    my $fh = FileHandle->new();
    $fh->open(">" . $hash->{filepath}) or die("ERROR: Could not write to " . $hash->{filepath});
    $hash->{fh} = $fh;
  }

  return $files;
}
346

347

348
1;
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc