• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ensembl / ensembl-variation / #617596958

09 Feb 2024 05:01PM UTC coverage: 83.307%. First build
#617596958

travis-ci

22667 of 27209 relevant lines covered (83.31%)

1428.52 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

6.47
/modules/Bio/EnsEMBL/Variation/Pipeline/TranscriptEffect.pm
1
=head1 LICENSE
2

3
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4
Copyright [2016-2025] EMBL-European Bioinformatics Institute
5

6
Licensed under the Apache License, Version 2.0 (the "License");
7
you may not use this file except in compliance with the License.
8
You may obtain a copy of the License at
9

10
   http://www.apache.org/licenses/LICENSE-2.0
11

12
Unless required by applicable law or agreed to in writing, software
13
distributed under the License is distributed on an "AS IS" BASIS,
14
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
See the License for the specific language governing permissions and
16
limitations under the License.
17

18
=cut
19

20

21
=head1 CONTACT
22

23
 Please email comments or questions to the public Ensembl
24
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25

26
 Questions may also be sent to the Ensembl help desk at
27
 <http://www.ensembl.org/Help/Contact>.
28

29
=cut
30

31
package Bio::EnsEMBL::Variation::Pipeline::TranscriptEffect;
32

33
use strict;
1✔
34
use warnings;
1✔
35

36
use Bio::EnsEMBL::Variation::TranscriptVariation;
1✔
37
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap);
1✔
38
use Bio::EnsEMBL::Variation::Utils::FastaSequence qw(setup_fasta);
1✔
39

40
use FileHandle;
1✔
41
use ImportUtils qw(load);
1✔
42
use Fcntl qw(:flock SEEK_END);
1✔
43

44
use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess);
1✔
45

46
my $DEBUG   = 0;
47

48
sub run {
49
  my $self = shift;
×
50

51
  my $disambiguate_sn_alleles = $self->param('disambiguate_single_nucleotide_alleles');
×
52
  my $mtmp = $self->param('mtmp_table');
×
53
  my $max_distance = $self->param('max_distance');
×
54
  my $by_transcript = ($self->param('analysis') eq 'by_transcript') ? 1 : 0;
×
55
  my $prevent_shifting = $self->param('prevent_shifting');
×
56

57

58
  my $variations_to_include;
×
59
  # if (my $vars = $self->param('variations_to_include')) {
60
  #   # turn the list of variation names into a hash to speed up checking
61
  #   $variations_to_include = { map { $_ => 1 } @$vars };
62
  # }
63

64
  # clear the registry here
65
  # this hopefully prevents any sequence caching issues
66
  # overhanging from previous jobs executed in the same hive process
67
  Bio::EnsEMBL::Registry->clear();
×
68

69
  my $core_dba = $self->get_species_adaptor('core');
×
70
  $core_dba->dbc->reconnect_when_lost(1);
×
71

72
  my $var_dba = $self->get_species_adaptor('variation');
×
73
  $var_dba->dbc->reconnect_when_lost(1);
×
74

75
  my $dbc = $var_dba->dbc();
×
76

77
  my $sa = $core_dba->get_SliceAdaptor;
×
78
  
79
  my $tva = $var_dba->get_TranscriptVariationAdaptor;
×
80
  $tva->db->include_failed_variations(1);
×
81

82
  if((my $fasta = $self->param('fasta')) && !$Bio::EnsEMBL::Slice::_fasta_redefined && !$Bio::EnsEMBL::Slice::fasta_db) {
×
83
    # we need to find the assembly version to tell it about PARs
84
    my ($highest_cs) = @{$core_dba->get_CoordSystemAdaptor->fetch_all()};
×
85
    my $assembly = $highest_cs->version();
×
86
    setup_fasta(-FASTA => $fasta, -ASSEMBLY => $assembly);
×
87
  }
88
  else {
89
    # set seq cache size higher
90
    # this prevents repeated DB lookups for sequence for HGVS e.g. in long introns
91
    # to do this we fetch a sequence adaptor
92
    my $seq_ad = $core_dba->get_SequenceAdaptor;
×
93
    # then reset its cache
94
    # the first param passed to this method is the "chunk power"
95
    # essentially the length in bp of cached sequence will be 2 ^ chunk_power
96
    $seq_ad->_init_seq_instance($seq_ad->chunk_power + 2);
×
97
  }
98

99
  my $slice;
×
100
  my $stable_id;
×
101
  my @transcripts = ();
×
102

103
  if ($by_transcript) {
×
104
    my $transcript_stable_id = $self->param('transcript_stable_id');
×
105
    $stable_id = $transcript_stable_id;
×
106
    my $ta = $core_dba->get_TranscriptAdaptor;
×
107
    my $transcript = $ta->fetch_by_stable_id($transcript_stable_id) 
×
108
      or die "failed to fetch transcript for stable id: $transcript_stable_id";
109

×
110
    $slice = $sa->fetch_by_transcript_stable_id(
111
      $transcript_stable_id, 
112
      $max_distance
113
    ) or die "failed to get slice around transcript: $transcript_stable_id";
114
    # call seq here to help cache
×
115
    $slice->seq();
×
116
    $transcript = $transcript->transfer($slice);
×
117
    push @transcripts, $transcript;
118
  } 
×
119
  
×
120
  else {
×
121
    my $gene_stable_id =  $self->param('gene_stable_id');
×
122
    $stable_id = $gene_stable_id;
123
    my $ga = $core_dba->get_GeneAdaptor;
×
124
    my $gene = $ga->fetch_by_stable_id($gene_stable_id) 
125
      or die "failed to fetch gene for stable id: $gene_stable_id";
126
    $slice = $sa->fetch_by_gene_stable_id(
127
      $gene_stable_id, 
128
      $max_distance
×
129
    ) or die "failed to get slice around gene: $gene_stable_id";
×
130
    # call seq here to help cache
×
131
    $slice->seq();
132
    $gene = $gene->transfer($slice);
133
    push @transcripts, @{ $gene->get_all_Transcripts };
×
134
  }
×
135

×
136
  my $vfa = $var_dba->get_VariationFeatureAdaptor();
×
137
  my @vfs = (
138
    @{ $vfa->fetch_all_by_Slice_SO_terms($slice) },
×
139
    @{ $vfa->fetch_all_somatic_by_Slice_SO_terms($slice) }
140
  );
141
  my $web_index_files_dir = $self->get_files_dir($stable_id, 'web_index');
×
142

×
143
  # get a fh for the hgvs file too
144
  my $hgvs_fh = FileHandle->new();
×
145
  $hgvs_fh->open(">" . $web_index_files_dir . "/$stable_id\_variation_hgvs.txt") or die "Cannot open dump file " . $web_index_files_dir . "/$stable_id\_variation_hgvs.txt: $!";
×
146

147
  my $hgvs_by_var = {};
×
148
  my $var_count = 0;
149

×
150
  my $files = $self->get_dump_files($stable_id, $tva);
151

152
  my %biotypes_to_skip = (
153
    'lncRNA' => 1,
154
    'processed_pseudogene' => 1,
155
    'unprocessed_pseudogene' => 1,
×
156
  );
157

×
158
  for my $transcript (@transcripts) {
×
159

×
160
    my $biotype = $transcript->biotype;
×
161
    my $is_mane = $transcript->is_mane(); # grch38
×
162
    my $is_canonical = $transcript->is_canonical(); # grch37
163
    my $stable_id = $transcript->stable_id;
×
164
    my @vf_ids;
165

×
166
    #Skip transcript if running in gc primary mode and transcript isn't part of that set.
×
167
    next if ($self->param('gencode_primary') and !$transcript->gencode_primary);
168

169

×
170
    for my $vf(@vfs) {
171

×
172
      if (defined $variations_to_include) {
173
        next unless $variations_to_include->{$vf->variation_name};
174
      }
175

176
      next unless overlap($vf->start, $vf->end, $transcript->start - $max_distance, $transcript->end + $max_distance);
177

178
      my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new(
179
        -transcript     => $transcript,
180
        -variation_feature  => $vf,
181
        -adaptor      => $tva,
×
182
        -disambiguate_single_nucleotide_alleles => $disambiguate_sn_alleles,
183
        -no_transfer    => 1,
×
184
        -no_shift        => $prevent_shifting, 
185
      );
×
186

187
      # if the variation has no effect on the transcript $tv will be undef
188
      if ($tv) {#} && ( scalar(@{ $tv->consequence_type }) > 0) ) {
189

190
              next if (!scalar(@{ $tv->consequence_type }) && ($tv->distance_to_transcript > $max_distance));
191

192
        push @vf_ids, $vf->dbID();
193

194
        # store now or save to store later? Uncomment out the behaviour you want
195
        # save to store later uses more memory but means you don't have to sort human TV after the run
196
        
197
        ## BEHAVIOUR 1: store to DB now
198
        ## comment: slow as MySQL does INSERT DELAYED
199
        # $tva->store($tv, $mtmp);
200
        ## end block
201

202
        ## BEHAVIOUR 2: store to memory
×
203
        ## comment: seems like MySQL doesn't work very quickly using table locks required and also still does one INSERT per row
×
204
        # push @write_data, @{$tva->_get_write_data($tv)};
×
205
        ## end block
206
        
×
207
        ## BEHAVIOUR 3: store to tmp file
208
        ## comment: seems to be the fastest as it uses LOAD DATA which inherently locks anyway and does only one MySQL command per gene
×
209
        my $data = $tva->_get_write_data($tv);
×
210
        my $tv_fh = $files->{transcript_variation}->{fh};
211
        print $tv_fh join("\t", map {defined($_) ? $_ : '\N'} @$_)."\n" for @$data;
212

×
213
        if($mtmp) {
214

×
215
          my $mtmp_data = $tva->_get_mtmp_write_data_from_tv_write_data($data);
×
216
          my $mtmp_fh = $files->{MTMP_transcript_variation}->{fh};
217
          
218
          # Check species
×
219
          my $species = $self->param('species');
×
220
          # MANE is only available for human grch38
221
          my $is_human = $species =~ /homo_sapiens|human/ ? 1 : 0;
222
          my $write_mtmp = ($is_human && $is_mane) || !$is_human ? 1 : 0;
×
223

×
224
          # Human grch37 has canonical transcripts
225
          if($is_human && lc $self->param('assembly') =~ /37|grch37/ && $is_canonical) {
226
            $write_mtmp = 1;
227
          }
228

229
          unless($biotypes_to_skip{$biotype} || !$write_mtmp){
230
            print $mtmp_fh join("\t", map {defined($_) ? $_ : '\N'} @$_)."\n" for @$mtmp_data;
231
          }
×
232
        }
233

×
234
        ## end block
235

×
236
      
×
237
        ## populate tables for website index building
×
238
        my $var_id = $vf->get_Variation_dbID();
239

×
240
        for my $allele (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) {
×
241

×
242
          next unless defined $allele->hgvs_transcript();
243
          my $hgvs_transcript = (split/\:/, $allele->hgvs_transcript())[1];
244
          $hgvs_by_var->{$var_id}->{$hgvs_transcript} = 1;
245
          
246
          next unless defined $allele->hgvs_protein();
×
247
          my $hgvs_protein  = (split/\:/, $allele->hgvs_protein())[1];
×
248
          $hgvs_by_var->{$var_id}->{$hgvs_protein} = 1 if defined $hgvs_protein && $hgvs_protein =~/^p/; ## don't store synonymous
×
249
        }
×
250

251
        # dump out these hashes periodically to stop memory exploding
252
        # will lead to more duplicates in the output file but better that than job failing
253
        if(++$var_count > 100000) {
254
          $self->dump_hgvs_var($hgvs_by_var, $hgvs_fh);
255
          $var_count = 0;
×
256
          $hgvs_by_var = {};
×
257
        }
258
      }                       
×
259
    }
260

×
261
    # Delete Updated
×
262
     if (-e $self->param('update_diff')){
×
263
        my @chunk_delete;
×
264

265
        push @chunk_delete, [ splice @vf_ids, 0, 500 ]  while @vf_ids;
266

267
        for (@chunk_delete){
268
          my $joined_vf_ids = join(',', @$_);
269
          next if $joined_vf_ids == "";
×
270
          $dbc->do(qq{
271
                    DELETE FROM  transcript_variation
272
                    WHERE   variation_feature_id IN ($joined_vf_ids)
273
                    AND     feature_stable_id = "$stable_id"
274
          });
275

276
          $dbc->do(qq{
277
                    DELETE FROM  MTMP_transcript_variation
278
                    WHERE   variation_feature_id IN ($joined_vf_ids)
279
                    AND     feature_stable_id = "$stable_id"
280
          }) if($mtmp);
281
        }
282
    }
283
  }
284

285
  ## uncomment this if using BEHAVIOUR 2 above
286
  # if(@write_data) {
287
  #   $var_dba->dbc->do("LOCK TABLES transcript_variation WRITE");
288
  #   $tva->_store_write_data(\@write_data, 1);
289
  #   $var_dba->dbc->do("UNLOCK TABLES");
290

291
  #   if($mtmp) {
292
  #     $var_dba->dbc->do("LOCK TABLES MTMP_transcript_variation WRITE");
293
  #     $tva->_store_mtmp_write_data($tva->_get_mtmp_write_data_from_tv_write_data(\@write_data), 1) if $mtmp;
294
  #     $var_dba->dbc->do("UNLOCK TABLES");
×
295
  #   }
×
296
  # }
×
297
  ## end block
×
298

×
299
  ## uncomment this if using BEHAVIOUR 3 above
×
300
  ## LOADING HAS BEEN MOVED TO FinishTranscriptEffect.pm if run in by_transcript analysis mode
×
301
  my $tmpdir = $self->get_files_dir($stable_id, 'transcript_effect');
302
  foreach my $table(keys %$files) {
303
    $files->{$table}->{fh}->close();
304
    if (!$by_transcript) {
305
      $ImportUtils::TMP_DIR = $tmpdir;
306
      $ImportUtils::TMP_FILE = $files->{$table}->{filename};
×
307
      load($var_dba->dbc, ($table, @{$files->{$table}->{cols}}));
308
    }
×
309
  }
310
  ## end block
×
311

312
  # dump HGVS stubs to file for web index
×
313
  $self->dump_hgvs_var($hgvs_by_var, $hgvs_fh);
314

315
  $hgvs_fh->close();
316

×
317
  print STDERR "All done\n" if $DEBUG;
×
318

319
  return;
320
}
321

×
322
sub write_output {
323
  my $self = shift;
×
324
  return;
×
325
}
×
326

327
sub dump_hgvs_var {
328
  my ($self, $hgvs_by_var, $fh) = @_;
329

330
  if($hgvs_by_var) {
331
    for my $var_id(keys %$hgvs_by_var) {
×
332
      print $fh "$var_id\t$_\n" for keys %{$hgvs_by_var->{$var_id}};
×
333
    }
×
334
  }
335
}
×
336

337
sub get_dump_files {
338
  my $self = shift;
339
  my $stable_id = shift;
×
340
  my $tva = shift;
341
  # initialise a hash of files
×
342
  my $files = {
×
343
    transcript_variation      => { 'cols' => [$tva->_write_columns],      },
×
344
    MTMP_transcript_variation => { 'cols' => [$tva->_mtmp_write_columns], },
×
345
  };
346
  my $tmpdir = $self->get_files_dir($stable_id, 'transcript_effect');
×
347
  # create file handles
×
348
  for my $table(keys %$files) {
×
349
    my $hash = $files->{$table};
350
    $hash->{filename} = sprintf('%s_%s.txt', $stable_id, $table);
×
351
    $hash->{filepath} = sprintf('%s/%s', $tmpdir, $hash->{filename});
352
    
353
    my $fh = FileHandle->new();
354
    $fh->open(">".$hash->{filepath}) or die("ERROR: Could not write to ".$hash->{filepath});
355
    $hash->{fh} = $fh;
356
  }
357
  return $files;
358
}
359

360

361
1;
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc