• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ensembl / ensembl-variation / #608922749

30 Aug 2023 04:16PM UTC coverage: 83.169%. First build
#608922749

Pull #1029

travis-ci

Pull Request #1029: Fetch multiple SV by name list

30 of 30 new or added lines in 2 files covered. (100.0%)

22453 of 26997 relevant lines covered (83.17%)

1436.3 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

6.62
/modules/Bio/EnsEMBL/Variation/Pipeline/TranscriptEffect.pm
1
=head1 LICENSE
2

3
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4
Copyright [2016-2025] EMBL-European Bioinformatics Institute
5

6
Licensed under the Apache License, Version 2.0 (the "License");
7
you may not use this file except in compliance with the License.
8
You may obtain a copy of the License at
9

10
   http://www.apache.org/licenses/LICENSE-2.0
11

12
Unless required by applicable law or agreed to in writing, software
13
distributed under the License is distributed on an "AS IS" BASIS,
14
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
See the License for the specific language governing permissions and
16
limitations under the License.
17

18
=cut
19

20

21
=head1 CONTACT
22

23
 Please email comments or questions to the public Ensembl
24
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25

26
 Questions may also be sent to the Ensembl help desk at
27
 <http://www.ensembl.org/Help/Contact>.
28

29
=cut
30

31
package Bio::EnsEMBL::Variation::Pipeline::TranscriptEffect;
32

33
use strict;
1✔
34
use warnings;
1✔
35

36
use Bio::EnsEMBL::Variation::TranscriptVariation;
1✔
37
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap);
1✔
38
use Bio::EnsEMBL::Variation::Utils::FastaSequence qw(setup_fasta);
1✔
39

40
use FileHandle;
1✔
41
use ImportUtils qw(load);
1✔
42
use Fcntl qw(:flock SEEK_END);
1✔
43

44
use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess);
1✔
45

46
my $DEBUG   = 0;
47

48
sub run {
49
  my $self = shift;
×
50

51
  my $disambiguate_sn_alleles = $self->param('disambiguate_single_nucleotide_alleles');
×
52
  my $mtmp = $self->param('mtmp_table');
×
53
  my $max_distance = $self->param('max_distance');
×
54
  my $by_transcript = ($self->param('analysis') eq 'by_transcript') ? 1 : 0;
×
55
  my $prevent_shifting = $self->param('prevent_shifting');
×
56

57

58
  my $variations_to_include;
×
59
  # if (my $vars = $self->param('variations_to_include')) {
60
  #   # turn the list of variation names into a hash to speed up checking
61
  #   $variations_to_include = { map { $_ => 1 } @$vars };
62
  # }
63

64
  # clear the registry here
65
  # this hopefully prevents any sequence caching issues
66
  # overhanging from previous jobs executed in the same hive process
67
  Bio::EnsEMBL::Registry->clear();
×
68

69
  my $core_dba = $self->get_species_adaptor('core');
×
70
  $core_dba->dbc->reconnect_when_lost(1);
×
71

72
  my $var_dba = $self->get_species_adaptor('variation');
×
73
  $var_dba->dbc->reconnect_when_lost(1);
×
74

75
  my $dbc = $var_dba->dbc();
×
76

77
  my $sa = $core_dba->get_SliceAdaptor;
×
78
  
79
  my $tva = $var_dba->get_TranscriptVariationAdaptor;
×
80
  $tva->db->include_failed_variations(1);
×
81

82
  if((my $fasta = $self->param('fasta')) && !$Bio::EnsEMBL::Slice::_fasta_redefined && !$Bio::EnsEMBL::Slice::fasta_db) {
×
83
    # we need to find the assembly version to tell it about PARs
84
    my ($highest_cs) = @{$core_dba->get_CoordSystemAdaptor->fetch_all()};
×
85
    my $assembly = $highest_cs->version();
×
86
    setup_fasta(-FASTA => $fasta, -ASSEMBLY => $assembly);
×
87
  }
88
  else {
89
    # set seq cache size higher
90
    # this prevents repeated DB lookups for sequence for HGVS e.g. in long introns
91
    # to do this we fetch a sequence adaptor
92
    my $seq_ad = $core_dba->get_SequenceAdaptor;
×
93
    # then reset its cache
94
    # the first param passed to this method is the "chunk power"
95
    # essentially the length in bp of cached sequence will be 2 ^ chunk_power
96
    $seq_ad->_init_seq_instance($seq_ad->chunk_power + 2);
×
97
  }
98

99
  my $slice;
×
100
  my $stable_id;
×
101
  my @transcripts = ();
×
102

103
  if ($by_transcript) {
×
104
    my $transcript_stable_id = $self->param('transcript_stable_id');
×
105
    $stable_id = $transcript_stable_id;
×
106
    my $ta = $core_dba->get_TranscriptAdaptor;
×
107
    my $transcript = $ta->fetch_by_stable_id($transcript_stable_id) 
×
108
      or die "failed to fetch transcript for stable id: $transcript_stable_id";
109

×
110
    $slice = $sa->fetch_by_transcript_stable_id(
111
      $transcript_stable_id, 
112
      $max_distance
113
    ) or die "failed to get slice around transcript: $transcript_stable_id";
114
    # call seq here to help cache
×
115
    $slice->seq();
×
116
    $transcript = $transcript->transfer($slice);
×
117
    push @transcripts, $transcript;
118
  } 
×
119
  
×
120
  else {
×
121
    my $gene_stable_id =  $self->param('gene_stable_id');
×
122
    $stable_id = $gene_stable_id;
123
    my $ga = $core_dba->get_GeneAdaptor;
×
124
    my $gene = $ga->fetch_by_stable_id($gene_stable_id) 
125
      or die "failed to fetch gene for stable id: $gene_stable_id";
126
    $slice = $sa->fetch_by_gene_stable_id(
127
      $gene_stable_id, 
128
      $max_distance
×
129
    ) or die "failed to get slice around gene: $gene_stable_id";
×
130
    # call seq here to help cache
×
131
    $slice->seq();
132
    $gene = $gene->transfer($slice);
133
    push @transcripts, @{ $gene->get_all_Transcripts };
×
134
  }
×
135

×
136
  my $vfa = $var_dba->get_VariationFeatureAdaptor();
×
137
  my @vfs = (
138
    @{ $vfa->fetch_all_by_Slice_SO_terms($slice) },
×
139
    @{ $vfa->fetch_all_somatic_by_Slice_SO_terms($slice) }
140
  );
141
  my $web_index_files_dir = $self->get_files_dir($stable_id, 'web_index');
×
142

×
143
  # get a fh for the hgvs file too
144
  my $hgvs_fh = FileHandle->new();
×
145
  $hgvs_fh->open(">" . $web_index_files_dir . "/$stable_id\_variation_hgvs.txt") or die "Cannot open dump file " . $web_index_files_dir . "/$stable_id\_variation_hgvs.txt: $!";
×
146

147
  my $hgvs_by_var = {};
×
148
  my $var_count = 0;
149

×
150
  my $files = $self->get_dump_files($stable_id, $tva);
151

152
  my %biotypes_to_skip = (
153
    'lncRNA' => 1,
154
    'processed_pseudogene' => 1,
155
    'unprocessed_pseudogene' => 1,
×
156
  );
157

×
158
  for my $transcript (@transcripts) {
×
159

×
160
    my $biotype = $transcript->biotype;
×
161
    my $is_mane = $transcript->is_mane(); # grch38
162
    my $is_canonical = $transcript->is_canonical(); # grch37
×
163
    my $stable_id = $transcript->stable_id;
164
    my @vf_ids;
×
165

×
166
    #Skip transcript if running in gc primary mode and transcript isn't part of that set.
167
    next if ($self->param('gencode_primary') and !$transcript->gencode_primary);
168

×
169

170
    for my $vf(@vfs) {
×
171

172
      if (defined $variations_to_include) {
173
        next unless $variations_to_include->{$vf->variation_name};
174
      }
175

176
      next unless overlap($vf->start, $vf->end, $transcript->start - $max_distance, $transcript->end + $max_distance);
177

178
      my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new(
179
        -transcript     => $transcript,
180
        -variation_feature  => $vf,
×
181
        -adaptor      => $tva,
182
        -disambiguate_single_nucleotide_alleles => $disambiguate_sn_alleles,
×
183
        -no_transfer    => 1,
184
        -no_shift        => $prevent_shifting, 
×
185
      );
186

187
      # if the variation has no effect on the transcript $tv will be undef
188
      if ($tv) {#} && ( scalar(@{ $tv->consequence_type }) > 0) ) {
189

190
              next if (!scalar(@{ $tv->consequence_type }) && ($tv->distance_to_transcript > $max_distance));
191

192
        push @vf_ids, $vf->dbID();
193

194
        # store now or save to store later? Uncomment out the behaviour you want
195
        # save to store later uses more memory but means you don't have to sort human TV after the run
196
        
197
        ## BEHAVIOUR 1: store to DB now
198
        ## comment: slow as MySQL does INSERT DELAYED
199
        # $tva->store($tv, $mtmp);
200
        ## end block
201

×
202
        ## BEHAVIOUR 2: store to memory
×
203
        ## comment: seems like MySQL doesn't work very quickly using table locks required and also still does one INSERT per row
×
204
        # push @write_data, @{$tva->_get_write_data($tv)};
205
        ## end block
×
206
        
207
        ## BEHAVIOUR 3: store to tmp file
×
208
        ## comment: seems to be the fastest as it uses LOAD DATA which inherently locks anyway and does only one MySQL command per gene
×
209
        my $data = $tva->_get_write_data($tv);
210
        my $tv_fh = $files->{transcript_variation}->{fh};
211
        print $tv_fh join("\t", map {defined($_) ? $_ : '\N'} @$_)."\n" for @$data;
×
212

213
        if($mtmp) {
×
214

×
215
          my $mtmp_data = $tva->_get_mtmp_write_data_from_tv_write_data($data);
216
          my $mtmp_fh = $files->{MTMP_transcript_variation}->{fh};
×
217
          
×
218
          # Check species
219
          my $species = $self->param('species');
220
          # MANE is only available for human grch38
221
          my $is_human = $species =~ /homo_sapiens|human/ ? 1 : 0;
222
          my $write_mtmp = ($is_human && $is_mane) || !$is_human ? 1 : 0;
223

224
          # Human grch37 has canonical transcripts
225
          if($is_human && lc $self->param('assembly') =~ /37|grch37/ && $is_canonical) {
×
226
            $write_mtmp = 1;
227
          }
×
228

229
          unless($biotypes_to_skip{$biotype} || !$write_mtmp){
×
230
            print $mtmp_fh join("\t", map {defined($_) ? $_ : '\N'} @$_)."\n" for @$mtmp_data;
×
231
          }
×
232
        }
233

×
234
        ## end block
×
235

×
236
      
237
        ## populate tables for website index building
238
        my $var_id = $vf->get_Variation_dbID();
239

240
        for my $allele (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) {
×
241

×
242
          next unless defined $allele->hgvs_transcript();
×
243
          my $hgvs_transcript = (split/\:/, $allele->hgvs_transcript())[1];
×
244
          $hgvs_by_var->{$var_id}->{$hgvs_transcript} = 1;
245
          
246
          next unless defined $allele->hgvs_protein();
247
          my $hgvs_protein  = (split/\:/, $allele->hgvs_protein())[1];
248
          $hgvs_by_var->{$var_id}->{$hgvs_protein} = 1 if defined $hgvs_protein && $hgvs_protein =~/^p/; ## don't store synonymous
249
        }
×
250

×
251
        # dump out these hashes periodically to stop memory exploding
252
        # will lead to more duplicates in the output file but better that than job failing
×
253
        if(++$var_count > 100000) {
254
          $self->dump_hgvs_var($hgvs_by_var, $hgvs_fh);
×
255
          $var_count = 0;
×
256
          $hgvs_by_var = {};
×
257
        }
×
258
      }                       
259
    }
260

261
    # Delete Updated
262
     if (-e $self->param('update_diff')){
263
        my @chunk_delete;
×
264

265
        push @chunk_delete, [ splice @vf_ids, 0, 500 ]  while @vf_ids;
266

267
        for (@chunk_delete){
268
          my $joined_vf_ids = join(',', @$_);
269
          next if $joined_vf_ids == "";
270
          $dbc->do(qq{
271
                    DELETE FROM  transcript_variation
272
                    WHERE   variation_feature_id IN ($joined_vf_ids)
273
                    AND     feature_stable_id = "$stable_id"
274
          });
275

276
          $dbc->do(qq{
277
                    DELETE FROM  MTMP_transcript_variation
278
                    WHERE   variation_feature_id IN ($joined_vf_ids)
279
                    AND     feature_stable_id = "$stable_id"
280
          }) if($mtmp);
281
        }
282
    }
283
  }
284

285
  ## uncomment this if using BEHAVIOUR 2 above
286
  # if(@write_data) {
287
  #   $var_dba->dbc->do("LOCK TABLES transcript_variation WRITE");
288
  #   $tva->_store_write_data(\@write_data, 1);
×
289
  #   $var_dba->dbc->do("UNLOCK TABLES");
×
290

×
291
  #   if($mtmp) {
×
292
  #     $var_dba->dbc->do("LOCK TABLES MTMP_transcript_variation WRITE");
×
293
  #     $tva->_store_mtmp_write_data($tva->_get_mtmp_write_data_from_tv_write_data(\@write_data), 1) if $mtmp;
×
294
  #     $var_dba->dbc->do("UNLOCK TABLES");
×
295
  #   }
296
  # }
297
  ## end block
298

299
  ## uncomment this if using BEHAVIOUR 3 above
300
  ## LOADING HAS BEEN MOVED TO FinishTranscriptEffect.pm if run in by_transcript analysis mode
×
301
  my $tmpdir = $self->get_files_dir($stable_id, 'transcript_effect');
302
  foreach my $table(keys %$files) {
×
303
    $files->{$table}->{fh}->close();
304
    if (!$by_transcript) {
×
305
      $ImportUtils::TMP_DIR = $tmpdir;
306
      $ImportUtils::TMP_FILE = $files->{$table}->{filename};
×
307
      load($var_dba->dbc, ($table, @{$files->{$table}->{cols}}));
308
    }
309
  }
310
  ## end block
×
311

×
312
  # dump HGVS stubs to file for web index
313
  $self->dump_hgvs_var($hgvs_by_var, $hgvs_fh);
314

315
  $hgvs_fh->close();
×
316

317
  print STDERR "All done\n" if $DEBUG;
×
318

×
319
  return;
×
320
}
321

322
sub write_output {
323
  my $self = shift;
324
  return;
325
}
×
326

×
327
sub dump_hgvs_var {
×
328
  my ($self, $hgvs_by_var, $fh) = @_;
329

×
330
  if($hgvs_by_var) {
331
    for my $var_id(keys %$hgvs_by_var) {
332
      print $fh "$var_id\t$_\n" for keys %{$hgvs_by_var->{$var_id}};
333
    }
×
334
  }
335
}
×
336

×
337
sub get_dump_files {
×
338
  my $self = shift;
×
339
  my $stable_id = shift;
340
  my $tva = shift;
×
341
  # initialise a hash of files
×
342
  my $files = {
×
343
    transcript_variation      => { 'cols' => [$tva->_write_columns],      },
344
    MTMP_transcript_variation => { 'cols' => [$tva->_mtmp_write_columns], },
×
345
  };
346
  my $tmpdir = $self->get_files_dir($stable_id, 'transcript_effect');
347
  # create file handles
348
  for my $table(keys %$files) {
349
    my $hash = $files->{$table};
350
    $hash->{filename} = sprintf('%s_%s.txt', $stable_id, $table);
351
    $hash->{filepath} = sprintf('%s/%s', $tmpdir, $hash->{filename});
352
    
353
    my $fh = FileHandle->new();
354
    $fh->open(">".$hash->{filepath}) or die("ERROR: Could not write to ".$hash->{filepath});
355
    $hash->{fh} = $fh;
356
  }
357
  return $files;
358
}
359

360

361
1;
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc