• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ensembl / ensembl-vep / #610835618

02 Oct 2023 09:04AM UTC coverage: 98.187%. First build
#610835618

Pull #1470

travis-ci

Pull Request #1470: Custom annotations: summary statistics + filter by number of records

139 of 139 new or added lines in 9 files covered. (100.0%)

11320 of 11529 relevant lines covered (98.19%)

64059.97 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.6
/modules/Bio/EnsEMBL/VEP/OutputFactory.pm
1
=head1 LICENSE
2

3
Copyright [2016-2025] EMBL-European Bioinformatics Institute
4

5
Licensed under the Apache License, Version 2.0 (the "License");
6
you may not use this file except in compliance with the License.
7
You may obtain a copy self the License at
8

9
   http://www.apache.org/licenses/LICENSE-2.0
10

11
Unless required by applicable law or agreed to in writing, sselftware
12
distributed under the License is distributed on an "AS IS" BASIS,
13
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
See the License for the specific language governing permissions and
15
limitations under the License.
16

17
=cut
18

19

20
=head1 CONTACT
21

22
 Please email comments or questions to the public Ensembl
23
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
24

25
 Questions may also be sent to the Ensembl help desk at
26
 <http://www.ensembl.org/Help/Contact>.
27

28
=cut
29

30
# EnsEMBL module for Bio::EnsEMBL::VEP::OutputFactory
31
#
32
#
33

34
=head1 NAME
35

36
Bio::EnsEMBL::VEP::OutputFactory - base class for generating VEP output
37

38
=head1 SYNOPSIS
39

40
# actually gets a Bio::EnsEMBL::VEP::OutputFactory::VCF
41
my $of = Bio::EnsEMBL::VEP::OutputFactory->new({
42
  config => $config,
43
  format => 'vcf'
44
});
45

46
# print headers
47
print "$_\n" for @{$of->headers};
48

49
# print output
50
print "$_\n" for @{$of->get_all_lines_by_InputBuffer($ib)};
51

52
=head1 DESCRIPTION
53

54
The OutputFactory class is a base class used to generate VEP output.
55

56
It should not be invoked directly, but contains the bulk of the methods
57
used to transform the Ensembl API objects into "flat" structures suitable
58
for writing to output.
59

60
=head1 METHODS
61

62
=cut
63

64

65
use strict;
100✔
66
use warnings;
100✔
67

68
package Bio::EnsEMBL::VEP::OutputFactory;
69

70
use base qw(Bio::EnsEMBL::VEP::BaseVEP);
100✔
71

72
use File::Basename;
100✔
73
use Scalar::Util qw(looks_like_number);
100✔
74
use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
100✔
75
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
100✔
76
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
100✔
77
use Bio::EnsEMBL::Variation::Utils::Constants;
100✔
78
use Bio::EnsEMBL::Variation::Utils::Sequence qw(ga4gh_vrs_from_spdi);
100✔
79
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
100✔
80
use Bio::EnsEMBL::VEP::Utils qw(format_coords merge_arrays get_flatten);
100✔
81
use Bio::EnsEMBL::VEP::Constants;
100✔
82

83
use Bio::EnsEMBL::VEP::OutputFactory::VEP_output;
100✔
84
use Bio::EnsEMBL::VEP::OutputFactory::VCF;
100✔
85
use Bio::EnsEMBL::VEP::OutputFactory::Tab;
100✔
86

87
our $CAN_USE_JSON;
88

89
BEGIN {
90
  if(eval q{ use Bio::EnsEMBL::VEP::OutputFactory::JSON; 1 }) {
100✔
91
    $CAN_USE_JSON = 1;
100✔
92
  }
93
}
94

95
my %SO_RANKS = map {$_->SO_term => $_->rank} values %Bio::EnsEMBL::Variation::Utils::Constants::OVERLAP_CONSEQUENCES;
96

97
my %FORMAT_MAP = (
98
  'vcf'     => 'VCF',
99
  'ensembl' => 'VEP_output',
100
  'vep'     => 'VEP_output',
101
  'tab'     => 'Tab',
102
  'json'    => 'JSON',
103
);
104

105
my %DISTANCE_CONS = (upstream_gene_variant => 1, downstream_gene_variant => 1);
106

107
my %FREQUENCY_KEYS = (
108
  af           => ['AF'],
109
  af_1kg       => [qw(AF AFR AMR ASN EAS EUR SAS)],
110
  af_gnomad    => [('gnomADe', map {'gnomADe_'.$_} qw(AFR AMR ASJ EAS FIN MID NFE REMAINING SAS))],
111
  af_gnomade    => [('gnomADe', map {'gnomADe_'.$_} qw(AFR AMR ASJ EAS FIN MID NFE REMAINING SAS))],
112
  af_gnomadg => [('gnomADg', map {'gnomADg_'.$_} qw(AFR AMI AMR ASJ EAS FIN MID NFE REMAINING SAS))],
113
);
114

115

116
=head2 new
117

118
  Arg 1      : hashref $args
119
               {
120
                 config => Bio::EnsEMBL::VEP::Config,
121
                 format => string (vcf, ensembl, vep, tab, json)
122
               }
123
  Example    : $of = Bio::EnsEMBL::VEP::OutputFactory({
124
                 config => $config,
125
                 format => 'tab'
126
               });
127
  Description: Create a new Bio::EnsEMBL::VEP::OutputFactory object.
128
               Will return the sub-class determined by the format arg.
129
  Returntype : Bio::EnsEMBL::VEP::OutputFactory
130
  Exceptions : throws on invalid format or if JSON format
131
               requested and JSON module not installed
132
  Caller     : Runner
133
  Status     : Stable
134

135
=cut
136

137
sub new {
138
  my $caller = shift;
1,700✔
139
  my $class = ref($caller) || $caller;
1,700✔
140
  
141
  my $self = $class->SUPER::new(@_);
1,700✔
142

143
  # add shortcuts to these params
144
  $self->add_shortcuts([qw(
1,700✔
145
    no_stats
146
    no_intergenic
147
    process_ref_homs
148
    coding_only
149
    terms
150
    output_format
151
    no_escape
152
    pick_order
153
    allele_number
154
    show_ref_allele
155
    use_transcript_ref
156
    vcf_string
157

158
    most_severe
159
    summary
160
    per_gene
161
    pick
162
    pick_allele
163
    pick_allele_gene
164
    flag_pick
165
    flag_pick_allele
166
    flag_pick_allele_gene
167
    
168
    variant_class
169
    af
170
    af_1kg
171
    af_gnomad
172
    af_gnomade
173
    af_gnomadg
174
    max_af
175
    pubmed
176
    clin_sig_allele
177
    clinvar_somatic_classification
178

179
    numbers
180
    domains
181
    symbol
182
    ccds
183
    xref_refseq
184
    refseq
185
    merged
186
    protein
187
    uniprot
188
    canonical
189
    biotype
190
    mane
191
    mane_select
192
    mane_plus_clinical
193
    gencode_primary
194
    flag_gencode_primary
195
    tsl
196
    appris
197
    transcript_version
198
    gene_version
199
    gene_phenotype
200
    mirna
201
    ambiguity
202
    var_synonyms
203

204
    total_length
205
    hgvsc
206
    hgvsp
207
    hgvsg
208
    hgvsg_use_accession
209
    hgvsp_use_prediction
210
    spdi
211
    sift
212
    ga4gh_vrs
213
    polyphen
214
    polyphen_analysis
215

216
    cell_type
217
    shift_3prime
218
    shift_genomic
1,700✔
219
    remove_hgvsp_version
220
  )]);
1,700✔
221

222
  my $hashref = $_[0];
764✔
223

224
  if(my $format = $hashref->{format}) {
764✔
225

764✔
226
    delete $hashref->{format};
227

228
    $format = lc($format);
764✔
229
    throw("ERROR: Unknown or unsupported output format $format\n") unless $FORMAT_MAP{$format};
60✔
230

231

232
    if($FORMAT_MAP{$format} eq 'JSON') {
764✔
233
      throw("ERROR: Cannot use format $format without JSON module installed\n") unless $CAN_USE_JSON;
764✔
234
    }
235

236
    my $class = 'Bio::EnsEMBL::VEP::OutputFactory::'.$FORMAT_MAP{$format};
936✔
237
    return $class->new({%$hashref, config => $self->config});
238
  }
936✔
239

240
  $self->{header_info} = $hashref->{header_info} if $hashref->{header_info};
936✔
241

242
  $self->{plugins} = $hashref->{plugins} if $hashref->{plugins};;
243

244
  return $self;
245
}
246

247

248
### Top level methods
249
#####################
250

251

252
=head2 get_all_lines_by_InputBuffer
253

254
  Arg 1      : Bio::EnsEMBL::VEP::InputBuffer $ib
255
  Example    : $lines = $of->get_all_lines_by_InputBuffer($ib);
256
  Description: Gets all lines (strings suitable for writing to output) given
257
               an annotated input buffer.
258
  Returntype : arrayref of strings
259
  Exceptions : none
260
  Caller     : Runner
261
  Status     : Stable
262

712✔
263
=cut
712✔
264

265
sub get_all_lines_by_InputBuffer {
266
  my $self = shift;
712✔
267
  my $buffer = shift;
268

269
  # output_hash_to_line will be implemented in the child class as its behaviour varies by output type
270
  return [map {$self->output_hash_to_line($_)} @{$self->get_all_output_hashes_by_InputBuffer($buffer)}];
271
}
272

273

274
=head2 get_all_output_hashes_by_InputBuffer
275

276
  Arg 1      : Bio::EnsEMBL::VEP::InputBuffer $ib
277
  Example    : $hashes = $of->get_all_output_hashes_by_InputBuffer($ib);
278
  Description: Gets all hashrefs of data (suitable for serialised writing e.g. JSON) given
279
               an annotated input buffer.
280
  Returntype : arrayref of hashrefs
281
  Exceptions : none
282
  Caller     : Runner
283
  Status     : Stable
284

684✔
285
=cut
684✔
286

287
sub get_all_output_hashes_by_InputBuffer {
288
  my $self = shift;
289
  my $buffer = shift;
684✔
290

291
  # rejoin variants that have been split
292
  # this can happen when using --minimal
11,284✔
293
  $self->rejoin_variants_in_InputBuffer($buffer) if $buffer->rejoin_required;
684✔
294

295

296
  map {@{$self->reset_shifted_positions($_)}}
11,284✔
297
    @{$buffer->buffer};
684✔
298
  
299
  return [
300
    map {@{$self->get_all_output_hashes_by_VariationFeature($_)}}
301
    @{$buffer->buffer}
302
  ];
16,244✔
303
}
16,244✔
304

16,244✔
305
sub reset_shifted_positions {
16,236✔
306
  my $self = shift;
16,236✔
307
  my $vf = shift;
308
  return [] if ref($vf) eq 'Bio::EnsEMBL::Variation::StructuralVariationFeature';
173,084✔
309
  my @tvs = $vf->get_all_TranscriptVariations();
86,560✔
310
  foreach my $tv (@{$tvs[0]})
311
  {
312
      map { bless $_, 'Bio::EnsEMBL::Variation::TranscriptVariationAllele' } 
313
          @{ $tv->get_all_BaseVariationFeatureOverlapAlleles };
173,084✔
314
        
86,560✔
315
      ## Obtains all relevant $tva objects and removes shifted positions from 
316
      ## CDS CDNA and Protein positions   
317
      map { $_->clear_shifting_variables} 
16,236✔
318
          @{ $tv->get_all_BaseVariationFeatureOverlapAlleles };
319
  }
320

321
  return \@tvs;
322
}
323

324

325

326
=head2 get_all_output_hashes_by_VariationFeature
327

328
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature $vf
329
  Example    : $hashes = $of->get_all_output_hashes_by_InputBuffer($vf);
330
  Description: This wrapper method does the following:
331
                1) fetches all the VariationFeatureOverlapAllele objects
332
                2) filters them if any filtering options are applied
333
                3) converts them into a flat hashref with the relevant output data
334
  Returntype : arrayref of hashrefs
335
  Exceptions : none
336
  Caller     : get_all_output_hashes_by_InputBuffer()
337
  Status     : Stable
338

14,056✔
339
=cut
14,056✔
340

341
sub get_all_output_hashes_by_VariationFeature {
342
  my $self = shift;
14,056✔
343
  my $vf = shift;
344

14,056✔
345
  # get the basic initial hash; this basically contains the location and ID
346
  my $hash = $self->VariationFeature_to_output_hash($vf);
347

348
  return $self->get_all_VariationFeatureOverlapAllele_output_hashes($vf, $hash);
349
}
350

351

352
=head2 get_all_VariationFeatureOverlapAllele_output_hashes
353

354
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature $vf
355
  Arg 2      : hashref of data obtained from VariationFeature
356
  Example    : $hashes = $of->get_all_VariationFeatureOverlapAllele_output_hashes($vf, $hash);
357
  Description: Takes the initial hashref of data obtained from just
358
               the VariationFeature (i.e. location-based data only)
359
               and then creates a hashref expanding on this for each
360
               VariationFeatureOverlapAllele (e.g. TranscriptVariationAllele)
361
  Returntype : arrayref of hashrefs
362
  Exceptions : none
363
  Caller     : get_all_output_hashes_by_VariationFeature()
364
  Status     : Stable
365

16,312✔
366
=cut
16,312✔
367

16,312✔
368
sub get_all_VariationFeatureOverlapAllele_output_hashes {
369
  my $self = shift;
16,312✔
370
  my $vf = shift;
371
  my $hash = shift;
372

16,312✔
373
  my @return;
374

375
  # we want to handle StructuralVariationFeatures differently
376
  my $method =  sprintf(
377
    'get_all_%sOverlapAlleles',
378
    ref($vf) eq 'Bio::EnsEMBL::Variation::StructuralVariationFeature'
379
      ? 'StructuralVariation'
16,312✔
380
      : 'VariationFeature'
381
  );
382

16,312✔
383
  my $vfoas = $self->$method($vf);
384

16,312✔
385
  # summary, most_severe don't need most of the downstream logic from hereon
386
  return $self->summary_only($vf, $hash, $vfoas) if $self->{summary} || $self->{most_severe};
387

88,604✔
388
  foreach my $vfoa(@$vfoas) {
389
    # copy the initial VF-based hash so we're not overwriting
390
    my %copy = %$hash;
88,604✔
391

88,604✔
392
    # we have a method defined for each sub-class of VariationFeatureOverlapAllele
393
    my $method = (split('::', ref($vfoa)))[-1].'_to_output_hash';
394
    my $output = $self->$method($vfoa, \%copy, $vf);
88,604✔
395

396
    # run plugins
397
    $output = $self->run_plugins($vfoa, $output, $vf);
88,604✔
398

399
    # log stats
88,604✔
400
    $self->stats->log_VariationFeatureOverlapAllele($vfoa, $output) unless $self->{no_stats};
401

402
    push @return, $output if $output;
16,312✔
403
  }
404

405
  return \@return;
406
}
407

408

409
=head2 summary_only
410

411
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature $vf
412
  Arg 2      : hashref of data obtained from VariationFeature
413
  Arg 3      : arrayref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele
414
  Example    : $hashes = $of->summary_only($vf, $hash, $vfoas);
415
  Description: Squashes data from a listref of VariationFeatureOverlapAlleles
416
               into one hashref, using either a summary (list all consequence types)
417
               or most severe method.
418
  Returntype : arrayref of one hashref (for compliance with other methods)
419
  Exceptions : none
420
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
421
  Status     : Stable
422

423
=cut
8✔
424

425
sub summary_only {
8✔
426
  my ($self, $vf, $hash, $vfoas) = @_;
427

8✔
428
  my $term_method = $self->{terms}.'_term';
429

8✔
430
  my @ocs = sort {$a->rank <=> $b->rank} map {@{$_->get_all_OverlapConsequences}} @$vfoas;
431

432
  if(@ocs) {
8✔
433

4✔
434
    # summary is just all unique consequence terms
4✔
435
    if($self->{summary}) {
12✔
436
      my (@cons, %seen);
12✔
437
      foreach my $con(map {$_->$term_method || $_->SO_term} @ocs) {
438
        push @cons, $con unless $seen{$con};
4✔
439
        $seen{$con} = 1;
440
      }
441
      $hash->{Consequence} = \@cons;
442
    }
443

4✔
444
    # most severe is the consequence term with the lowest rank
445
    else {
446
      $hash->{Consequence} = [$ocs[0]->$term_method || $ocs[0]->SO_term];
447
    }
448

449
    # unless(defined($config->{no_stats})) {
450
    #   $config->{stats}->{consequences}->{$_}++ for split(',', $hash->{Consequence});
451
    # }
×
452
  }
453
  else {
454
    $self->warning_msg("Unable to assign consequence type");
8✔
455
  }
456

457
  return [$hash];
458
}
459

460

461
=head2 get_all_VariationFeatureOverlapAlleles
462

463
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature $vf
464
  Example    : $vfoas = $of->get_all_VariationFeatureOverlapAlleles($vf);
465
  Description: Gets all VariationFeatureOverlapAllele objects for given
466
               VariationFeature, applying any filtering as configured.
467
               This might be restricting to coding only, or filtering using
468
               one of the pick* functions.
469
  Returntype : arrayref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele
470
  Exceptions : none
471
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
472
  Status     : Stable
473

474
=cut
16,472✔
475

16,472✔
476
sub get_all_VariationFeatureOverlapAlleles {
477
  my $self = shift;
478
  my $vf = shift;
16,472✔
479

480
  # no intergenic?
481
  return [] if $self->{no_intergenic} && defined($vf->{intergenic_variation});
482

16,468✔
483
  # get all VFOAs
484
  # need to be sensitive to whether --coding_only is switched on
485
  my $vfos = $vf->get_all_VariationFeatureOverlaps;
486

16,468✔
487
  # if coding only filter TranscriptVariations down to coding ones
4✔
488
  # leave others intact, otherwise doesn't make sense to do --regulatory and --coding_only
489
  if($self->{coding_only}) {
4✔
490
    my @new;
12✔
491

12✔
492
    foreach my $vfo(@$vfos) {
493
      if($vfo->isa('Bio::EnsEMBL::Variation::BaseVariationFeatureOverlap')) {
494
        push @new, $vfo if $vfo->can('affects_cds') && $vfo->affects_cds;
×
495
      }
496
      else {
497
        push @new, $vfo;
498
      }
4✔
499
    }
500

501
    $vfos = \@new;
502
  }
16,468✔
503

4✔
504
  # the option individual_zyg has to run a specific method if there is only one individual (unique_ind)
505
  if($vf->{unique_ind}) {
506
    return $self->filter_VariationFeatureOverlapAlleles([map {$_->get_reference_VariationFeatureOverlapAllele} @{$vfos}]);
507
  }
16,464✔
508
  else {
16,464✔
509
    # method name stub for getting *VariationAlleles
510
    my $allele_method = $self->{process_ref_homs} ? 'get_all_' : 'get_all_alternate_';  
16,464✔
511
    my $method = $allele_method.'VariationFeatureOverlapAlleles';
512

513
    return $self->filter_VariationFeatureOverlapAlleles([map {@{$_->$method}} @{$vfos}]);
514
  }
515
}
516

517

518
=head2 get_all_StructuralVariationOverlapAlleles
519

520
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature $vf
521
  Example    : $svos = $of->get_all_StructuralVariationOverlapAlleles($vf);
522
  Description: Gets all StructuralVariationOverlap objects for given
523
               StructuralVariation, applying any filtering as configured.
524
               This might be restricting to coding only, or filtering using
525
               one of the pick* functions.
526
  Returntype : arrayref of Bio::EnsEMBL::Variation::StructuralVariationOverlap
527
  Exceptions : none
528
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
529
  Status     : Stable
530

531
=cut
60✔
532

60✔
533
sub get_all_StructuralVariationOverlapAlleles {
534
  my $self = shift;
535
  my $svf = shift;
60✔
536

60✔
537
  # no intergenic?
538
  my $isv = $svf->get_IntergenicStructuralVariation(1);
539
  return [] if $self->{no_intergenic} && $isv;
540

56✔
541
  # get all VFOAs
542
  # need to be sensitive to whether --coding_only is switched on
543
  my $vfos;
56✔
544

16✔
545
  # if coding only just get transcript & intergenic ones
4✔
546
  if($self->{coding_only}) {
547
    @$vfos = grep {defined($_)} (
548
      @{$svf->get_all_TranscriptStructuralVariations},
549
      $isv
550
    );
52✔
551
  }
552
  else {
553
    $vfos = $svf->get_all_StructuralVariationOverlaps;
554
  }
56✔
555

556
  # grep out non-coding?
56✔
557
  @$vfos = grep {$_->can('affects_cds') && $_->affects_cds} @$vfos if $self->{coding_only};
558

559
  return $self->filter_StructuralVariationOverlapAlleles([map {@{$_->get_all_StructuralVariationOverlapAlleles}} @{$vfos}]);
560
}
561

562

563
=head2 filter_VariationFeatureOverlapAlleles
564

565
  Arg 1      : arrayref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele
566
  Example    : $filtered = $of->filter_VariationFeatureOverlapAlleles($vfoas);
567
  Description: Filters VariationFeatureOverlapAllele objects by various criteria,
568
               choosing one per variant, allele, gene. It can also flag instead
569
               of filtering.
570
  Returntype : arrayref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele
571
  Exceptions : none
572
  Caller     : get_all_VariationFeatureOverlapAlleles()
573
  Status     : Stable
574

575
=cut
16,516✔
576

16,516✔
577
sub filter_VariationFeatureOverlapAlleles {
578
  my $self = shift;
16,516✔
579
  my $vfoas = shift;
580

581
  return [] unless $vfoas && scalar @$vfoas;
16,496✔
582

168✔
583
  # pick worst?
584
  if($self->{pick}) {
585
    return [$self->pick_worst_VariationFeatureOverlapAllele($vfoas)];
586
  }
587

20✔
588
  # pick worst per allele?
20✔
589
  elsif($self->{pick_allele}) {
20✔
590
    my %by_allele;
591
    push @{$by_allele{$_->variation_feature_seq}}, $_ for @$vfoas;
592
    return [map {$self->pick_worst_VariationFeatureOverlapAllele($by_allele{$_})} keys %by_allele];
593
  }
594

4✔
595
  # pick per gene?
596
  elsif($self->{per_gene}) {
597
    return $self->pick_VariationFeatureOverlapAllele_per_gene($vfoas);
598
  }
599

4✔
600
  # pick worst per allele and gene?
4✔
601
  elsif($self->{pick_allele_gene}) {
4✔
602
    my %by_allele;
603
    push @{$by_allele{$_->variation_feature_seq}}, $_ for @$vfoas;
604
    return [map {@{$self->pick_VariationFeatureOverlapAllele_per_gene($by_allele{$_})}} keys %by_allele];
605
  }
606

12✔
607
  # flag picked?
12✔
608
  elsif($self->{flag_pick}) {
609
    if(my $worst = $self->pick_worst_VariationFeatureOverlapAllele($vfoas)) {
610
      $worst->{PICK} = 1;
611
    }
612
  }
613

8✔
614
  # flag worst per allele?
8✔
615
  elsif($self->{flag_pick_allele}) {
8✔
616
    my %by_allele;
617
    push @{$by_allele{$_->variation_feature_seq}}, $_ for @$vfoas;
618
    $self->pick_worst_VariationFeatureOverlapAllele($by_allele{$_})->{PICK} = 1 for keys %by_allele;
619
  }
620

8✔
621
  # flag worst per allele and gene?
8✔
622
  elsif($self->{flag_pick_allele_gene}) {
8✔
623
    my %by_allele;
624
    push @{$by_allele{$_->variation_feature_seq}}, $_ for @$vfoas;
625
    map {$_->{PICK} = 1} map {@{$self->pick_VariationFeatureOverlapAllele_per_gene($by_allele{$_})}} keys %by_allele;
16,300✔
626
  }
627

628
  return $vfoas;
629
}
630

631

632
=head2 filter_StructuralVariationOverlapAlleles
633

634
  Arg 1      : arrayref of Bio::EnsEMBL::Variation::StructuralVariationOverlapAlleles
635
  Example    : $filtered = $of->filter_StructuralVariationOverlapAlleles($svoas);
636
  Description: Filters StructuralVariationOverlapAlleles objects by various criteria,
637
               choosing one per variant, allele, gene. It can also flag instead
638
               of filtering.
639
  Returntype : arrayref of Bio::EnsEMBL::Variation::StructuralVariationOverlapAlleles
640
  Exceptions : none
641
  Caller     : get_all_StructuralVariationOverlapAlleles()
642
  Status     : Stable
643

644
=cut
92✔
645

92✔
646
sub filter_StructuralVariationOverlapAlleles {
647
  my $self = shift;
92✔
648
  my $svoas = shift;
649

650
  return [] unless $svoas && scalar @$svoas;
92✔
651

8✔
652
  # pick worst? pick worst per allele?
653
  if($self->{pick} || $self->{pick_allele}) {
654
    return [$self->pick_worst_VariationFeatureOverlapAllele($svoas)];
655
  }
656

×
657
  # pick per gene? pick worst per allele and gene?
658
  elsif($self->{per_gene} || $self->{pick_allele_gene}) {
659
    return $self->pick_VariationFeatureOverlapAllele_per_gene($svoas);
660
  }
661

20✔
662
  # flag picked? flag worst per allele?
20✔
663
  elsif($self->{flag_pick} || $self->{flag_pick_allele}) {
664
    if(my $worst = $self->pick_worst_VariationFeatureOverlapAllele($svoas)) {
665
      $worst->{PICK} = 1;
666
    }
667
  }
668

8✔
669
  # flag worst per allele and gene?
670
  elsif($self->{flag_pick_allele_gene}) {
671
    map {$_->{PICK} = 1} @{$self->pick_VariationFeatureOverlapAllele_per_gene($svoas)};
84✔
672
  }
673

674
  return $svoas;
675

676
}
677

678

679
=head2 pick_worst_VariationFeatureOverlapAllele
680

681
  Arg 1      : arrayref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele
682
  Example    : $picked = $of->pick_worst_VariationFeatureOverlapAllele($vfoas);
683
  Description: Selects one VariationFeatureOverlapAllele from a list using criteria
684
               defined in the param pick_order. Criteria are in this default order:
685
                1: MANE_Select 
686
                2: MANE_Plus_Clinical
687
                3: canonical
688
                4: transcript support level
689
                5: biotype (protein coding favoured)
690
                6: consequence rank
691
                7: transcript length
692
                8: transcript from Ensembl?
693
                9: transcript from RefSeq?
694
  Returntype : Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele
695
  Exceptions : none
696
  Caller     : filter_VariationFeatureOverlapAlleles(),
697
               filter_StructuralVariationOverlapAlleles()
698
  Status     : Stable
699

700
=cut
368✔
701

368✔
702
sub pick_worst_VariationFeatureOverlapAllele {
703
  my $self = shift;
368✔
704
  my $vfoas = shift || [];
705

368✔
706
  my @vfoa_info;
707

288✔
708
  return $vfoas->[0] if scalar @$vfoas == 1;
709

710
  foreach my $vfoa(@$vfoas) {
844✔
711

712
    # create a hash self info for this VFOA that will be used to rank it
713
    my $info = {
714
      vfoa => $vfoa,
715
      rank => undef,
716

717
      # these will only be used by transcript types, default to 1 for others
718
      # to avoid writing an else clause below
719
      mane_select => 1,
720
      mane_plus_clinical => 1,
721
      canonical => 1,
722
      ccds => 1,
723
      length => 0,
724
      biotype => 1,
725
      tsl => 100,
726
      appris => 100,
727
      ensembl => 1,
728
      refseq => 1,
844✔
729
    };
816✔
730

731
    if($vfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele')) {
732
      my $tr = $vfoa->feature;
816✔
733

816✔
734
      # 0 is "best"
816✔
735
      $info->{mane_select} = scalar(grep {$_->code eq 'MANE_Select'}  @{$tr->get_all_Attributes()}) ? 0 : 1;
816✔
736
      $info->{mane_plus_clinical} = scalar(grep {$_->code eq 'MANE_Plus_Clinical'}  @{$tr->get_all_Attributes()}) ? 0 : 1;
816✔
737
      $info->{canonical} = $tr->is_canonical ? 0 : 1;
816✔
738
      $info->{biotype} = $tr->biotype eq 'protein_coding' ? 0 : 1;
739
      $info->{ccds} = $tr->{_ccds} && $tr->{_ccds} ne '-' ? 0 : 1;
740
      $info->{lc($tr->{_source_cache})} = 0 if exists($tr->{_source_cache});
408✔
741

742
      # "invert" length so longer is best
408✔
743
      $info->{length} = 0 - (
744
        $tr->translation ?
745
        length($tr->{_variation_effect_feature_cache}->{translateable_seq} || $tr->translateable_seq) :
746
        $tr->length()
747
      );
816✔
748

816✔
749
      # lower TSL is best
632✔
750
      if(my ($tsl) = @{$tr->get_all_Attributes('TSL')}) {
751
        if($tsl->value =~ m/tsl(\d+)/) {
752
          $info->{tsl} = $1 if $1;
753
        }
754
      }
816✔
755

308✔
756
      # lower APPRIS is best
308✔
757
      if(my ($appris) = @{$tr->get_all_Attributes('appris')}) {
758
        if($appris->value =~ m/([A-Za-z]).+(\d+)/) {
759
          my ($type, $grade) = ($1, $2);
760

308✔
761
          # values are principal1, principal2, ..., alternative1, alternative2
762
          # so add 10 to grade if alternative
308✔
763
          $grade += 10 if substr($type, 0, 1) eq 'a';
764

765
          $info->{appris} = $grade if $grade;
766
        }
767
      }
844✔
768
    }
769

288✔
770
    push @vfoa_info, $info;
284✔
771
  }
284✔
772
  if(scalar @vfoa_info) {
773
    my @order = @{$self->{pick_order}};
774
    my $picked;
284✔
775

776
    # go through each category in order
777
    foreach my $cat(@order) {
1,152✔
778

8✔
779
      # get ranks here as it saves time
20✔
780
      if($cat eq 'rank') {
20✔
781
        foreach my $info(@vfoa_info) {
782
          my @ocs = sort {$a->rank <=> $b->rank} @{$info->{vfoa}->get_all_OverlapConsequences};
783
          $info->{rank} = scalar @ocs ? $SO_RANKS{$ocs[0]->SO_term} : 1000;
784
        }
785
      }
1,152✔
786

787
      # sort on that category
788
      @vfoa_info = sort {$a->{$cat} <=> $b->{$cat}} @vfoa_info;
1,152✔
789

1,152✔
790
      # take the first (will have the lowest value self $cat)
791
      $picked = shift @vfoa_info;
792
      my @tmp = ($picked);
1,152✔
793

794
      # now add to @tmp those vfoas that have the same value self $cat as $picked
795
      push @tmp, shift @vfoa_info while @vfoa_info && $vfoa_info[0]->{$cat} eq $picked->{$cat};
1,152✔
796

797
      # if there was only one, return
798
      return $picked->{vfoa} if scalar @tmp == 1;
799

872✔
800
      # otherwise shrink the array to just those that had the lowest
801
      # this gives fewer to sort on the next round
802
      @vfoa_info = @tmp;
803

804
    }
4✔
805

806
    # probably shouldn't get here, but if we do, return the first
807
    return $vfoa_info[0]->{vfoa};
4✔
808
  }
809

810
  return undef;
811
}
812

813

814
=head2 pick_VariationFeatureOverlapAllele_per_gene
815

816
  Arg 1      : arrayref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele
817
  Example    : $picked = $of->pick_VariationFeatureOverlapAllele_per_gene($vfoas);
818
  Description: Selects one VariationFeatureOverlapAllele per gene affected by the
819
               given list, using pick_worst_VariationFeatureOverlapAllele to select
820
               within in each gene.
821
  Returntype : arrayref of Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele
822
  Exceptions : none
823
  Caller     : filter_VariationFeatureOverlapAlleles(),
824
               filter_StructuralVariationOverlapAlleles()
825
  Status     : Stable
826

827
=cut
48✔
828

48✔
829
sub pick_VariationFeatureOverlapAllele_per_gene {
830
  my $self = shift;
48✔
831
  my $vfoas = shift;
24✔
832

833
  my @return;
834
  my @tvas;
48✔
835

156✔
836
  # pick out TVAs
144✔
837
  foreach my $vfoa(@$vfoas) {
838
    if($vfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele')) {
839
      push @tvas, $vfoa;
12✔
840
    }
841
    else {
842
      push @return, $vfoa;
843
    }
844
  }
48✔
845

846
  # sort the TVA objects into a hash by gene
48✔
847
  my %by_gene;
144✔
848

144✔
849
  foreach my $tva(@tvas) {
850
    my $gene = $tva->transcript->{_gene_stable_id} || $self->{ga}->fetch_by_transcript_stable_id($tva->transcript->stable_id)->stable_id;
851
    push @{$by_gene{$gene}}, $tva;
48✔
852
  }
84✔
853

854
  foreach my $gene(keys %by_gene) {
855
    push @return, grep {defined($_)} $self->pick_worst_VariationFeatureOverlapAllele($by_gene{$gene});
48✔
856
  }
857

858
  return \@return;
859
}
860

861

862
### Transforming methods
863
### Typically these methods will transform an API object into hash-able data
864
### Most will take the object to be transformed and the hash into which the
865
### data are inserted as the arguments, and return the hashref
866
############################################################################
867

868

869
=head2 VariationFeature_to_output_hash
870

871
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature
872
  Example    : $hashref = $of->VariationFeature_to_output_hash($vf);
873
  Description: "Flattens" a VariationFeature to a simple hash, adding information
874
               pertaining just to the variant's genomic location.
875
  Returntype : hashref
876
  Exceptions : none
877
  Caller     : get_all_output_hashes_by_VariationFeature()
878
  Status     : Stable
879

880
=cut
16,348✔
881

16,348✔
882
sub VariationFeature_to_output_hash {
883
  my $self = shift;
8,174✔
884
  my $vf = shift;
885

8,174✔
886
  my $var_name = sprintf(
887
    '%s_%i_%s',
888
    $vf->{original_chr} || $vf->{chr},
16,348✔
889
    $vf->{start},
890
    $vf->{allele_string} || $vf->{class_SO_term}
16,348✔
891
  );
892

16,348✔
893
  my $hash = {
16✔
894
    Uploaded_variation => ($vf->variation_name ne "." && $vf->variation_name ne $var_name) ? $vf->variation_name :  ($vf->{original_chr} || $vf->{chr}).'_'.$vf->{start}.'_'.($vf->{original_allele_string} || $vf->{allele_string} || $vf->{class_SO_term}),
×
895
    Location            => ($vf->{chr} || $vf->seq_region_name).':'.format_coords($vf->{start}, $vf->{end}),
×
896
  };
897

×
898
  my $converted_to_vcf = $vf->to_VCF_record;
×
899

900
  my $alt_allele_vcf = ${$converted_to_vcf}[4];
×
901

902
  if($self->{vcf_string}){
903
    if($alt_allele_vcf =~ /,/){
16✔
904
      my @list_vcfs;
905
      my @alt_splited_list = split(q(,), $alt_allele_vcf);
906

907
      foreach my $alt_splited (@alt_splited_list){
908
        push(@list_vcfs, $vf->{chr}.'-'.${$converted_to_vcf}[1].'-'.${$converted_to_vcf}[3].'-'.$alt_splited);
909
      }
16,348✔
910
      $hash->{vcf_string} = \@list_vcfs;
8✔
911
    }
8✔
912
    else{
8✔
913
      $hash->{vcf_string} = $vf->{chr}.'-'.${$converted_to_vcf}[1].'-'.${$converted_to_vcf}[3].'-'.${$converted_to_vcf}[4];
914
    }
915
  }
916

16,348✔
917
  # get variation synonyms for Variant Recoder
4✔
918
  # if the tool is Variant Recoder fetches the variation synonyms from the database
919
  if($self->{_config}->{_params}->{is_vr} && $self->{var_synonyms}){
920
    my $variation = $vf->variation();
921
    my $var_synonyms = $variation->get_all_synonyms('', 1);
16,348✔
922
    $hash->{var_synonyms} = $var_synonyms;
923
  }
924

16,348✔
925
  # overlapping SVs
24✔
926
  if($vf->{overlapping_svs}) {
927
    $hash->{SV} = [sort keys %{$vf->{overlapping_svs}}];
928
  }
24✔
929

24✔
930
  # variant class
24✔
931
  $hash->{VARIANT_CLASS} = $vf->class_SO_term() if $self->{variant_class};
932

933
  # individual?
934
  if(defined($vf->{individual})) {
935
    $hash->{IND} = $vf->{individual};
16,348✔
936

16✔
937
    # zygosity
16✔
938
    if(defined($vf->{genotype})) {
28✔
939
    my %unique = map {$_ => 1} @{$vf->{genotype}};
28✔
940
    $hash->{ZYG} = (scalar keys %unique > 1 ? 'HET' : 'HOM').(defined($vf->{hom_ref}) ? 'REF' : '');
941
    }
16✔
942
  }
943
  
944
  # individual_zyg
945
  if(defined($vf->{genotype_ind})) {
16,348✔
946
    my @tmp;
947
    foreach my $geno_ind (keys %{$vf->{genotype_ind}}) {
948
      my %unique = map {$_ => 1} @{$vf->{genotype_ind}->{$geno_ind}};
16,348✔
949
      push @tmp, $geno_ind.":".(scalar keys %unique > 1 ? 'HET' : 'HOM').(defined($vf->{hom_ref_samples}->{$geno_ind}) ? 'REF' : '');
16,332✔
950
    }
951
    $hash->{ZYG} = \@tmp;
16,332✔
952
  }
4✔
953

954
  # minimised?
955
  my $minimal_flag = $self->{_config}->{_params}->{minimal} if defined($self->{_config}->{_params}->{minimal}) && $self->{_config}->{_params}->{minimal}==1;
956
  $hash->{MINIMISED} = 1 if $vf->{minimised} && defined($minimal_flag);
957
  
16,348✔
958
  
20✔
959
  if(ref($vf) eq 'Bio::EnsEMBL::Variation::VariationFeature') {
960
    my $ambiguity_code = $vf->ambig_code();
961
    
962
    if($self->{ambiguity} && defined($ambiguity_code)) {
34✔
963
      $hash->{AMBIGUITY} = $ambiguity_code;
28✔
964
    }
965
  }
14✔
966

967
  # custom annotations
968
  foreach my $custom_name(keys %{$vf->{_custom_annotations} || {}}) {
969
    $self->_add_custom_annotations_to_hash(
970
      $hash,
16,348✔
971
      $custom_name,
972
      [
973
        grep {!exists($_->{allele})}
16,348✔
974
        @{$vf->{_custom_annotations}->{$custom_name}}
975
      ],
16,348✔
976
      $vf->{_custom_annotations_stats}->{$custom_name}
977
    );
16,348✔
978
  }
979

980
  # nearest
981
  $hash->{NEAREST} = $vf->{nearest} if $vf->{nearest};
982

983
  # check_ref tests
984
  $hash->{CHECK_REF} = 'failed' if defined($vf->{check_ref_failed});
985

986
  $self->stats->log_VariationFeature($vf, $hash) unless $self->{no_stats};
987

988
  return $hash;
989
}
990

991

992
=head2 add_colocated_variant_info
993

994
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature $vf
995
  Arg 2      : hashref $vf_hash
75,448✔
996
  Example    : $hashref = $of->add_colocated_variant_info($vf, $vf_hash);
75,448✔
997
  Description: Adds co-located variant information to hash
75,448✔
998
  Returntype : hashref
999
  Exceptions : none
75,448✔
1000
  Caller     : VariationFeature_to_output_hash()
1001
  Status     : Stable
13,052✔
1002

13,052✔
1003
=cut
1004

13,052✔
1005
sub add_colocated_variant_info {
13,052✔
1006
  my $self = shift;
13,052✔
1007
  my $vf = shift;
1008
  my $hash = shift;
13,052✔
1009

1010
  return unless $vf->{existing} && scalar @{$vf->{existing}};
13,052✔
1011

1012
  my $this_allele = $hash->{Allele};
1013

1014
  my $shifted_allele = $vf->{shifted_allele_string};
1015
  $shifted_allele ||= "";
1016
  my $tmp = {};
1017

1018
  my $clin_sig_allele_exists = 0;
1019
  # use these to sort variants
1020
  my %prefix_ranks = (
13,052✔
1021
    'rs' => 1, # dbSNP
1022

13,052✔
1023
    'cm' => 2, # HGMD
1024
    'ci' => 2,
6,526✔
1025
    'cd' => 2,
1026

1027
    'co' => 3, # COSMIC
1028
  );
1,004✔
1029

1030
  my %clin_sigs;
6,526✔
1031

1032
  foreach my $ex(
1033
    sort {
1034
      ($a->{somatic} || 0) <=> ($b->{somatic} || 0) ||
14,884✔
1035

13,052✔
1036
      ($prefix_ranks{lc(substr($a->{variation_name}, 0, 2))} || 100)
1037
      <=>
1038
      ($prefix_ranks{lc(substr($b->{variation_name}, 0, 2))} || 100)
1039
    }
14,884✔
1040
    @{$vf->{existing}}
1041
  ) {
1042

1043
    # check allele match
14,884✔
1044
    if(my $matched = $ex->{matched_alleles}) {
1045
      next unless (grep {$_->{a_allele} eq $this_allele} @$matched) || (grep {$_->{a_allele} eq $shifted_allele} @$matched) ;
1046
    }
14,884✔
1047

1048
    # ID
1049
    push @{$hash->{Existing_variation}}, $ex->{variation_name} if $ex->{variation_name};
×
1050

×
1051
    # Variation Synonyms
×
1052
    # VEP fetches the variation synonyms from the cache
1053
    push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr};
1054

×
1055
    # ClinVar somatic classification
×
1056
    if(defined($ex->{clinical_impact}) && $self->{clinvar_somatic_classification}) {
×
1057
      $hash->{CLINVAR_SOMATIC_CLASSIFICATION} = $ex->{clinical_impact};
×
1058
    }
1059

×
1060
    if(defined($ex->{clin_sig_allele}) && $self->{clin_sig_allele} )
×
1061
    {
1062

1063
      my %cs_hash;
×
1064
      my @clin_sig_array = split(';', $ex->{clin_sig_allele});
×
1065
      foreach my $cs(@clin_sig_array){
×
1066
        my @cs_split = split(':', $cs);
1067

1068
        $cs_hash{$cs_split[0]} = '' if !defined($cs_hash{$cs_split[0]});
×
1069
        $cs_hash{$cs_split[0]} .= ',' if $cs_hash{$cs_split[0]} ne ''; 
×
1070
        $cs_hash{$cs_split[0]} .= $cs_split[1];
×
1071
      }
1072

1073
      my $hash_ref = \%cs_hash;
1074
      $clin_sigs{$hash_ref->{$this_allele}} = 1 if defined($hash_ref->{$this_allele});
14,884✔
1075
      $clin_sig_allele_exists = 1;
14,884✔
1076
    }
1077

1078
    # clin sig and pubmed?
14,884✔
1079
    push @{$tmp->{CLIN_SIG}}, split(',', $ex->{clin_sig}) if $ex->{clin_sig} && !$clin_sig_allele_exists;
1080
    push @{$tmp->{PUBMED}}, split(',', $ex->{pubmed}) if $self->{pubmed} && $ex->{pubmed};
1081

14,884✔
1082
    # somatic?
1083
    push @{$tmp->{SOMATIC}}, $ex->{somatic} ? 1 : 0;
1084

1085
    # phenotype or disease
13,052✔
1086
    push @{$tmp->{PHENO}}, $ex->{phenotype_or_disease} ? 1 : 0;   
27,312✔
1087
  }
1088

1089
  # post-process to remove all-0, e.g. SOMATIC
1090
  foreach my $key(keys %$tmp) {
13,052✔
1091
    delete $tmp->{$key} unless grep {$_} @{$tmp->{$key}};
1092
  }
13,052✔
1093
  
13,052✔
1094
  # post-process to merge var synonyms into one entry so we can control the delimiter
1095
  $hash->{VAR_SYNONYMS} = join '--', @{$hash->{VAR_SYNONYMS}} if defined($hash->{VAR_SYNONYMS});
1096

13,052✔
1097
  my @keys = keys(%clin_sigs);
1098
  $tmp->{CLIN_SIG} = join(';', @keys) if scalar(@keys) && $self->{clin_sig_allele};
13,052✔
1099
 
124✔
1100
  # copy to hash
1101
  $hash->{$_} = $tmp->{$_} for keys %$tmp;
124✔
1102
  # frequencies used to filter will appear here
124✔
1103
  if($vf->{_freq_check_freqs}) {
62✔
1104
    my @freqs;
1105

1106
    foreach my $p(keys %{$vf->{_freq_check_freqs}}) {
1107
      foreach my $a(keys %{$vf->{_freq_check_freqs}->{$p}}) {
62✔
1108
        push @freqs, sprintf(
1109
          '%s:%s:%s',
1110
          $p,
1111
          $a,
1112
          $vf->{_freq_check_freqs}->{$p}->{$a}
124✔
1113
        )
1114
      }
1115
    }
13,052✔
1116

1117
    $hash->{FREQS} = \@freqs;
1118
  }
1119

1120
  return $hash;
1121
}
1122

1123

1124

1125
=head2 add_colocated_frequency_data
1126

1127
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature $vf
1128
  Arg 2      : hashref $vf_hash
1129
  Arg 3      : hashref $existing_variant_hash
1130
  Example    : $hashref = $of->add_colocated_frequency_data($vf, $vf_hash, $ex);
1131
  Description: Adds co-located variant frequency data information to hash
1132
  Returntype : hashref
1133
  Exceptions : none
1134
  Caller     : VariationFeatureOverlapAllele_to_output_hash()
1135
  Status     : Stable
16,160✔
1136

16,160✔
1137
=cut
1138

16,160✔
1139
sub add_colocated_frequency_data {
1140
  my $self = shift;
16,040✔
1141
  my ($vf, $hash, $ex, $shift_hash) = @_;
1142

16,040✔
1143
  return $hash unless grep {$self->{$_}} keys %FREQUENCY_KEYS or $self->{max_af};
16,040✔
1144

1145
  my @ex_alleles = split('/', $ex->{allele_string});
16,040✔
1146

16,040✔
1147
  my @keys = keys %FREQUENCY_KEYS;
16,040✔
1148
  @keys = grep {$self->{$_}} @keys unless $self->{max_af};
1149
  
16,040✔
1150
  my $this_allele = $hash->{Allele} ||= '-'; #if exists($hash->{Allele});
1151
  my $this_allele_unshifted = $shift_hash->{alt_orig_allele_string} if defined($shift_hash);
16,040✔
1152
  $this_allele_unshifted ||= "";
1153
  
16,040✔
1154
  my ($matched_allele) = grep {$_->{a_allele} eq $this_allele || $_->{a_allele} eq $this_allele_unshifted} @{$ex->{matched_alleles} || []};
16,040✔
1155

1156
  return $hash unless $matched_allele || (grep {$_ eq 'af'} @keys);
16,040✔
1157

80,012✔
1158
  my $max_af = 0;
1159
  my @max_af_pops;
305,100✔
1160
  
305,100✔
1161
  foreach my $group(sort @keys) {
1162
    foreach my $key(grep {$ex->{$_}} @{$FREQUENCY_KEYS{$group}}) {
1163

305,100✔
1164
      my %freq_data;
1165
      my $total = 0;
1166

305,100✔
1167
      # use this to log which alleles we've explicitly seen freqs for
314,640✔
1168
      my %remaining = map {$_ => 1} @ex_alleles;
314,640✔
1169

314,640✔
1170
      # get the frequencies for each allele into a hashref
314,640✔
1171
      foreach my $pair(split(',', $ex->{$key})) {
314,640✔
1172
        my ($a, $f) = split(':', $pair);
1173
        $f = sprintf("%.4f", $f) if $key eq 'AF'; # this format is just to keep old compability with dbSNP import
1174
        $freq_data{$a} = $f;
1175
        $total += $f;
1176
        delete $remaining{$a} if $remaining{$a};
1177
      }
305,100✔
1178

305,100✔
1179
      # interpolate the frequency for the remaining allele if there's only 1
136✔
1180
      # we can only do this reliably for the AF key as only the minor AF is stored
136✔
1181
      # for others we expect all ALTs to have a store frequency, those without cannot be reliably interpolated
1182
      my $interpolated = 0;
1183
      if(scalar @ex_alleles == 2 && scalar keys %remaining == 1 && $key eq 'AF') {
305,100✔
1184
        $freq_data{(keys %remaining)[0]} = 1 - $total;
1185
        $interpolated = 1;
1186
      }
1187

1188
      if(
151,170✔
1189
        ($matched_allele && exists($freq_data{$matched_allele->{b_allele}})) ||
1190
        ($interpolated && $freq_data{$this_allele})
1191
      ) {
151,170✔
1192

1193
        my $f =
1194
          $matched_allele && exists($freq_data{$matched_allele->{b_allele}}) ?
302,340✔
1195
          $freq_data{$matched_allele->{b_allele}} :
195,112✔
1196
          $freq_data{$this_allele};
195,112✔
1197

1198
        # record the frequency if requested
1199
        if($self->{$group}) {
1200
          my $out_key = $key eq 'AF' ? 'AF' : $key.'_AF';
1201
          merge_arrays($hash->{$out_key} ||= [], [$f]);
302,340✔
1202
        }
276,260✔
1203

16,704✔
1204
        # update max_af data if required
16,704✔
1205
        # make sure we don't include any combined-level pops
1206
        if($self->{max_af} && $key ne 'AF' && $key ne 'ExAC' && $key ne 'ExAC_Adj' && $key ne 'gnomADe' && $key ne 'gnomADg') {
1207
          if($f > $max_af) {
19,572✔
1208
            $max_af = $f;
1209
            @max_af_pops = ($key);
1210
          }
1211
          elsif($f == $max_af) {
1212
            push @max_af_pops, $key unless grep{$_ eq $key} @max_af_pops;
1213
          }
1214
        }
1215
      }
16,040✔
1216
    }
13,972✔
1217
  }
1218

13,972✔
1219
  # add/update max_af info
13,972✔
1220
  if($self->{max_af} && @max_af_pops) {
13,972✔
1221
    my $current_max = $hash->{MAX_AF} ||= 0;
1222

1223
    if($max_af > $current_max) {
13,972✔
1224
      $hash->{MAX_AF} = $max_af;
1225
      $hash->{MAX_AF_POPS} = [];
1226
    }
16,040✔
1227
    
1228
    push @{$hash->{MAX_AF_POPS}}, @max_af_pops if $max_af >= $current_max;
1229
  }
1230

1231
  return $hash;
1232
}
1233

1234

1235
=head2 _add_custom_annotations_to_hash
1236

1237
  Arg 1      : hashref $vf_hash
1238
  Arg 2      : string $custom_name
1239
  Arg 3      : listref $annotations
1240
  Arg 4      : listref $annotations_statistics
1241
  Example    : $hashref = $of->_add_custom_annotations_to_hash($vf, $vf_hash, $ex);
1242
  Description: Adds custom annotation data to hash
1243
  Returntype : hashref
1244
  Exceptions : none
1245
  Caller     : VariationFeatureOverlapAllele_to_output_hash(),
1246
               VariationFeature_to_output_hash()
1247
  Status     : Stable
88✔
1248

1249
=cut
88✔
1250

72✔
1251
sub _add_custom_annotations_to_hash {
72✔
1252
  my ($self, $hash, $custom_name, $annots, $annots_stats) = @_;
72✔
1253

1254
  foreach my $annot(@$annots) {
1255
    push @{$hash->{$custom_name}}, $annot->{name};
1256
    foreach my $field(keys %{$annot->{fields} || {}}) {
88✔
1257
      push @{$hash->{$custom_name.'_'.$field}}, $annot->{fields}->{$field};
×
1258
    }
1259
  }
1260

88✔
1261
  foreach my $stat(keys %{$annots_stats || {}}) {
1262
    $hash->{$custom_name.'_'.$stat} = $annots_stats->{$stat};
1263
  }
1264

1265
  return $hash;
1266
}
1267

1268

1269
=head2 VariationFeatureOverlapAllele_to_output_hash
1270

1271
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele $vfoa
1272
  Arg 2      : hashref $vf_hash
1273
  Arg 3      : Bio::EnsEMBL::Variation::VariationFeature $vf
1274
  Example    : $hashref = $of->VariationFeatureOverlapAllele_to_output_hash($vfoa, $vf_hash, $vf);
1275
  Description: Adds generic information to hash applicable to all
1276
               VariationFeatureOverlapAllele sub-classes.
1277
  Returntype : hashref
1278
  Exceptions : none
1279
  Caller     : TranscriptVariationAllele_to_output_hash(),
1280
               RegulatoryFeatureVariationAllele_to_output_hash(),
1281
               MotifFeatureVariationAllele_to_output_hash(),
1282
               IntergenicVariationAllele_to_output_hash()
1283
  Status     : Stable
88,660✔
1284

88,660✔
1285
=cut
1286

88,660✔
1287
sub VariationFeatureOverlapAllele_to_output_hash {
1288
  my $self = shift;
1289
  my ($vfoa, $hash, $vf) = @_;
88,660✔
1290

88,660✔
1291
  my @ocs = sort {$a->rank <=> $b->rank} @{$vfoa->get_all_OverlapConsequences};
1292

1293
  # consequence type(s)
88,660✔
1294
  my $term_method = $self->{terms}.'_term';
1295
  $hash->{Consequence} = [map {$_->$term_method || $_->SO_term} @ocs];
1296

88,660✔
1297
  # impact
1298
  $hash->{IMPACT} = $ocs[0]->impact() if @ocs;
88,660✔
1299

1300
  # allele
×
1301
  $hash->{Allele} = $vfoa->variation_feature_seq; 
1302

1303
  if(defined($vfoa->{shift_hash})&& defined($vfoa->{shift_hash}->{hgvs_allele_string}) && $self->param('shift_genomic'))
88,660✔
1304
  {
1305
    $hash->{Allele} = $vfoa->{shift_hash}->{hgvs_allele_string};
1306
  }
88,660✔
1307
  # allele number
1308
  $hash->{ALLELE_NUM} = $vfoa->allele_number if $self->{allele_number};
88,660✔
1309

1310
  # reference allele
1311
  $hash->{REF_ALLELE} = $vf->ref_allele_string if $self->{show_ref_allele};
88,660✔
1312

1313
  # Capture uploaded allele
1314
  $hash->{UPLOADED_ALLELE} = ($vf->{original_allele_string} || $vf->{allele_string} || $vf->{class_SO_term} || "" ) if $self->param('uploaded_allele');
88,660✔
1315

652✔
1316
  # picked?
1317
  $hash->{PICK} = 1 if defined($vfoa->{PICK});
652✔
1318

652✔
1319
  # hgvs g.
1320
  if($self->{hgvsg}) {
1321
    # if offline mode then hgvsg_use_accession has to access chromosome synonyms from cache
1322
    if(!$vf->slice->adaptor && $self->{hgvsg_use_accession}) {
88,660✔
1323
      my $chr_syn = $self->config->{_chromosome_synonyms}->{($vf->{chr})};
712✔
1324
      my @new_chr_array = grep(/NC_/, keys %{$chr_syn});
1325

712✔
1326
      my %hgvs_genomic = %{ $vf->hgvs_genomic($vf->slice, $new_chr_array[0]) };
712✔
1327
      $vf->{_hgvs_genomic} ||= {};
1328
      foreach (keys %hgvs_genomic) {
712✔
1329
        $vf->{_hgvs_genomic}->{$_} = $hgvs_genomic{$_};
44✔
1330
      }
44✔
1331
    }
44✔
1332
    else {
44✔
1333
      my %hgvs_genomic = %{ $vf->hgvs_genomic($vf->slice, $self->{hgvsg_use_accession} ? undef : $vf->{chr}) };
1334
      $vf->{_hgvs_genomic} ||= {};
44✔
1335
      foreach (keys %hgvs_genomic) {
44✔
1336
        $vf->{_hgvs_genomic}->{$_} = $hgvs_genomic{$_};
1337
      }
1338
    }
1339

1340
    # for multi-allelic the alleles can be more than 2
1341
    my @all_alleles = split(/\//, $vf->{allele_string});
1342
    my $allele_string = scalar @all_alleles > 2 ? 
88,660✔
1343
      $all_alleles[$vfoa->allele_number] :
46✔
1344
      $all_alleles[1];
1345

1346
    if(my $hgvsg = $vf->{_hgvs_genomic}->{$allele_string}) {
1347
      $hash->{HGVSg} = $hgvsg; 
76✔
1348
    }
60✔
1349
  }
1350
  # spdi + ga4gh_vrs
1351
  if($self->{spdi} || $self->{ga4gh_vrs}) {
1352
    $vf->{_spdi_genomic} = $vf->spdi_genomic(); 
1353

1354
    if (my $spdi = $vf->{_spdi_genomic}->{$hash->{Allele}}) {
88,660✔
1355
      $hash->{SPDI} = $spdi if $self->{spdi};
1356

1357
      if ($self->{ga4gh_vrs} && $spdi =~ /^NC/) {
88,660✔
1358
        my $ga4gh_vrs = ga4gh_vrs_from_spdi($spdi);
22,592✔
1359
        if ($ga4gh_vrs) {
1360
          throw("ERROR: Cannot use --ga4gh_vrs without JSON module installed\n") unless $CAN_USE_JSON;
1361
          my $json = JSON->new;
88,660✔
1362
          # avoid encoding JSON twice in case of returning JSON output format
1363
          $ga4gh_vrs = $json->encode($ga4gh_vrs) if $self->{output_format} ne 'json';
1364
          $hash->{GA4GH_VRS} = $ga4gh_vrs;
1365
        }
1366
      }
1367
    }
1368
  }
1369

1370
  # custom annotations
1371
  foreach my $custom_name(keys %{$vf->{_custom_annotations} || {}}) {
1372
    $self->_add_custom_annotations_to_hash(
1373
      $hash,
1374
      $custom_name,
1375
      [
1376
        grep {$_->{allele} && ($_->{allele} eq $hash->{Allele})}
1377
        @{$vf->{_custom_annotations}->{$custom_name}}
1378
      ]
1379
    );
1380
  }
86,608✔
1381

86,608✔
1382
  # colocated
1383
  $self->add_colocated_variant_info($vf, $hash);
86,608✔
1384

86,608✔
1385
  # frequency data
86,608✔
1386
  foreach my $ex(@{$vf->{existing} || []}) {
1387
    $self->add_colocated_frequency_data($vf, $hash, $ex, $vfoa->{shift_hash});
1388
  }
86,608✔
1389

86,608✔
1390
  return $hash;
86,608✔
1391
}
1392

1393

86,608✔
1394
=head2 BaseTranscriptVariationAllele_to_output_hash
1395

1396
  Arg 1      : Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele $btva
86,608✔
1397
  Arg 2      : hashref $vf_hash
1398
  Example    : $hashref = $of->BaseTranscriptVariationAllele_to_output_hash($btva, $vf_hash);
86,608✔
1399
  Description: Adds information to hash applicable to transcript.
1400
  Returntype : hashref
1401
  Exceptions : none
86,608✔
1402
  Caller     : TranscriptVariationAllele_to_output_hash(),
86,608✔
1403
               TranscriptStructuralVariationAllele_to_output_hash()
1404
  Status     : Stable
1405

86,608✔
1406
=cut
17,876✔
1407

15,252✔
1408
sub BaseTranscriptVariationAllele_to_output_hash {
15,252✔
1409
  my $self = shift;
1410
  my ($vfoa, $hash) = @_;
1411

17,876✔
1412
  my $tv = $vfoa->base_variation_feature_overlap;
1,660✔
1413
  my $tr = $tv->transcript;
1,084✔
1414
  my $pre = $vfoa->_pre_consequence_predicates;
1415

1416
  # basics
1417
  $hash->{Feature_type} = 'Transcript';
1418
  $hash->{Feature}      = $tr->stable_id if $tr;
1419
  $hash->{Feature}     .= '.'.$tr->version if $hash->{Feature} && $self->{transcript_version} && $tr->version && $hash->{Feature} !~ /\.\d+$/;
86,608✔
1420

14,144✔
1421
  # get gene
1422
  $hash->{Gene} = $tr->{_gene_stable_id};
14,144✔
1423
  $hash->{Gene} .= '.'.$tr->{_gene_version} if $self->{gene_version} && $tr->{_gene_version} && $hash->{Gene} !~ /\.\d+$/;
1424

14,144✔
1425
  # strand
1426
  $hash->{STRAND} = $tr->strand + 0;
1427

55,908✔
1428
  my @attribs = @{$tr->get_all_Attributes()};
1429

1430
  # flags
55,908✔
1431
  my @flags = grep {substr($_, 0, 4) eq 'cds_'} map {$_->{code}} @attribs;
1432
  $hash->{FLAGS} = \@flags if scalar @flags;
55,908✔
1433

1434
  # exon/intron numbers
1435
  if($self->{numbers}) {
14,144✔
1436
    if($pre->{exon}) {
1437
      if(my $num = $tv->exon_number) {
1438
        $hash->{EXON} = $num;
1439
      }
86,608✔
1440
    }
8,068✔
1441
    if($pre->{intron}) {
1442
      if(my $num = $tv->intron_number) {
1443
        $hash->{INTRON} = $num;
1444
      }
86,608✔
1445
    }
17,872✔
1446
  }
17,872✔
1447

17,872✔
1448
  # protein domains
1449
  if($self->{domains} && $pre->{coding}) {
17,872✔
1450
    my $feats = $tv->get_overlapping_ProteinFeatures;
1451

17,872✔
1452
    my @strings;
1453

17,872✔
1454
    for my $feat (@$feats) {
17,872✔
1455

1456
      # do a join/grep in case self missing data
1457
      my $label = join(':', grep {$_} ($feat->analysis->display_label, $feat->hseqname));
1458

43,304✔
1459
      # replace any special characters
1460
      $label =~ s/[\s;=]/_/g;
1461

43,304✔
1462
      push @strings, $label;
1463
    }
1464

43,304✔
1465
    $hash->{DOMAINS} = \@strings if @strings;
1466
  }
1467

43,304✔
1468
  # distance to transcript
1469
  if(grep {$DISTANCE_CONS{$_}} @{$hash->{Consequence} || []}) {
1470
    $hash->{DISTANCE} = $tv->distance_to_transcript;
86,608✔
1471
  }
852✔
1472

852✔
1473
  # gene symbol
1474
  if($self->{symbol}) {
1475
    my $symbol  = $tr->{_gene_symbol} || $tr->{_gene_hgnc};
86,608✔
1476
    my $source  = $tr->{_gene_symbol_source};
4✔
1477
    my $hgnc_id = $tr->{_gene_hgnc_id} if defined($tr->{_gene_hgnc_id});
1478

1479
    $hash->{SYMBOL} = $symbol if defined($symbol) && $symbol ne '-';
1480
    ## encode spaces in symbol name for VCF
43,304✔
1481
    $hash->{SYMBOL} =~ s/\s/\%20/g if $hash->{SYMBOL} && $self->{output_format} eq 'vcf' && !$self->{no_escape};
1482

1483
    $hash->{SYMBOL_SOURCE} = $source if defined($source) && $source ne '-';
43,304✔
1484
    $hash->{HGNC_ID} = $hgnc_id if defined($hgnc_id) && $hgnc_id ne '-';
1485
  }
1486

86,608✔
1487
  # CCDS
17,876✔
1488
  $hash->{CCDS} = $tr->{_ccds} if
71,504✔
1489
    $self->{ccds} &&
71,504✔
1490
    defined($tr->{_ccds}) &&
71,504✔
1491
    $tr->{_ccds} ne '-';
1492

1493
  # refseq xref
1494
  $hash->{RefSeq} = [split(',', $tr->{_refseq})] if
1495
    $self->{xref_refseq} &&
86,608✔
1496
    defined($tr->{_refseq}) &&
1497
    $tr->{_refseq} ne '-';
1498

86,608✔
1499
  # refseq match info
1500
  if($self->{refseq} || $self->{merged}) {
1501
    my @rseq_attrs = grep {$_->code =~ /^rseq/} @attribs;
86,608✔
1502
    $hash->{REFSEQ_MATCH} = [map {$_->code} @rseq_attrs] if scalar @rseq_attrs;
1503
  }
1504

86,608✔
1505
  if(my $status = $tr->{_bam_edit_status}) {
86,608✔
1506
    $hash->{BAM_EDIT} = uc($status);
32✔
1507
  }
32✔
1508

1509
  # protein ID
1510
  $hash->{ENSP} = $tr->{_protein} if
1511
    $self->{protein} &&
86,608✔
1512
    defined($tr->{_protein}) &&
×
1513
    $tr->{_protein} ne '-';
×
1514

1515
  # uniprot
1516
  if($self->{uniprot}) {
1517
    for my $db(qw(swissprot trembl uniparc uniprot_isoform)) {
1518
      my $id = $tr->{'_'.$db};
86,608✔
1519
      $id = undef if defined($id) && $id eq '-';
17,872✔
1520
      $hash->{uc($db)} = [split(',', $id)] if defined($id);
17,532✔
1521
    }
1522
  }
1523

1524
  # canonical transcript
1525
  $hash->{CANONICAL} = 'YES' if $self->{canonical} && $tr->is_canonical;
86,608✔
1526

8,380✔
1527
  # biotype
8,380✔
1528
  $hash->{BIOTYPE} = $tr->biotype if $self->{biotype} && $tr->biotype;
8,380✔
1529

8,380✔
1530
  # source cache self transcript if using --merged
1531
  $hash->{SOURCE} = $tr->{_source_cache} if defined $tr->{_source_cache};
1532

1533
  # gene phenotype
1534
  $hash->{GENE_PHENO} = 1 if $self->{gene_phenotype} && $tr->{_gene_phenotype};
86,608✔
1535
  if($self->{mane_select} && (my ($mane) = grep {$_->code eq 'MANE_Select'} @attribs)) {
36✔
1536
    if(my $mane_value = $mane->value) {
1537
      $hash->{MANE_SELECT} = $mane_value;
12✔
1538
    }
1539

12✔
1540
    push @{ $hash->{MANE} }, 'MANE_Select';
1541
  }
12✔
1542

1543
  if($self->{mane_plus_clinical} && (my ($mane) = grep {$_->code eq 'MANE_Plus_Clinical'} @attribs)) {
1544
    if(my $mane_value = $mane->value) {
1545
      $hash->{MANE_PLUS_CLINICAL} = $mane_value;
1546
    }
1547

12✔
1548
    push @{ $hash->{MANE} }, 'MANE_Plus_Clinical';
1549
  }
1550
 
12✔
1551
  # Gencode primary
12✔
1552
  if($self->{flag_gencode_primary} && (my ($gencode_primary) = grep {$_->code eq 'gencode_primary'} @attribs)) {
132✔
1553
    $hash->{GENCODE_PRIMARY} = 1;
132✔
1554
  }
1555
  
1556
  # transcript support level
1557
  if($self->{tsl} && (my ($tsl) = grep {$_->code eq 'TSL'} @attribs)) {
12✔
1558
    if($tsl->value =~ m/tsl(\d+)/) {
12✔
1559
      $hash->{TSL} = $1 if $1;
16✔
1560
    }
16✔
1561
  }
16✔
1562

1563
  # APPRIS
1564
  if($self->{appris} && (my ($appris) = grep {$_->code eq 'appris'} @attribs)) {
1565
    if(my $value = $appris->value) {
12✔
1566
      $value =~ s/principal/P/;
1567
      $value =~ s/alternative/A/;
1568
      $hash->{APPRIS} = $value;
1569
    }
1570
  }
1571

12✔
1572
  # miRNA structure
1573
  if($self->{mirna} && $tr->biotype eq 'miRNA' &&
1574
     (my ($mirna_attrib) = grep {$_->code eq 'ncRNA'} @attribs)) {
1575

86,608✔
1576
    my ($start, $end, $struct) = split /\s+|\:/, $mirna_attrib->value;
1577

1578
    my ($cdna_start, $cdna_end) = ($tv->cdna_start, $tv->cdna_end);
1579

1580
    if(
1581
      defined($struct) && $struct =~ /[\(\.\)]+/ &&
1582
      $start && $end && $cdna_start && $cdna_end &&
1583
      overlap($start, $end, $cdna_start, $cdna_end)
1584
    ) {
1585
      # account for insertions
1586
      ($cdna_start, $cdna_end) = ($cdna_end, $cdna_start) if $cdna_start > $cdna_end;
1587
    
1588
      # parse out structure
1589
      my @struct;
1590
      while($struct =~ m/([\.\(\)])([0-9]+)?/g) {
1591
        my $num = $2 || 1;
1592
        push @struct, $1 for(1..$num);
1593
      }
86,476✔
1594
      
86,476✔
1595
      # get struct element types overlapped by variant
1596
      my %chars;
1597
      for my $pos($cdna_start..$cdna_end) {
86,476✔
1598
        $pos -= $start;
86,476✔
1599
        next if $pos < 0 or $pos > scalar @struct;
1600
        $chars{$struct[$pos]} = 1;
86,476✔
1601
      }
86,476✔
1602
      
1603
      # map element types to SO terms
86,476✔
1604
      my %map = (
1605
        '(' => 'miRNA_stem',
86,476✔
1606
        ')' => 'miRNA_stem',
1607
        '.' => 'miRNA_loop'
86,476✔
1608
      );
86,476✔
1609
      
1610
      $hash->{miRNA} = [sort map {$map{$_}} keys %chars];
86,476✔
1611
    }
1612
  }
86,476✔
1613
  return $hash;
1614
}
×
1615

×
1616

1617
=head2 TranscriptVariationAllele_to_output_hash
1618

86,476✔
1619
  Arg 1      : Bio::EnsEMBL::Variation::TranscriptVariationAllele $tva
1620
  Arg 2      : hashref $vf_hash
86,476✔
1621
  Example    : $hashref = $of->TranscriptVariationAllele_to_output_hash($tva, $vf_hash);
1622
  Description: Adds information to hash applicable to transcript and allele combination.
86,476✔
1623
  Returntype : hashref
1624
  Exceptions : none
1625
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
78,428✔
1626
  Status     : Stable
73,224✔
1627

73,224✔
1628
=cut
1629

73,224✔
1630
sub TranscriptVariationAllele_to_output_hash {
1631
  my $self = shift;
1632
  my ($vfoa, $hash) = @_;
73,224✔
1633

1634
  # run "super" methods
67,364✔
1635
  $hash = $self->VariationFeatureOverlapAllele_to_output_hash(@_);
67,364✔
1636
  $hash = $self->BaseTranscriptVariationAllele_to_output_hash(@_);
67,364✔
1637
  
1638
  my $shift_length = (defined($vfoa->{shift_hash}) ? $vfoa->{shift_hash}->{shift_length} : 0);
67,364✔
1639
  $shift_length ||= 0;
33,682✔
1640
  
33,682✔
1641
  return undef unless $hash;
1642

67,364✔
1643
  $hash->{SHIFT_LENGTH} = $shift_length if ($self->param('shift_3prime') && $self->param('shift_length')); 
33,682✔
1644

33,682✔
1645
  my $tv = $vfoa->base_variation_feature_overlap;
1646
  my $tr = $tv->transcript;
67,364✔
1647
  
1648
  my $strand = defined($tr->strand) ? $tr->strand : 1;
1649

78,428✔
1650
  if($self->{shift_genomic})
1651
  {
78,428✔
1652
    my $vf = $vfoa->variation_feature;
46,836✔
1653
    $hash->{Location} = ($vf->{chr} || $vf->seq_region_name).':'.format_coords($vf->{start} + ($shift_length * $strand), $vf->{end} + ($shift_length * $strand));
46,836✔
1654
  }
1655

46,836✔
1656
  my $vep_cache = $tr->{_variation_effect_feature_cache};
46,836✔
1657

1658
  my $pre = $vfoa->_pre_consequence_predicates();
1659

78,428✔
1660
  # Only for RefSeq annotations
46,836✔
1661
  # If invalid_alleles is set to 1 then the ref and alt alleles are the same -> this is an invalid variant
46,836✔
1662
  # Print mismatch warning only if:
46,836✔
1663
  #  - there is transcripts mismatch
1664
  #  - use_given_ref is not defined
1665
  if($vfoa->{invalid_alleles} && !$self->param('use_given_ref')) {
46,836✔
1666
    $self->warning_msg("Transcript-assembly mismatch in ".$hash->{Uploaded_variation});
1667
  }
46,836✔
1668

46,836✔
1669
  if($pre->{within_feature}) {
1670

78,428✔
1671
    # exonic only
1672
    if($pre->{exon}) {
1673
      my $shifting_offset = $shift_length * $strand;
1674
      $hash->{cDNA_position}  = format_coords($tv->cdna_start(undef,$shifting_offset), $tv->cdna_end(undef,$shifting_offset));  
1675

86,476✔
1676
      $hash->{cDNA_position} .= '/'.$tr->length if $self->{total_length};
40✔
1677

40✔
1678
      # coding only
40✔
1679
      if($pre->{coding}) {
40✔
1680

1681
        $hash->{Amino_acids} = $vfoa->pep_allele_string;
1682
        $hash->{Codons}      = $vfoa->display_codon_allele_string;
86,476✔
1683
        $shifting_offset = 0 if defined($tv->{_boundary_shift}) && $tv->{_boundary_shift} == 1;
1684

1685
        $hash->{CDS_position}  = format_coords($tv->cds_start, $tv->cds_end);
1686
        $hash->{CDS_position} .= '/'.length($vep_cache->{translateable_seq})
1687
          if $self->{total_length} && $vep_cache->{translateable_seq};
1688

1689
        $hash->{Protein_position}  = format_coords($tv->translation_start(undef, $shifting_offset), $tv->translation_end(undef, $shifting_offset));
1690
        $hash->{Protein_position} .= '/'.length($vep_cache->{peptide})
1691
          if $self->{total_length} && $vep_cache->{peptide};
1692

1693
        $self->add_sift_polyphen($vfoa, $hash);
1694
      }
1695
    }
1696
    my $strand = $tr->strand() > 0 ? 1 : -1;
1697
    # HGVS
1698
    if($self->{hgvsc}) {
1699
      my $hgvs_t = $vfoa->hgvs_transcript(undef, !$self->param('shift_3prime'));
1700
      my $offset = $vfoa->hgvs_offset;
67,364✔
1701

67,364✔
1702
      $hash->{HGVSc} = $hgvs_t if $hgvs_t;
1703
      $hash->{HGVS_OFFSET} = $offset * $strand if $offset && $hgvs_t;
67,364✔
1704
    }
134,728✔
1705

1706
    if($self->{hgvsp}) {
134,728✔
1707
      $vfoa->{remove_hgvsp_version} = 1 if $self->{remove_hgvsp_version};
28,304✔
1708
      my $hgvs_p = $vfoa->hgvs_protein(undef, $self->{hgvsp_use_prediction});
28,304✔
1709
      my $offset = $vfoa->hgvs_offset;
28,304✔
1710

1711
      # URI encode "="
28,304✔
1712
      $hgvs_p =~ s/\=/\%3D/g if $hgvs_p && !$self->{no_escape};
28,288✔
1713

28,288✔
1714
      $hash->{HGVSp} = $hgvs_p if $hgvs_p;
1715
      $hash->{HGVS_OFFSET} = $offset * $strand if $offset && $hgvs_p;
1716
    }
28,304✔
1717
    $hash->{REFSEQ_OFFSET} = $vfoa->{refseq_misalignment_offset} if defined($vfoa->{refseq_misalignment_offset}) && $vfoa->{refseq_misalignment_offset} != 0;
1718
  }
28,304✔
1719

28,304✔
1720

28,304✔
1721

1722
  if($self->{use_transcript_ref}) {
28,304✔
1723
    my $ref_tva = $tv->get_reference_TranscriptVariationAllele;
1724
    $hash->{USED_REF} = $ref_tva->variation_feature_seq;
28,304✔
1725
    $hash->{USED_REF} = $ref_tva->{shift_hash}->{ref_orig_allele_string} if !$self->{shift_3prime} && defined($ref_tva->{shift_hash});
1726
    $hash->{GIVEN_REF} = $ref_tva->{given_ref};
18,216✔
1727
  }
18,208✔
1728

18,208✔
1729
  return $hash;
18,208✔
1730
}
1731

1732

18,216✔
1733
=head2 add_sift_polyphen
18,208✔
1734

1735
  Arg 1      : Bio::EnsEMBL::Variation::TranscriptVariationAllele $tva
18,208✔
1736
  Arg 2      : hashref $vf_hash
18,208✔
1737
  Example    : $hashref = $of->add_sift_polyphen($tva, $vf_hash);
18,200✔
1738
  Description: Adds SIFT and PolyPhen data to hash.
1739
  Returntype : hashref
1740
  Exceptions : none
8✔
1741
  Caller     : TranscriptVariationAllele_to_output_hash()
1742
  Status     : Stable
1743

1744
=cut
1745

1746
sub add_sift_polyphen {
1747
  my $self = shift;
28,304✔
1748
  my ($vfoa, $hash) = @_;
1749

1750
  foreach my $tool (qw(SIFT PolyPhen)) {
1751
    my $lc_tool = lc($tool);
67,364✔
1752

1753
    if (my $opt = $self->{$lc_tool}) {
1754
      my $want_pred  = $opt =~ /^p/i;
1755
      my $want_score = $opt =~ /^s/i;
1756
      my $want_both  = $opt =~ /^b/i;
1757

1758
      if ($want_both) {
1759
        $want_pred  = 1;
1760
        $want_score = 1;
1761
      }
1762

1763
      next unless $want_pred || $want_score;
1764

1765
      my $pred_meth  = $lc_tool.'_prediction';
1766
      my $score_meth = $lc_tool.'_score';
1767
      my $analysis   = $self->{polyphen_analysis} if $lc_tool eq 'polyphen';
1768

1769
      my $pred = $vfoa->$pred_meth($analysis);
824✔
1770

824✔
1771
      if($pred) {
1772

1773
        if ($want_pred) {
824✔
1774
          $pred =~ s/\s+/\_/g;
824✔
1775
          $pred =~ s/\_\-\_/\_/g;
1776
          $hash->{$tool} = $pred;
824✔
1777
        }
1778

824✔
1779
        if ($want_score) {
824✔
1780
          my $score = $vfoa->$score_meth($analysis);
824✔
1781

1782
          if(defined $score) {
824✔
1783
            if($want_pred) {
1784
              $hash->{$tool} .= "($score)";
824✔
1785
            }
1786
            else {
1787
              $hash->{$tool} = $score;
1788
            }
1789
          }
1790
        }
1791
      }
1792

1793
      # update stats
1794
      $self->stats->log_sift_polyphen($tool, $pred) if $pred && !$self->{no_stats};
1795
    }
1796
  }
1797

1798
  return $hash;
1799
}
1800

1801

1802
=head2 RegulatoryFeatureVariationAllele_to_output_hash
700✔
1803

700✔
1804
  Arg 1      : Bio::EnsEMBL::Variation::RegulatoryFeatureVariationAllele $rfva
1805
  Arg 2      : hashref $vf_hash
1806
  Example    : $hashref = $of->RegulatoryFeatureVariationAllele_to_output_hash($rfva, $vf_hash);
700✔
1807
  Description: Adds information to hash applicable to regulatory feature and allele combination.
700✔
1808
  Returntype : hashref
1809
  Exceptions : none
700✔
1810
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
1811
  Status     : Stable
1812

1813
=cut
700✔
1814

700✔
1815
sub RegulatoryFeatureVariationAllele_to_output_hash {
700✔
1816
  my $self = shift;
1817
  my ($vfoa, $hash) = @_;
700✔
1818

700✔
1819
  # run "super" method
700✔
1820
  $hash = $self->VariationFeatureOverlapAllele_to_output_hash(@_);
700✔
1821
  return undef unless $hash;
700✔
1822

700✔
1823
  my $rf = $vfoa->regulatory_feature;
700✔
1824

1,732✔
1825
  $hash->{Feature_type} = 'RegulatoryFeature';
1826
  $hash->{Feature}      = $rf->stable_id;
700✔
1827
  $hash->{CELL_TYPE}    = $self->get_cell_types($rf) if $self->{cell_type};
700✔
1828

700✔
1829
  $hash->{BIOTYPE} = ref($rf->{feature_type}) ? $rf->{feature_type}->{so_name} : $rf->{feature_type} if defined($rf->{feature_type});
700✔
1830

700✔
1831
  return $hash;
1832
}
700✔
1833

700✔
1834

1835
=head2 MotifFeatureVariationAllele_to_output_hash
700✔
1836

1837
  Arg 1      : Bio::EnsEMBL::Variation::MotifFeatureVariationAllele $mfva
1838
  Arg 2      : hashref $vf_hash
1839
  Example    : $hashref = $of->MotifFeatureVariationAllele_to_output_hash($mfva, $vf_hash);
1840
  Description: Adds information to hash applicable to motif feature and allele combination.
1841
  Returntype : hashref
1842
  Exceptions : none
1843
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
1844
  Status     : Stable
1845

1846
=cut
1847

1848
sub MotifFeatureVariationAllele_to_output_hash {
1849
  my $self = shift;
1850
  my ($vfoa, $hash) = @_;
1851

1852
  # run "super" method
1853
  $hash = $self->VariationFeatureOverlapAllele_to_output_hash(@_);
8✔
1854
  return undef unless $hash;
8✔
1855

1856
  my $mf = $vfoa->motif_feature;
1857

8✔
1858
  # check that the motif has a binding matrix, if not there's not
8✔
1859
  # much we can do so don't return anything
8✔
1860
  return undef unless defined $mf->get_BindingMatrix;
8✔
1861
  my $matrix = $mf->get_BindingMatrix;
1862
  my $matrix_id = $matrix->stable_id;
1863

1864
  my $mf_stable_id = $mf->stable_id;
1865
  $hash->{Feature_type} = 'MotifFeature';
1866
  $hash->{Feature}      = $mf_stable_id;
1867
  $hash->{MOTIF_NAME}   = $matrix_id;
1868
  my @transcription_factors = ();
1869
  my $associated_transcription_factor_complexes = $matrix->{associated_transcription_factor_complexes};
1870
  foreach my $tfc (@{$associated_transcription_factor_complexes}) {
1871
    push @transcription_factors, $tfc->{display_name};
1872
  }
1873
  $hash->{TRANSCRIPTION_FACTORS} = \@transcription_factors;
1874
  $hash->{STRAND}       = $mf->strand + 0;
1875
  $hash->{CELL_TYPE}    = $self->get_cell_types($mf) if $self->{cell_type};
1876
  $hash->{MOTIF_POS}    = $vfoa->motif_start if defined $vfoa->motif_start;
1877
  $hash->{HIGH_INF_POS} = ($vfoa->in_informative_position ? 'Y' : 'N');
1878

1879
  my $delta = $vfoa->motif_score_delta if $vfoa->variation_feature_seq =~ /^[ACGT]+$/;
1880
  $hash->{MOTIF_SCORE_CHANGE} = sprintf("%.3f", $delta) if defined $delta;
632✔
1881

632✔
1882
  return $hash;
632✔
1883
}
1884

1885

632✔
1886
=head2 get_cell_types
×
1887

×
1888
  Arg 1      : Bio::EnsEMBL::Funcgen::MotifFeature or Bio::EnsEMBL::Funcgen::RegulatoryFeature $ft
×
1889
  Example    : $cell_types = $of->get_cell_types($ft);
1890
  Description: Gets cell types for this regulatory or motif feature
1891
  Returntype : listref of strings
1892
  Exceptions : none
1893
  Caller     : MotifFeatureVariationAllele_to_output_hash(),
1894
               RegulatoryFeatureVariationAllele_to_output_hash()
632✔
1895
  Status     : Stable
1896

1897
=cut
1898

1899
sub get_cell_types {
1900
  my $self = shift;
1901
  my $ft = shift;
1902

1903
  return [
1904
    map {s/\s+/\_/g; $_}
1905
    map {$_.':'.$ft->{cell_types}->{$_}}
1906
    grep {$ft->{cell_types}->{$_}}
1907
    @{$self->{cell_type}}
1908
  ];
1909
}
1910

1911

1912
=head2 IntergenicVariationAllele_to_output_hash
1913

1914
  Arg 1      : Bio::EnsEMBL::Variation::IntergenicVariationAllele $iva
1915
  Arg 2      : hashref $vf_hash
1916
  Example    : $hashref = $of->IntergenicVariationAllele_to_output_hash($iva, $vf_hash);
1917
  Description: Just a placeholder really as no extra information is added for
76✔
1918
               intergenic variants.
76✔
1919
  Returntype : hashref
1920
  Exceptions : none
76✔
1921
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
1922
  Status     : Stable
76✔
1923

1924
=cut
1925

76✔
1926
sub IntergenicVariationAllele_to_output_hash {
1927
  my $self = shift;
76✔
1928
  my $iva = shift;
1929
  my $hash = shift;
1930
    
76✔
1931
  ## By default, shifting in the 3' direction will not update the 'location' field unless '--shift_genomic' is supplied
76✔
1932
  unless(!$self->{shift_3prime} || !$self->param('shift_genomic')) { #if shifting without shift genomic, we still want to run $iva->genomic_shift
1933
    $iva->genomic_shift;
1934
    if (defined($iva->{shift_hash}->{shift_length}) && $self->param('shift_genomic')) {
76✔
1935
      my $vf = $iva->variation_feature;
1936
      $hash->{Location} = ($vf->{chr} || $vf->seq_region_name).':'.
1937
        format_coords($vf->{start} + $iva->{shift_hash}->{shift_length}, $vf->{end} + $iva->{shift_hash}->{shift_length});
1938
    }
1939
  }
1940
  
1941
  return $self->VariationFeatureOverlapAllele_to_output_hash($iva, $hash, @_);
1942
}
76✔
1943

1944

76✔
1945
### SV-type methods
1946
###################
1947

1948

1949
=head2 BaseStructuralVariationOverlapAllele_to_output_hash
1950

1951
  Arg 1      : Bio::EnsEMBL::Variation::BaseStructuralVariationOverlapAllele $bsvoa
1952
  Arg 2      : hashref $vf_hash
1953
  Example    : $hashref = $of->BaseStructuralVariationOverlapAllele_to_output_hash($bsvoa, $vf_hash);
1954
  Description: Adds basic information to hash applicable to all
1955
               StructuralVariationOverlapAllele classes.
1956
  Returntype : hashref
1957
  Exceptions : none
1958
  Caller     : StructuralVariationOverlapAllele_to_output_hash()
1959
  Status     : Stable
1960

1961
=cut
1962

1963
sub BaseStructuralVariationOverlapAllele_to_output_hash {
64✔
1964
  my $self = shift;
64✔
1965
  my ($vfoa, $hash) = @_;
1966

1967
  my $svf = $vfoa->base_variation_feature;
64✔
1968

64✔
1969
  $hash->{Allele} = $vfoa->{breakend}->{string} || $svf->class_SO_term;
1970

64✔
1971
  # allele number
64✔
1972
  $hash->{ALLELE_NUM} = $vfoa->allele_number if $self->{allele_number};
1973

1974
  my @ocs = sort {$a->rank <=> $b->rank} @{$vfoa->get_all_OverlapConsequences};
64✔
1975

1976
  # consequence type(s)
64✔
1977
  my $term_method = $self->{terms}.'_term';
64✔
1978
  $hash->{Consequence} = [map {$_->$term_method || $_->SO_term} @ocs];
1979

1980
  # impact
64✔
1981
  $hash->{IMPACT} = $ocs[0]->impact() if @ocs;
64✔
1982

64✔
1983
  # allele number
64✔
1984
  # if($self->{allele_number}) {
64✔
1985
  #   $hash->{ALLELE_NUM} = $vfoa->allele_number if $vfoa->can('allele_number');
1986
  # }
64✔
1987

64✔
1988
  # picked?
1989
  $hash->{PICK} = 1 if defined($vfoa->{PICK});
1990

1991
  return $hash;
32✔
1992
}
32✔
1993

1994

64✔
1995
=head2 StructuralVariationOverlapAllele_to_output_hash
1996

1997
  Arg 1      : Bio::EnsEMBL::Variation::StructuralVariationOverlapAllele $svoa
1998
  Arg 2      : hashref $vf_hash
1999
  Example    : $hashref = $of->StructuralVariationOverlapAllele_to_output_hash($svoa, $vf_hash);
2000
  Description: Adds basic information to hash applicable to all
2001
               StructuralVariationOverlapAllele classes.
2002
  Returntype : hashref
2003
  Exceptions : none
2004
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
2005
  Status     : Stable
2006

2007
=cut
2008

2009
sub StructuralVariationOverlapAllele_to_output_hash {
2010
  my $self = shift;
2011
  my ($vfoa, $hash) = @_;
2012

40✔
2013
  # run "super" method
40✔
2014
  $hash = $self->BaseStructuralVariationOverlapAllele_to_output_hash(@_);
2015
  return undef unless $hash;
2016

40✔
2017
  my $feature = $vfoa->feature;
40✔
2018
  my $svf = $vfoa->base_variation_feature;
40✔
2019

2020
  # get feature type
40✔
2021
  my $feature_type = (split '::', ref($feature))[-1];
40✔
2022

40✔
2023
  $hash->{Feature_type} = $feature_type;
40✔
2024
  $hash->{Feature}      = $feature->stable_id;
2025

40✔
2026
  # work out overlap amounts (except for breakpoints)
2027
  if ($svf->class_SO_term !~ /breakpoint/) {
2028
    my $overlap_start  = (sort {$a <=> $b} ($svf->start, $feature->start))[-1];
28✔
2029
    my $overlap_end    = (sort {$a <=> $b} ($svf->end, $feature->end))[0];
2030
    my $overlap_length = ($overlap_end - $overlap_start) + 1;
28✔
2031
    my $overlap_pc     = 100 * ($overlap_length / (($feature->end - $feature->start) + 1));
28✔
2032

2033
    $hash->{OverlapBP} = $overlap_length if $overlap_length > 0;
2034
    $hash->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0;
28✔
2035
  }
2036

16✔
2037
  # cell types
8✔
2038
  $hash->{CELL_TYPE} = $self->get_cell_types($feature)
8✔
2039
    if $self->{cell_type} && $feature_type =~ /(Motif|Regulatory)Feature/;
2040

16✔
2041
  return $hash;
8✔
2042
}
8✔
2043

2044

2045
=head2 TranscriptStructuralVariationAllele_to_output_hash
2046

2047
  Arg 1      : Bio::EnsEMBL::Variation::TranscriptStructuralVariationAllele $tsva
40✔
2048
  Arg 2      : hashref $vf_hash
2049
  Example    : $hashref = $of->TranscriptStructuralVariationAllele_to_output_hash($tsva, $vf_hash);
2050
  Description: Adds transcript information to hash for SV overlaps.
2051
  Returntype : hashref
2052
  Exceptions : none
2053
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
2054
  Status     : Stable
2055

2056
=cut
2057

2058
sub TranscriptStructuralVariationAllele_to_output_hash {
2059
  my $self = shift;
2060
  my ($vfoa, $hash) = @_;
2061

2062
  # run "super" methods
2063
  $hash = $self->StructuralVariationOverlapAllele_to_output_hash(@_);
2064
  $hash = $self->BaseTranscriptVariationAllele_to_output_hash(@_);
2065
  return undef unless $hash;
2066

4✔
2067
  my $svo = $vfoa->base_variation_feature_overlap;
4✔
2068
  my $pre = $vfoa->_pre_consequence_predicates;
2069
  my $tr  = $svo->transcript;
2070
  my $vep_cache = $tr->{_variation_effect_feature_cache};
2071

2072
  if($pre->{within_feature}) {
2073

2074
    # exonic only
2075
    if($pre->{exon}) {
2076

2077
      $hash->{cDNA_position}  = format_coords($svo->cdna_start, $svo->cdna_end);
2078
      $hash->{cDNA_position} .= '/'.$tr->length if $self->{total_length};
2079

2080
      # coding only
2081
      if($pre->{coding}) {
2082

2083
        $hash->{CDS_position}  = format_coords($svo->cds_start, $svo->cds_end);
2084
        $hash->{CDS_position} .= '/'.length($vep_cache->{translateable_seq})
2085
          if $self->{total_length} && $vep_cache->{translateable_seq};
2086

2087
        $hash->{Protein_position}  = format_coords($svo->translation_start, $svo->translation_end);
89,192✔
2088
        $hash->{Protein_position} .= '/'.length($vep_cache->{peptide})
2089
          if $self->{total_length} && $vep_cache->{peptide};
2090
      }
2091
    }
2092
  }
2093

2094
  return $hash;
2095
}
2096

2097

2098
=head2 IntergenicStructuralVariationAllele_to_output_hash
2099

2100
  Arg 1      : Bio::EnsEMBL::Variation::IntergenicStructuralVariationAllele $isva
2101
  Arg 2      : hashref $vf_hash
2102
  Example    : $hashref = $of->IntergenicStructuralVariationAllele_to_output_hash($isva, $vf_hash);
2103
  Description: Just a placeholder really as no extra information is added for
2104
               intergenic variants.
2105
  Returntype : hashref
2106
  Exceptions : none
88,620✔
2107
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
2108
  Status     : Stable
88,620✔
2109

2110
=cut
88,620✔
2111

2112
sub IntergenicStructuralVariationAllele_to_output_hash {
2113
  my $self = shift;
2,992✔
2114
  return $self->BaseStructuralVariationOverlapAllele_to_output_hash(@_);
2115
}
2116

2,992✔
2117

2118
### Other methods
2,992✔
2119
#################
2,992✔
2120

2121

2,988✔
2122
=head2 plugins
2,984✔
2123

2,980✔
2124
  Example    : $plugins = $of->plugins();
2,980✔
2125
  Description: Gets all plugins to be run.
2126
  Returntype : listref of Bio::EnsEMBL::Variation::Utils::BaseVepPlugin
2127
  Exceptions : none
2128
  Caller     : run_plugins(), get_plugin_headers()
4✔
2129
  Status     : Stable
2130

2131
=cut
2132

2133
sub plugins {
4✔
2134
  return $_[0]->{plugins} || [];
2135
}
2136

2,992✔
2137

4✔
2138
=head2 run_plugins
2139

2140
  Arg 1      : Bio::EnsEMBL::Variation::BaseVariationFeatureOverlapAllele $bvfoa
2141
  Arg 2      : hashref $vf_hash
2142
  Arg 3      : Bio::EnsEMBL::Variation::VariationFeature $vf
2143
  Example    : $vf_hash = $of->run_plugins($bvfoa, $vf_hash, $vf);
2,992✔
2144
  Description: Runs all plugins for given BaseVariationFeatureOverlapAllele
2145
  Returntype : hashref
2146
  Exceptions : none
2147
  Caller     : get_all_VariationFeatureOverlapAllele_output_hashes()
2148
  Status     : Stable
88,620✔
2149

2150
=cut
2151

2152
sub run_plugins {
2153
  my ($self, $bvfoa, $line_hash, $vf) = @_;
2154

2155
  my $skip_line = 0;
2156

2157
  for my $plugin (@{ $self->plugins || [] }) {
2158

2159
    # check that this plugin is interested in this type of variation feature
2160
    if($plugin->check_variant_feature_type(ref($vf || $bvfoa->base_variation_feature))) {
2161

2162
      # check that this plugin is interested in this type of feature
2163
      if($plugin->check_feature_type(ref($bvfoa->feature) || 'Intergenic')) {
2164

2165
        eval {
2166
          my $plugin_results = $plugin->run($bvfoa, $line_hash);
24✔
2167

24✔
2168
          if (defined $plugin_results) {
2169
            if (ref $plugin_results eq 'HASH') {
24✔
2170
              for my $key (keys %$plugin_results) {
2171
                $line_hash->{$key} = $plugin_results->{$key};
2172
              }
24✔
2173
            }
24✔
2174
            else {
2175
              $self->warning_msg("Plugin '".(ref $plugin)."' did not return a hashref, output ignored!");
24✔
2176
            }
2177
          }
2178
          else {
48✔
2179
            # if a plugin returns undef, that means it want to filter out this line
2180
            $skip_line = 1;
2181
          }
28✔
2182
        };
2183
        if($@) {
28✔
2184
          $self->warning_msg("Plugin '".(ref $plugin)."' went wrong: $@");
28✔
2185
        }
28✔
2186

2187
        # there's no point running any other plugins if we're filtering this line,
28✔
2188
        # because the first plugin to skip the line wins, so we might as well last
2189
        # out of the loop now and avoid any unnecessary computation
2190
        last if $skip_line;
2191
      }
2192
    }
20✔
2193
  }
2194

2195
  return $skip_line ? undef : $line_hash;
20✔
2196
}
2197

2198

2199
=head2 rejoin_variants_in_InputBuffer
2200

2201
  Arg 1      : Bio::EnsEMBL::VEP::InputBuffer
2202
  Example    : $of->rejoin_variants_in_InputBuffer($ib);
2203
  Description: Rejoins linked variants that were split in InputBuffer, typically
20✔
2204
               by user providing --minimal flag.
60✔
2205
  Returntype : none
48✔
2206
  Exceptions : none
48✔
2207
  Caller     : get_all_output_hashes_by_InputBuffer()
2208
  Status     : Stable
48✔
2209

2210
=cut
2211

2212
sub rejoin_variants_in_InputBuffer {
2213
  my $self = shift;
2214
  my $buffer = shift;
2215

20✔
2216
  my @joined_list = ();
4✔
2217

4✔
2218
  # backup stat logging status
2219
  my $no_stats = $self->{no_stats};
4✔
2220
  $self->{no_stats} = 1;
4✔
2221

4✔
2222
  foreach my $vf(@{$buffer->buffer}) {
4✔
2223

2224
    # reset original one
4✔
2225
    # check $vf->{first} to get first $vf in split_variants
2226
    if(defined($vf->{first})) {
2227

2228
      # do consequence stuff
2229
      $self->get_all_output_hashes_by_VariationFeature($vf);
×
2230

2231
      $vf->{allele_string}    = $vf->{original_allele_string};
2232
      $vf->{seq_region_start} = $vf->{start} = $vf->{original_start};
2233
      $vf->{seq_region_end}   = $vf->{end}   = $vf->{original_end};
2234

20✔
2235
      push @joined_list, $vf;
2236
    }
2237

2238
    # this one needs to be merged in
2239
    elsif(defined($vf->{merge_with})) {
×
2240
      my $original = $vf->{merge_with};
2241

2242
      # do consequence stuff
2243
      $self->get_all_output_hashes_by_VariationFeature($vf);
24✔
2244

2245
      # now we have to copy the [Feature]Variation objects
24✔
2246
      # we can't simply copy the alleles as the coords will be different
2247
      # better to make new keys
2248
      # we also have to set the VF pointer to the original
2249

2250
      # copy transcript variations etc
2251
      foreach my $type(map {$_.'_variations'} qw(transcript motif_feature regulatory_feature)) {
2252
        foreach my $key(keys %{$vf->{$type} || {}}) {
2253
          my $val = $vf->{$type}->{$key};
2254
          $val->base_variation_feature($original);
2255
          # rename the key they're stored under
2256
          $original->{$type}->{$vf->{allele_string}.'_'.$key} = $val;
2257
        }
2258
      }
2259

2260
      # intergenic variation is a bit different
2261
      # there is only one, and no reference feature to key on
2262
      # means we have to copy over alleles manually
2263
      if(my $iv = $vf->{intergenic_variation}) {
2264
        my $bfvo = $original->{intergenic_variation};
2265
        $iv->base_variation_feature($original);
2266

1,016✔
2267
        if(my $oiv = $original->{intergenic_variation}) {
1,016✔
2268
          foreach my $alt (@{$iv->{alt_alleles}}) {
2269
            $alt->base_variation_feature_overlap($bfvo);
2270
            push @{$oiv->{alt_alleles}}, $alt;
2271
          }
2272
          $oiv->{_alleles_by_seq}->{$_->variation_feature_seq} = $_ for @{$oiv->{alt_alleles}};
2273
        }
2274

2275
        # this probably won't happen, but can't hurt to cover all bases
2276
        else {
2277
            $original->{intergenic_variation} = $iv;
2278
        }
2279
      }
2280

2281
      # reset these keys, they can be recalculated
2282
      delete $original->{$_} for qw(overlap_consequences _most_severe_consequence);
2283
    }
2284

4✔
2285
    # normal
2286
    else {
2287
      push @joined_list, $vf;
2288
    }
2289
  }
2290

2291
  $self->{no_stats} = $no_stats;
2292

2293
  $buffer->buffer(\@joined_list);
2294
}
2295

2296

2297
### Header etc methods
2298
### These will be called when generating the actual output
2299
##########################################################
2300

572✔
2301

2302
=head2 header_info
572✔
2303

2304
  Example    : $info = $of->header_info();
572✔
2305
  Description: Retrieve header info passed in on object creation
32✔
2306
  Returntype : hashref
32✔
2307
  Exceptions : none
32✔
2308
  Caller     : get_custom_headers(), sub-classes
2309
  Status     : Stable
2310

2311
=cut
2312

572✔
2313
sub header_info {
2314
  my $self = shift;
2315
  return $self->{header_info} || {};
2316
}
2317

2318

2319
=head2 headers
2320

2321
  Example    : $headers = $of->headers();
2322
  Description: Get list of headers to print out for this output format.
2323
               This is a stub method, overridden in child classes.
2324
  Returntype : listref of strings
2325
  Exceptions : none
2326
  Caller     : Runner
2327
  Status     : Stable
2328

580✔
2329
=cut
2330

580✔
2331
sub headers {
2332
  return [];
580✔
2333
}
2334

64✔
2335

64✔
2336
=head2 get_plugin_headers
2337

2338
  Example    : $headers = $of->get_plugin_headers();
64✔
2339
  Description: Get headers from plugins
2340
  Returntype : arrayref of arrayrefs [$key, $header]
64✔
2341
  Exceptions : none
4✔
2342
  Caller     : description_headers() in child classes
4✔
2343
  Status     : Stable
2344

30✔
2345
=cut
2346

30✔
2347
sub get_plugin_headers {
2348
  my $self = shift;
2349

2350
  my @headers = ();
64✔
2351

8✔
2352
  for my $plugin (@{$self->plugins}) {
8✔
2353
    if (my $hdr = $plugin->get_header_info) {
×
2354
      for my $key (sort keys %$hdr) {
×
2355
        push @headers, [$key, $hdr->{$key}];
2356
      }
2357
    }
2358
  }
2359

2360
  return \@headers;
2361
}
8✔
2362

2363

2364
=head2 get_custom_headers
2365

2366
  Example    : $headers = $of->get_custom_headers();
2367
  Description: Get headers from custom data files
2368
  Returntype : arrayref of arrayrefs [$key, $header]
64✔
2369
  Exceptions : none
×
2370
  Caller     : description_headers() in child classes
×
2371
  Status     : Stable
×
2372

×
2373
=cut
2374

×
2375
sub get_custom_headers {
2376
  my $self = shift;
2377

2378
  my @headers;
2379

2380
  foreach my $custom(@{$self->header_info->{custom_info} || []}) {
2381
    
2382
    my @flatten_header = get_flatten(\@headers);
580✔
2383
    my %pos = map { $flatten_header[$_]=~/o/ ? ($flatten_header[$_]=>$_) : () } 0..$#flatten_header if @flatten_header;
2384
    
2385
    # To prevent printing internal directory mask filepath
2386
    my $masked_file = $custom->{file} =~ /\// ? "[PATH]/" . (basename $custom->{file}) : $custom->{file};
2387

2388
    if (grep { /^$custom->{short_name}$/ }  @flatten_header){
2389
      my $pos = ($pos{$custom->{short_name}} || 0) / 2;
2390
      $headers[$pos][1] .= ",$masked_file";
2391
    } else {
2392
        push @headers, [ $custom->{short_name}, $masked_file ];
2393
    }
2394

2395
    foreach my $field(@{$custom->{fields} || []}) {
2396
      my $sub_id = sprintf("%s_%s", $custom->{short_name}, $field);
2397

792✔
2398
      # Determine the description to use:
2399
      # - If the custom annotation already contains a proper description for this field,
2400
      #   use that description
834✔
2401
      # - Otherwise, use a default string that says "<field> field from <masked_file>"
1,668✔
2402
      my $base_desc = exists $custom->{field_descriptions}->{$field}
1,224✔
2403
        ? $custom->{field_descriptions}->{$field}
2404
        : ucfirst($field);  # fallback if even no default desc
43,560✔
2405
      
2406
      my $desc = sprintf(
792✔
2407
        "%s. %s field from %s",
2408
        $base_desc,
2409
        $field,
2410
        $masked_file
2411
      );
792✔
2412

792✔
2413
      if (grep { /^$sub_id$/ } @flatten_header){
4,988✔
2414
        my $pos = $pos{$sub_id} / 2;
4,988✔
2415
        $headers[$pos][1] .= ",$masked_file";
2416
      } elsif ($field eq "PC") {
2417
        push @headers, [
792✔
2418
          $sub_id,
2419
          sprintf($custom->{overlap_def} . " from %s", $masked_file)
2420
        ];
2421
      
2422
      # Otherwise, add a new header row with the sub_id and the determined description above
2423
      } else {
2424
        push @headers, [ $sub_id, $desc ];
2425
      }
2426
    }
2427

2428
    foreach my $stat(@{$custom->{summary_stats} || []}) {
2429
      my $sub_id = sprintf("%s_%s", $custom->{short_name}, $stat);
2430
      if (grep { /^$sub_id$/ } @flatten_header){
2431
        my $pos = $pos{$sub_id} / 2;
2432
        $headers[$pos][1] .= ",$masked_file";
2433
      } else {
436✔
2434
        push @headers, [
2435
          $sub_id,
436✔
2436
          sprintf("%s data from %s", $stat, $masked_file)
2437
        ];
2438
      }
2439
    }
2440
  }
2441

2442
  return \@headers;
2443
}
2444

2445
=head2 flag_fields
2446

2447
  Example    : $fields = $of->flag_fields();
2448
  Description: Get list of fields that will appear in output given user config
2449
  Returntype : arrayref of strings
2450
  Exceptions : none
2451
  Caller     : fields() in child classes
2452
  Status     : Stable
2453

2454
=cut
2455

2456
sub flag_fields {
2457
  my $self = shift;
2458
  
2459
  # get all fields
2460
  my @tmp = (
2461
    map {@{$_->{fields}}}
2462
    map {$_->[0]}
2463
    grep {
2464
      ref($_->[1]) eq 'ARRAY' ? scalar @{$_->[1]} : $_->[1]
2465
    }
2466
    map {[$_, $self->param($_->{flag})]}
2467
    @Bio::EnsEMBL::VEP::Constants::FLAG_FIELDS
2468
  );
2469

2470
  # uniquify, retaining order
2471
  my (%seen, @return);
2472
  for(@tmp) {
2473
    push @return, $_ unless $seen{$_};
2474
    $seen{$_} = 1;
2475
  }
2476

2477
  return \@return;
2478
}
2479

2480

2481
=head2 get_full_command
2482

2483
  Example    : $headers = $of->get_full_command();
2484
  Description: Get headers from custom data files
2485
  Returntype : arrayref of arrayrefs [$key, $header]
2486
  Exceptions : none
2487
  Caller     : description_headers() in child classes
2488
  Status     : Stable
2489

2490
=cut
2491

2492
sub get_full_command {
2493
  my $self = shift;
2494

2495
  return $self->{_config}->{_params}->{full_command} || "";
2496
}
2497

2498
1;
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc