• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ensembl / ensembl-vep / #610835618

02 Oct 2023 09:04AM UTC coverage: 98.187%. First build
#610835618

Pull #1470

travis-ci

Pull Request #1470: Custom annotations: summary statistics + filter by number of records

139 of 139 new or added lines in 9 files covered. (100.0%)

11320 of 11529 relevant lines covered (98.19%)

64059.97 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.99
/modules/Bio/EnsEMBL/VEP/AnnotationSource/File.pm
1
=head1 LICENSE
2

3
Copyright [2016-2025] EMBL-European Bioinformatics Institute
4

5
Licensed under the Apache License, Version 2.0 (the "License");
6
you may not use this file except in compliance with the License.
7
You may obtain a copy of the License at
8

9
     http://www.apache.org/licenses/LICENSE-2.0
10

11
Unless required by applicable law or agreed to in writing, software
12
distributed under the License is distributed on an "AS IS" BASIS,
13
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
See the License for the specific language governing permissions and
15
limitations under the License.
16

17
=cut
18

19

20
=head1 CONTACT
21

22
 Please email comments or questions to the public Ensembl
23
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
24

25
 Questions may also be sent to the Ensembl help desk at
26
 <http://www.ensembl.org/Help/Contact>.
27

28
=cut
29

30
# EnsEMBL module for Bio::EnsEMBL::VEP::AnnotationSource::File
31
#
32
#
33

34
=head1 NAME
35

36
Bio::EnsEMBL::VEP::AnnotationSource::File - file-based annotation source
37

38
=head1 SYNOPSIS
39

40
# actually returns a Bio::EnsEMBL::VEP::AnnotationSource::File::VCF
41
my $as = Bio::EnsEMBL::VEP::AnnotationSource::File->new({
42
  config => $config,
43
  file   => "some_variants.vcf.gz",
44
  format => 'vcf',
45
  type   => 'exact'
46
});
47

48
$as->annotate_InputBuffer($ib);
49

50
=head1 DESCRIPTION
51

52
Base class for all custom file-based AnnotationSource classes.
53

54
Most child classes use ensembl-io parsers to read data from a custom annotation
55
file. This means they operate on a seek() and next() method of finding data in
56
the file.
57

58
=head1 METHODS
59

60
=cut
61

62

63
use strict;
152✔
64
use warnings;
152✔
65

66
package Bio::EnsEMBL::VEP::AnnotationSource::File;
67

68
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
152✔
69
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
152✔
70

71
use base qw(Bio::EnsEMBL::VEP::AnnotationSource);
152✔
72

73
our ($CAN_USE_TABIX_PM, $CAN_USE_BIGWIG);
74

75
BEGIN {
76
  if (eval q{ require Bio::DB::HTS::Tabix; 1 }) {
152✔
77
    $CAN_USE_TABIX_PM = 1;
152✔
78

79
    eval q{
152✔
80
      use Bio::EnsEMBL::VEP::AnnotationSource::File::BED;
81
      use Bio::EnsEMBL::VEP::AnnotationSource::File::VCF;
82
      use Bio::EnsEMBL::VEP::AnnotationSource::File::GFF;
83
      use Bio::EnsEMBL::VEP::AnnotationSource::File::GTF;
84
    };
85
  }
86

87
  if (eval q{ require Bio::DB::BigFile; 1 }) {
152✔
88
    $CAN_USE_BIGWIG = 1;
152✔
89

90
    eval q{
152✔
91
      use Bio::EnsEMBL::VEP::AnnotationSource::File::BigWig;
92
    };
93
  }
94
}
95

96
my %FORMAT_MAP = (
97
  'vcf'     => 'VCF',
98
  'gff'     => 'GFF',
99
  'gtf'     => 'GTF',
100
  'bed'     => 'BED',
101
  'bigwig'  => 'BigWig',
102
);
103

104
my %VALID_TYPES = (
105
  'overlap' => 1,
106
  'within' => 1,
107
  'surrounding' => 1,
108
  'exact' => 1,
109
);
110

111

112
=head2 new
113

114
  Arg 1      : hashref $args
115
               {
116
                 config         => Bio::EnsEMBL::VEP::Config $config,
117
                 file           => string $filename,
118
                 format         => string $format (bed,bigwig,gff,gtf,vcf),
119
                 short_name     => (optional) string $short_name,
120
                 type           => (optional) string $type (overlap (default), within, surrounding, exact),
121
                 report_coords  => (optional) bool $report_coords,
122
                 overlap_cutoff => (optional) numeric $minimum_percentage_overlap (0 by default),
123
                 distance       => (optional) numeric $distance_to_overlapping_variant_ends (off by default),
124
                 same_type      => (optional) bool $only_match_identical_variant_classes (off by default),
125
                 reciprocal     => (optional) bool $calculate_reciprocal_overlap (off by default),
126
                 overlap_def    => (optional) string $overlap_definition (based on reciprocal by default),
127
                 num_records    => (optional) maximum number of records to show (50 by default),
128
                 summary_stats  => (optional) summary statistics: max, min, mean, count, sum
129
               }
130
  Example    : $as = Bio::EnsEMBL::VEP::AnnotationSource::File->new($args);
131
  Description: Create a new Bio::EnsEMBL::VEP::AnnotationSource::File object. Will
132
               actually return a sub-class depending on the specified format.
133
  Returntype : Bio::EnsEMBL::VEP::AnnotationSource::File
134
  Exceptions : throws if:
135
                - no file given
136
                - invalid format specified
137
                - required module not installed (Bio::DB::HTS::Tabix or Bio::DB::BigFile)
138
  Caller     : AnnotationSourceAdaptor
139
  Status     : Stable
140

141
=cut
142

143
sub new {
144
  my $caller = shift;
428✔
145
  my $class = ref($caller) || $caller;
428✔
146
  
147
  my $self = $class->SUPER::new(@_);
428✔
148

149
  my $hashref = $_[0];
428✔
150

151
  throw("ERROR: No file given\n") unless $hashref->{file};
428✔
152
  $self->file($hashref->{file});
424✔
153

154
  $hashref->{short_name} = $self->short_name($hashref->{short_name} || (
424✔
155
      $hashref->{gff_type} && $hashref->{gff_type} eq "gencode_promoter" ? 
424✔
156
        "GENCODE_PROMOTER" :
420✔
157
        split '/', $self->file)[-1]
158
    );
420✔
159
  $hashref->{type} = $self->type($hashref->{type} || 'overlap');
420✔
160
  $self->report_coords(defined($hashref->{report_coords}) ? $hashref->{report_coords} : 0);
420✔
161

420✔
162
  $self->{overlap_cutoff} = $hashref->{overlap_cutoff} || 0;
420✔
163
  $self->{distance}       = $hashref->{distance};
420✔
164
  $self->{same_type}      = $hashref->{same_type}      || 0;
165
  $self->{reciprocal}     = $hashref->{reciprocal}     || 0;
420✔
166
  $self->{overlap_def}    = $hashref->{overlap_def};
167
  $self->{num_records}    = defined $hashref->{num_records} ? $hashref->{num_records} : 50;
420✔
168
  $self->{gff_type}       = $hashref->{gff_type}       || "transcript";
169

180✔
170
  $self->{info} = { custom_info => $hashref };
171

180✔
172
  if ($self->{gff_type} eq "gencode_promoter") {
180✔
173
    $self->{info}->{custom_info}->{fields} = [qw/type feature_id associated_gene/];
174
  }
172✔
175

24✔
176
  if(my $format = $hashref->{format}) {
177

178
    delete $hashref->{format};
148✔
179

180
    $format = lc($format);
164✔
181
    throw("ERROR: Unknown or unsupported format $format\n") unless $FORMAT_MAP{$format};
182

164✔
183
    if($format eq 'bigwig') {
164✔
184
      throw("ERROR: Cannot use format $format without Bio::DB::BigFile module installed\n") unless $CAN_USE_BIGWIG;
164✔
185
    }
186
    else {
187
      throw("ERROR: Cannot use format $format without Bio::DB::HTS::Tabix module installed\n") unless $CAN_USE_TABIX_PM;
240✔
188
    }
189
    $hashref->{_format} = $format;
190

191
    my $class = $self->module_prefix.'::AnnotationSource::File::'.$FORMAT_MAP{$format};
192
    eval "require $class";
193
    return $class->new({%$hashref, config => $self->config});
194
  }
195

196
  return $self;
197
}
198

199

200
=head2 file
201

202
  Arg 1      : (optional) string $file
203
  Example    : $file = $as->file()
204
  Description: Getter/setter for filename for this source.
808✔
205
  Returntype : string
808✔
206
  Exceptions : none
808✔
207
  Caller     : general
208
  Status     : Stable
209

210
=cut
211

212
sub file {
213
  my $self = shift;
214
  $self->{file} = shift if @_;
215
  return $self->{file};
216
}
217

218

219
=head2 short_name
220

221
  Arg 1      : (optional) string $short_name
222
  Example    : $short_name = $as->short_name()
223
  Description: Getter/setter for short name for this source, used as the key
224
               in VEP output.
2,424✔
225
  Returntype : string
2,424✔
226
  Exceptions : none
2,424✔
227
  Caller     : general
228
  Status     : Stable
229

230
=cut
231

232
sub short_name {
233
  my $self = shift;
234
  $self->{short_name} = shift if @_;
235
  return $self->{short_name};
236
}
237

238

239
=head2 type
240

241
  Arg 1      : (optional) string $type
242
  Example    : $type = $as->type()
243
  Description: Getter/setter for type of this source:
244
                - "overlap" returns source features that overlap input variants
245
                - "within" returns source features that are within the input variants
246
                - "surrounding" returns source features that completely surround the input variants
247
                - "exact" requires that source features' coordinates match input variants exactly
856✔
248
  Returntype : string
249
  Exceptions : none
856✔
250
  Caller     : general
472✔
251
  Status     : Stable
472✔
252

464✔
253
=cut
254

255
sub type {
848✔
256
  my $self = shift;
257

258
  if(@_) {
259
    my $new_type = shift;
260
    throw("ERROR: New type \"$new_type\" is not valid\n") unless $VALID_TYPES{$new_type};
261
    $self->{type} = $new_type;
262
  }
263

264
  return $self->{type};
265
}
266

267

268
=head2 report_coords
269

270
  Arg 1      : (optional) bool $report_coords
271
  Example    : $report_coords = $as->report_coords()
272
  Description: Getter/setter for the report_coords parameter. If set to a true value,
273
               the coordinates of the source feature are returned in place of any
274
               identifier found in the source for the feature.
660✔
275
  Returntype : bool
660✔
276
  Exceptions : none
660✔
277
  Caller     : general
278
  Status     : Stable
279

280
=cut
281

282
sub report_coords {
283
  my $self = shift;
284
  $self->{report_coords} = shift if @_;
285
  return $self->{report_coords};
286
}
287

288

289
=head2 annotate_InputBuffer
290

291
  Arg 1      : Bio::EnsEMBL::VEP::InputBuffer
292
  Example    : $as->annotate_InputBuffer($ib);
293
  Description: Gets overlapping features from the source for the variants in
294
               the input buffer, and adds them as references to the  relevant
295
               VariationFeature.
164✔
296
  Returntype : none
164✔
297
  Exceptions : none
298
  Caller     : Runner
164✔
299
  Status     : Stable
164✔
300

301
=cut
164✔
302

303
sub annotate_InputBuffer {
304
  my $self = shift;
305
  my $buffer = shift;
160✔
306

307
  my %by_chr;
160✔
308
  push @{$by_chr{$_->{chr}}}, $_ for @{$buffer->buffer};
160✔
309

2,284✔
310
  my $parser = $self->parser();
311

2,284✔
312
  # (Big)BED and (Big)Wig have the method 'get_raw_chrom' in ensembl-io
2,284✔
313
  # VCF and GXF (GFF and GTF) have the method 'get_raw_seqname' in ensembl-io
2,280✔
314
  my $get_raw_seqname = $parser->can('get_raw_seqname') ? 'get_raw_seqname' : 'get_raw_chrom';
315

316
  foreach my $chr(keys %by_chr) {
2,284✔
317
    foreach my $vf(@{$by_chr{$chr}}) {
318
      next if $vf->{vep_skip}; # avoid annotating previously skipped variants
319

320
      my ($vf_start, $vf_end) = ($vf->{start}, $vf->{end});
321
      if ($parser->seek($self->get_source_chr_name($chr), $vf_start - 1, $vf_end + 1)) {
322
        $parser->next();
323
      }
2,284✔
324

2,284✔
325
      my $stats = $self->{summary_stats};
326
      # Different checks before annotating the VF:
327
      # - Check a record exist
328
      # - Check that the chromosomes names match between the input VF entry and the custom annotation file.
256✔
329
      #   There is a special case for the custom annotation file with chromosome names like 'chr1',
256✔
330
      #   as they are not present in the cache (chr_synonym.txt): we also check the 'raw' seqname with the source_chr_name.
256✔
331
      # - Check that the start of the custom annotation record is lower that the end of the input VF entry
256✔
332
      my $record_count = 0;
333
      while($parser->{record} &&
334
            ($parser->get_seqname eq $self->get_source_chr_name($chr) || $parser->${get_raw_seqname} eq $self->get_source_chr_name($chr)) &&
335
            $parser->get_start <= $vf_end + 1) {
2,284✔
336
        # stop if exceeding the desired number of records and not calculating stats
36✔
337
        last if !defined $stats && $record_count > $self->{num_records};
36✔
338
        my $res = $self->annotate_VariationFeature($vf, $record_count);
36✔
339
        $record_count++ if $res;
16✔
340
        $parser->next();
341
      }
36✔
342

36✔
343
      # prepare statistics
344
      if (defined $stats) {
345
        my $annot_stats = $vf->{_custom_annotations_stats}->{$self->short_name};
346
        $annot_stats->{count} = $record_count if grep(/^count$/, @$stats);
347
        if ( grep(/^(mean)$/, @$stats) && defined $annot_stats->{sum} ) {
348
          $annot_stats->{mean} = $annot_stats->{sum} / $record_count;
349
        }
350
        delete $annot_stats->{sum} unless grep(/^sum$/, @$stats);
351
        $vf->{_custom_annotations_stats}->{$self->short_name} = $annot_stats;
352
      }
353
    }
354
  }
355
}
356

357

358
=head2 valid_chromosomes
359

360
  Example    : $chrs = $as->valid_chromosomes();
361
  Description: Gets valid chromosome names for this source
324✔
362
  Returntype : arrayref of strings
324✔
363
  Exceptions : none
364
  Caller     : Runner
365
  Status     : Stable
366

367
=cut
368

369
sub valid_chromosomes {
370
  my $self = shift;
371
  return $self->{valid_chromosomes} ||= $self->parser->{tabix_file}->seqnames;
372
}
373

374

375
=head2 annotate_VariationFeature
376
 
377
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature
378
  Arg 2      : $record_count
379
  Example    : $as->annotate_VariationFeature($vf);
380
  Description: Add custom annotations to the given variant using the
381
               current record as read from the annotation source.
256✔
382
  Returntype : none
256✔
383
  Exceptions : none
256✔
384
  Caller     : annotate_InputBuffer()
385
  Status     : Stable
256✔
386

387
=cut
256✔
388

389
sub annotate_VariationFeature {
212✔
390
  my $self = shift;
212✔
391
  my $vf = shift;
212✔
392
  my $record_count = shift;
393

212✔
394
  my ($overlap_result, $overlap_percentage) = $self->_record_overlaps_VF($vf);
212✔
395

20✔
396
  return unless ($overlap_result);
20✔
397

16✔
398
  my $stats      = $self->{summary_stats};
16✔
399
  my $get_scores = defined $stats;
×
400
  my $record = $self->_create_records($overlap_result, $get_scores);
×
401

402
  return unless @$record;
403

404
  my $is_recorded = 0;
405
  if (@{$record}[0]->{'name'} =~  /^COSV/) {
406
    my ($matched_cosmic_record) = grep{$_->{'name'} eq @{$record}[0]->{'name'}} @{$vf->{_custom_annotations}->{$self->short_name}};
212✔
407
    if ($matched_cosmic_record){
196✔
408
      $is_recorded = 1;
×
409
      foreach my $key (keys %{@{$record}[0]->{'fields'}}) {
410
        unless (exists $matched_cosmic_record->{'fields'}->{$key}) {
411
          $matched_cosmic_record->{'fields'}{$key} = @{$record}[0]->{'fields'}->{$key};
412
        }   
196✔
413
      }
196✔
414
    }
196✔
415
  }
40✔
416

40✔
417
  if (!$is_recorded) {
418
    if (defined($self->{fields}) && grep {$_ eq "PC"} @{$self->{fields}}) {
40✔
419
      $record->[0]->{"fields"}->{"PC"} = $overlap_percentage;
28✔
420
    }
421

40✔
422
    # calculate summary statistics for custom annotation
40✔
423
    my $annot_stats = $vf->{_custom_annotations_stats}->{$self->short_name};
424
    my $value = $record->[0]->{score} if defined $stats;
425
    if (defined $value) {
196✔
426
      if ( grep(/^min$/, @$stats) ) {
196✔
427
        $annot_stats->{min} = $value if $value < ($annot_stats->{min} || '+inf');
428
      }
×
429
      if ( grep(/^max$/, @$stats) ) {
430
        $annot_stats->{max} = $value if $value > ($annot_stats->{max} || '-inf');
431
      }
212✔
432
      $annot_stats->{sum} += $value if grep(/^(sum|mean)$/, @$stats);
433
      $vf->{_custom_annotations_stats}->{$self->short_name} = $annot_stats;
434
    }
435

436
    if ( $record_count < $self->{num_records} ) {
437
      push @{$vf->{_custom_annotations}->{$self->short_name}}, @{$record};
438
    } elsif ( $record_count == $self->{num_records} ) {
439
      push @{$vf->{_custom_annotations}->{$self->short_name}}, { name => '...' };
440
    }
441
  }
442
  return 1;
443
}
444

445

446
=head2 _create_records
447
 
448
  Arg 1      : bool or hashref $overlap_result
449
  Arg 2      : bool $get_scores
450
  Example    : $records = $as->_create_records();
60✔
451
  Description: Create a custom annotation record from the current
60✔
452
               record as read from the annotation source.
60✔
453
  Returntype : arrayref of hashrefs
454
  Exceptions : none
60✔
455
  Caller     : annotate_VariationFeature()
60✔
456
  Status     : Stable
60✔
457

458
=cut
459

460
sub _create_records {
461
  my $self           = shift;
462
  my $overlap_result = shift;
463
  my $get_scores     = shift;
464

465
  my $record = [{ name  => $self->_get_record_name }];
466
  $record->[0]->{score} = $self->parser->get_score if $get_scores;
467
  return $record;
468
}
469

470

471
=head2 _get_record_name
472
 
473
  Example    : $record_name = $as->_get_record_name();
28✔
474
  Description: Get name for the current record using either ID as
28✔
475
               found in the source or record coordinates.
476
  Returntype : string
28✔
477
  Exceptions : none
478
  Caller     : _create_records()
28✔
479
  Status     : Stable
480

481
=cut
482

483
sub _get_record_name {
484
  my $self = shift;
485
  my $parser = $self->parser;
486

487
  my $name = $parser->can("get_name") ? $parser->get_name : undef;
488

489
  return ($self->report_coords || !defined($name)) ?
490
    sprintf(
491
      '%s:%i-%i',
492
      $parser->get_seqname,
493
      $parser->get_start,
494
      $parser->get_end
495
    ) :
496
    $name;
497
}
498

499

500
=head2 _record_overlaps_VF
501
 
502
  Arg 1      : Bio::EnsEMBL::Variation::VariationFeature
503
  Example    : $overlap_ok = $as->_record_overlaps_VF($vf);
100✔
504
  Description: Determine whether the given VariationFeature overlaps
100✔
505
               the current record, depending on the set type()
506
  Returntype : bool
100✔
507
  Exceptions : none
100✔
508
  Caller     : annotate_VariationFeature()
100✔
509
  Status     : Stable
100✔
510

100✔
511
=cut
512

100✔
513
sub _record_overlaps_VF {
100✔
514
  my $self = shift;
515
  my $vf = shift;
100✔
516

517
  my $parser = $self->parser();
60✔
518
  my $type = $self->type();
60✔
519
  my $overlap_cutoff = $self->{overlap_cutoff};
60✔
520
  my $distance = $self->{distance};
521
  my $reciprocal = $self->{reciprocal};
522

60✔
523
  my ($ref_start, $ref_end) = ($parser->get_start, $parser->get_end);
524
  $ref_start += 1 if defined $self->{_format} && $self->{_format} eq 'bigwig';
525

60✔
526
  if($type eq 'overlap' || $type eq 'within' || $type eq 'surrounding') {
527
    # account for insertions in Ensembl world where s = e+1
60✔
528
    my ($vs, $ve) = ($vf->{start}, $vf->{end});
×
529
    ($vs, $ve) = ($ve, $vs) if $vs > $ve;
×
530
    my $length = $ve - $vs + 1;
531

532
    # check if reference variant is within the input variant (if enabled)
533
    return 0 if $type eq "within" && ($vs > $ref_start || $ve < $ref_end);
60✔
534

60✔
535
    # check if reference variant completely surrounds the input variant (if enabled)
60✔
536
    return 0 if $type eq "surrounding" && ($vs < $ref_start || $ve > $ref_end);
537

60✔
538
    if (defined $distance) {
539
      return 0 unless abs($vs - $ref_start) <= $distance;
60✔
540
      return 0 unless abs($ve - $ref_end  ) <= $distance;
541
    }
×
542

×
543
    # check overlap percentage
×
544
    my @overlap_start = sort { $a <=> $b } ($vs, $ref_start);
545
    my @overlap_end   = sort { $a <=> $b } ($ve, $ref_end);
546
    my $overlap_percentage = $length != 0 ? 100 * (1 + $overlap_end[0] - $overlap_start[1]) / $length : 100;
×
547

×
548
    return 0 if $overlap_percentage < $overlap_cutoff;
549

550
    if ($reciprocal) {
551
      # check bi-directional overlap - percentage of reference variant covered
60✔
552
      my $ref_length = $ref_end - $ref_start + 1;
60✔
553

554
      # in some cases $ref_length can be 0, for example in old VCF with single base deletion without INFO/SVLEN
555
      my $ref_overlap_percentage = $ref_length != 0 ? 100 * (1 + $overlap_end[0] - $overlap_start[1]) / $ref_length : 100;
40✔
556
      return 0 if $ref_overlap_percentage < $overlap_cutoff;
40✔
557

40✔
558
      # report minimum overlap
559
      if ($ref_overlap_percentage < $overlap_percentage) {
560
        $overlap_percentage = $ref_overlap_percentage;
561
      }
562
    }
563

564
    $overlap_percentage = sprintf("%.3f", $overlap_percentage);
565
    return overlap($ref_start, $ref_end, $vs, $ve), $overlap_percentage;
566
  }
567
  elsif($type eq 'exact') {
568
    my $match = $ref_start == $vf->{start} && $ref_end == $vf->{end};
569
    my $overlap_percentage = $match ? 100 : 0;
570
    return ( $match, $overlap_percentage );
571
  }
572
}
573

574
1;
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc