• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ensembl / ensembl / #625445784

23 Aug 2024 05:53PM UTC coverage: 82.254%. First build
#625445784

Pull #716

travis-ci

Pull Request #716: Xref Changes and Fixes (release/114)

18 of 36 new or added lines in 2 files covered. (50.0%)

32808 of 39886 relevant lines covered (82.25%)

820.52 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

63.38
/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm
1
=head1 LICENSE
2

3
See the NOTICE file distributed with this work for additional information
4
regarding copyright ownership.
5

6
Licensed under the Apache License, Version 2.0 (the "License");
7
you may not use this file except in compliance with the License.
8
You may obtain a copy of the License at
9

10
     http://www.apache.org/licenses/LICENSE-2.0
11

12
Unless required by applicable law or agreed to in writing, software
13
distributed under the License is distributed on an "AS IS" BASIS,
14
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
See the License for the specific language governing permissions and
16
limitations under the License.
17

18
=cut
19

20
# Parse UniProt (SwissProt & SPTrEMBL) files to create xrefs.
21
#
22
# Files actually contain both types of xref, distinguished by ID line;
23
#
24
# ID   CYC_PIG                 Reviewed;         104 AA.  Swissprot
25
# ID   Q3ASY8_CHLCH            Unreviewed;     36805 AA.  SPTrEMBL
26

27

28

29
package XrefParser::UniProtParser;
30

31
use strict;
1✔
32
use warnings;
1✔
33
use Carp;
1✔
34
use POSIX qw(strftime);
1✔
35
use File::Basename;
1✔
36
use Text::CSV;
1✔
37

38
use base qw( XrefParser::BaseParser );
1✔
39

40
my $hgnc_file;
41

42
# Parser entry point.
#
# Looks up the source ids used for UniProt xrefs, parses the first file in
# $ref_arg->{files} via create_xrefs(), and, when a release file is given,
# records the Swiss-Prot / TrEMBL release strings against those sources.
#
# $ref_arg is a hashref with the keys:
#   source_id, species_id, files  - required; croaks if any is undefined
#   species                       - accepted but not used here
#   rel_file                      - optional release notes file
#   verbose                       - optional; enables progress output
#   dbi                           - optional; defaults to $self->dbi
#   hgnc_file                     - optional; stored in the file-scoped
#                                   $hgnc_file read by create_xrefs()
#
# Returns 0 on completion.
# NOTE(review): the result of create_xrefs() is not inspected, so an
# unreadable input file still yields 0 here - confirm this is intended.
sub run {

  my ($self, $ref_arg) = @_;
  my $source_id    = $ref_arg->{source_id};
  my $species_id   = $ref_arg->{species_id};
  my $files        = $ref_arg->{files};
  my $release_file = $ref_arg->{rel_file};
  my $verbose      = $ref_arg->{verbose};
  my $dbi          = $ref_arg->{dbi};
  $dbi = $self->dbi unless defined $dbi;

  $hgnc_file = $ref_arg->{hgnc_file} || undef;

  if((!defined $source_id) or (!defined $species_id) or (!defined $files)){
    croak "Need to pass source_id, species_id, files and rel_file as pairs";
  }
  $verbose |=0;

  # Only the first file of the list is parsed.
  my $file = $files->[0];

  my $sp_source_id =
    $self->get_source_id_for_source_name('Uniprot/SWISSPROT','sequence_mapped', $dbi);
  my $sptr_source_id =
    $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'sequence_mapped', $dbi);

  my $sptr_non_display_source_id =
    $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'protein_evidence_gt_2', $dbi);

  my $sp_direct_source_id =
    $self->get_source_id_for_source_name('Uniprot/SWISSPROT', 'direct', $dbi);
  my $sptr_direct_source_id =
    $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'direct', $dbi);

  # Pass $dbi here as well so that every lookup uses the same handle as
  # the calls above (the priority description is left undefined).
  # NOTE(review): assumes the helper treats an undefined priority
  # description the same as an omitted one - confirm against BaseParser.
  my $isoform_source_id =
    $self->get_source_id_for_source_name('Uniprot_isoform', undef, $dbi);

  if ($verbose) {
    print "SwissProt source id for $file: $sp_source_id\n";
    print "SpTREMBL source id for $file: $sptr_source_id\n";
    print "SpTREMBL protein_evidence > 2 source id for $file: $sptr_non_display_source_id\n";
    print "SwissProt direct source id for $file: $sp_direct_source_id\n";
    print "SpTREMBL direct source id for $file: $sptr_direct_source_id\n";
  }

  $self->create_xrefs( $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id,
      $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi );

  if ( defined $release_file ) {
    # Parse Swiss-Prot and SpTrEMBL release info from $release_file.
    my ( $sp_release, $sptr_release );

    my $release_io = $self->get_filehandle($release_file);
    if ( !defined $release_io ) {
      croak "Can't open release file '$release_file'";
    }

    while ( defined( my $line = $release_io->getline() ) ) {
      if ( $line =~ m#(UniProtKB/Swiss-Prot Release .*)# ) {
        $sp_release = $1;
        print "Swiss-Prot release is '$sp_release'\n" if($verbose);
      } elsif ( $line =~ m#(UniProtKB/TrEMBL Release .*)# ) {
        $sptr_release = $1;
        print "SpTrEMBL release is '$sptr_release'\n" if($verbose);
      }
    }
    $release_io->close();

    # Set releases. A release string stays undefined when the matching
    # line was not found in the release file.
    $self->set_release( $sp_source_id,               $sp_release,   $dbi );
    $self->set_release( $sptr_source_id,             $sptr_release, $dbi );
    $self->set_release( $sptr_non_display_source_id, $sptr_release, $dbi );
    $self->set_release( $sp_direct_source_id,        $sp_release,   $dbi );
    $self->set_release( $sptr_direct_source_id,      $sptr_release, $dbi );
  }

  return 0; # successful
}
113

114

115
# --------------------------------------------------------------------------------
116
# Parse file into array of xref objects
117

118
# Parse a UniProt flat file record by record and upload the xrefs found.
#
# Records are read using "//\n" as the input record separator. A record is
# only kept when one of the taxonomy ids on its OX line belongs to
# $species_id. For every kept record an xref hash is built (accession,
# synonyms, label, version, description, sequence) together with:
#   * dependent xrefs for EC numbers (species 4932 only), UniProt gene
#     names (unless the record carries the "derived from an Ensembl"
#     caution text) and DR lines whose source is known to
#     get_xref_sources();
#   * direct xrefs for "DR   Ensembl" lines, plus an isoform direct xref
#     when the line names a specific isoform of the primary accession.
# Xrefs are handed to upload_xref_object_graphs() in batches.
#
# Arguments (positional): the Swiss-Prot, TrEMBL and TrEMBL non-display
# source ids, the species id, the file to parse, the verbose flag, the
# Swiss-Prot and TrEMBL direct source ids, the isoform source id and the
# database handle. The file-scoped $hgnc_file, when set, supplies gene
# name descriptions via get_hgnc_descriptions().
#
# Returns nothing useful; returns early when the file cannot be opened.
sub create_xrefs {
  my ($self, $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id, $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi ) = @_;

  my $num_sp = 0;
  my $num_sptr = 0;
  my $num_sptr_non_display = 0;
  my $num_direct_sp = 0;
  my $num_direct_sptr = 0;

  my %dependent_sources = $self->get_xref_sources($dbi);

  # NOTE(review): neither map is read below (MIM DR lines are skipped);
  # the lookups are kept only because their side effects cannot be judged
  # from this file - confirm they can be dropped.
  my (%genemap) =
    %{ $self->get_valid_codes( "mim_gene", $species_id, $dbi ) };
  my (%morbidmap) =
    %{ $self->get_valid_codes( "mim_morbid", $species_id, $dbi ) };

  # Extract descriptions from hgnc
  my %hgnc_descriptions;
  if ($hgnc_file) {
    %hgnc_descriptions = $self->get_hgnc_descriptions($hgnc_file);
  }

  my $uniprot_io = $self->get_filehandle($file);
  if ( !defined $uniprot_io ) { return }

  my @xrefs;

  # One getline() call returns one whole UniProt record.
  local $/ = "//\n";

  # Create a hash of all valid taxon_ids for this species
  my %species2tax = $self->species_id2taxonomy($dbi);
  push @{$species2tax{$species_id}}, $species_id;
  my @tax_ids = @{$species2tax{$species_id}};
  my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids;

  my %dependent_xrefs;
  my $ensembl_derived_protein_count = 0;

  # DR sources that are not taken from UniProt. Each pattern is matched
  # unanchored against the source name, exactly like the individual tests
  # this replaces:
  #   RGD, MGI, VGNC, SGD, HGNC, Orphanet, Reactome - data is imported
  #     from the source itself
  #   MIM       - imported separately and gene level, not protein level
  #   GeneCards - imported through the HGNC file
  #   CCDS, IPI, UCSC, ArrayExpress, GenomeRNAi, EPD, Xenbase - ignored
  my $skipped_source_re =
    qr/RGD|CCDS|IPI|UCSC|SGD|HGNC|MGI|VGNC|Orphanet|ArrayExpress|GenomeRNAi|EPD|Xenbase|Reactome|MIM|GeneCards/;

  # Counter to process file in batches
  my $count = 0;

  RECORD:
  while ( defined( my $record = $uniprot_io->getline() ) ) {

    # if an OX line exists, only store the xref if the taxonomy ID that the OX
    # line refers to is in the species table
    # due to some records having more than one tax_id, we need to check them
    # all and only proceed if one of them matches.
    #OX   NCBI_TaxID=158878, 158879;
    #OX   NCBI_TaxID=103690;
    my ($ox) = $record =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/;
    my $found = 0;

    if ( defined $ox ) {
      foreach my $taxon_id_from_file ( split /,/, $ox ) {
        $taxon_id_from_file =~ s/\s+//g;   # strip every blank, not just the first
        if ( exists $taxonomy2species_id{$taxon_id_from_file} ) {
          $found = 1;
          $count++;
        }
      }
    }

    next RECORD if (!$found); # no taxon_id's match, so skip to next record

    # Check for CC (caution) lines containing certain text
    # If sequence is from Ensembl, do not use its gene names
    my $ensembl_derived_protein = 0;
    if ( $record =~ /CAUTION: The sequence shown here is derived from an Ensembl/ ) {
      $ensembl_derived_protein = 1;
      $ensembl_derived_protein_count++;
    }

    # set accession (and synonyms if more than one)
    # AC line may have primary accession and possibly several ; separated synonyms
    # May also be more than one AC line
    my @accessions = map { split /;\s*/ } $record =~ /^AC\s+(.+)$/mg;

    if ( !@accessions ) {
      print "WARNING: record without an AC line will be skipped\n";
      next RECORD;
    }

    if ( lc( $accessions[0] ) eq "unreviewed" ) {
      print "WARNING: entries with accession of $accessions[0] not allowed will be skipped\n";
      next RECORD;
    }

    my $xref;
    $xref->{INFO_TYPE} = "SEQUENCE_MATCH";
    $xref->{ACCESSION} = $accessions[0];
    if ( @accessions > 1 ) {
      push @{ $xref->{"SYNONYMS"} }, @accessions[ 1 .. $#accessions ];
    }

    # Second word of the ID line is the review status.
    my ($sp_type) = $record =~ /ID\s+\w+\s+(\w+)/;
    # Line codes start a line; anchoring avoids matching "PE <digit>" that
    # happens to occur inside free text earlier in the record.
    my ($protein_evidence_code) = $record =~ /^PE\s+(\d+)/m;
    # Capture line with entry version
    # Example: DT   22-APR-2020, entry version 1.
    my ($version) = $record =~ /DT\s+\d+-\w+-\d+, entry version (\d+)/;

    # A record without a parsable ID line is neither type: ignore it.
    next RECORD if !defined $sp_type;

    # SwissProt/SPTrEMBL are differentiated by Reviewed/Unreviewed here
    if ( $sp_type =~ /^Reviewed/i ) {

      $xref->{SOURCE_ID} = $sp_source_id;
      $num_sp++;

    } elsif ( $sp_type =~ /Unreviewed/i ) {

      # Use normal source only if it is PE levels 1 & 2
      if ( defined($protein_evidence_code) && $protein_evidence_code < 3 ) {
        $xref->{SOURCE_ID} = $sptr_source_id;
        $num_sptr++;
      } else {
        $xref->{SOURCE_ID} = $sptr_non_display_source_id;
        $num_sptr_non_display++;
      }

    } else {

      next RECORD; # ignore if it's neither one nor t'other

    }

    # some straightforward fields
    # the previous $label flag of type BRCA2_HUMAN is not used in Uniprot any more, use accession instead
    $xref->{LABEL} = $accessions[0] ."." . $version;
    $xref->{VERSION} = $version;
    $xref->{SPECIES_ID} = $species_id;
    $xref->{SEQUENCE_TYPE} = 'peptide';
    $xref->{STATUS} = 'experimental';

    # May have multi-line descriptions: collect the DE lines only and
    # build the cumulative name / description strings.
    my @de_lines    = grep { /^DE/ } split /\n/, $record;
    my $description = "";
    my $full_name   = "";

    foreach my $line (@de_lines) {

      if ( $line =~ /^DE   RecName: Full=(.*);/ ) {
        $full_name .= '; ' if $full_name ne q{}; #separate multiple sub-names with a '; '
        $full_name .= $1;
      }
      elsif ( $line =~ /RecName: Full=(.*);/ ) {
        $description .= ' ' if $description ne q{}; #separate the description bit with just a space
        $description .= $1;
      }
      elsif ( $line =~ /SubName: Full=(.*);/ ) {
        $full_name .= '; ' if $full_name ne q{}; #separate multiple sub-names with a '; '
        $full_name .= $1;
      }

      $description =~ s/^\s+//;
      $description =~ s/\s+$//;

      # Parse the EC_NUMBER line, only for S.cerevisiae for now.
      # The xref is only created when the EC number was really captured.
      if ( ( $species_id == 4932 ) && ( $line =~ /^DE\s+EC=([^;]+);/ ) ) {
        my $EC = $1;

        my %depe;
        $depe{LABEL} = $EC;
        $depe{ACCESSION} = $EC;

        $depe{SOURCE_NAME} = "EC_NUMBER";

        $depe{SOURCE_ID} = $dependent_sources{"EC_NUMBER"};
        $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
        push @{$xref->{DEPENDENT_XREFS}}, \%depe;
        $dependent_xrefs{"EC_NUMBER"}++;
      }

    }

    # The description is only set when the record has DE lines. Name and
    # description are always joined by a single space, even when the
    # description part is empty.
    if (@de_lines) {
      my $desc = $full_name . ' ' . $description;
      $desc =~ s/\s*\{ECO:.*?\}//g;
      $xref->{DESCRIPTION} = $desc;
    }

    # extract sequence
    my ($parsed_seq) = $record =~ /SQ\s+(.+)/s; # /s allows . to match newline
    $parsed_seq = "" if !defined $parsed_seq;
    $parsed_seq =~ tr/\n//d;     # join the sequence lines
    $parsed_seq =~ s/\/\///g;    # remove trailing end-of-record character
    $parsed_seq =~ s/\s//g;      # remove whitespace
    $parsed_seq =~ s/^.*;//;     # remove everything before last ;

    $xref->{SEQUENCE} = $parsed_seq;

    # Gene names: take the first run of consecutive GN lines, glue them
    # together and split the result into its ';' separated fields.
    my @gn_fields;
    if ( my ($gns) = $record =~ /^(GN\s+.+)/ms ) {
      my @gn_lines;
      foreach my $gn_line ( split /\n/, $gns ) {
        last if $gn_line !~ /^GN/;

        $gn_line =~ s/^GN\s+//;
        push @gn_lines, $gn_line;
      }

      @gn_fields = split /;/, join( '', @gn_lines );
    }

    # Do not allow the addition of UniProt Gene Name dependent Xrefs
    # if the protein was imported from Ensembl. Otherwise we will
    # re-import previously set symbols
    if ( !$ensembl_derived_protein ) {
      # NOTE(review): one hash is shared by all fields of the record so
      # that Synonyms attach to the preceding Name. A record with several
      # Name= fields therefore pushes the same hash more than once and
      # only the last name survives - confirm whether that is intended.
      my %depe;
      foreach my $gn (@gn_fields) {

        if ( $gn =~ /Name=([A-Za-z0-9_\-\.\s:]+)/s ) { #/s for multi-line entries ; is the delimiter
          # Example line
          # GN   Name=ctrc {ECO:0000313|Xenbase:XB-GENE-5790348};
          my $gene_label = $1;
          $gene_label =~ s/\s+$//; # Remove white spaces that are left over at the end if there was an evidence code
          $depe{LABEL} = $gene_label; # leave name as is, upper/lower case is relevant in gene names
          $depe{ACCESSION} = $self->get_name( $xref->{ACCESSION}, $depe{LABEL} );

          $depe{SOURCE_NAME} = "Uniprot_gn";
          $depe{SOURCE_ID} = $dependent_sources{"Uniprot_gn"};
          $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
          $depe{DESCRIPTION} = $hgnc_descriptions{$gene_label} if ($hgnc_file && defined($hgnc_descriptions{$gene_label}));
          push @{$xref->{DEPENDENT_XREFS}}, \%depe;
          $dependent_xrefs{"Uniprot_gn"}++;
        }

        if ( $gn =~ /Synonyms=(.*)/s ) { # use of /s as synonyms can be across more than one line
          # Example line
          # GN   Synonyms=cela2a {ECO:0000313|Ensembl:ENSXETP00000014934},
          # GN   MGC79767 {ECO:0000313|EMBL:AAH80976.1}
          my $syn = $1;
          # Remove each evidence code on its own; a greedy match would
          # also delete every synonym between the first and the last one.
          $syn =~ s/\{.*?\}//g;
          $syn =~ s/\n//g;    # Remove line feeds, as entry can span several lines
          $syn =~ s/\s+$//;   # Remove white spaces that are left over at the end
          # Continuation lines are glued without a blank, so the blank
          # after a comma is optional.
          push @{ $depe{"SYNONYMS"} }, grep { length } split /\s*,\s*/, $syn;
        }
      }
    }

    # dependent xrefs - only store those that are from sources listed in the source table
    my %seen = ();  # per record basis

    DR_LINE:
    foreach my $dep ( split /\n/, $record ) {
      next DR_LINE if $dep !~ /^DR\s+(.+)/;
      my $dr_fields = $1;

      #both GO and UniGene have the own sources so ignore those in the uniprot files
      #as the uniprot data should be older
      # NOTE(review): this tests the whole line, so any DR line that merely
      # contains "GO" is skipped too - confirm whether only the source
      # name should be tested.
      next DR_LINE if ( $dep =~ /GO/ || $dep =~ /UniGene/ );

      my ( $source, $dep_acc, @extra ) = split /;\s*/, $dr_fields;
      next DR_LINE if !defined $source || !defined $dep_acc;

      next DR_LINE if $source =~ $skipped_source_re;

      # If mapped to Ensembl, add as direct xref
      if ( $source eq "Ensembl" && defined $extra[0] ) {
        # Example line:
        # DR   Ensembl; ENST00000380152; ENSP00000369497; ENSG00000139618.
        # DR   Ensembl; ENST00000372839; ENSP00000361930; ENSG00000166913. [P31946-1]
        # $source is Ensembl, $dep_acc is ENST00000380152 and @extra is the rest of the line
        # If the UniProt accession is repeated here, it links to a specific isoform
        my %direct;

        my $stable_id = $extra[0];
        $stable_id =~ s/\.[0-9]+//;
        $direct{STABLE_ID} = $stable_id;
        $direct{ENSEMBL_TYPE} = 'Translation';
        $direct{LINKAGE_TYPE} = 'DIRECT';
        if ( $xref->{SOURCE_ID} == $sp_source_id ) {
          $direct{SOURCE_ID} = $sp_direct_source_id;
          $num_direct_sp++;
        } else {
          $direct{SOURCE_ID} = $sptr_direct_source_id;
          $num_direct_sptr++;
        }
        push @{$xref->{DIRECT_XREFS}}, \%direct;

        my $uniprot_acc = $accessions[0];
        if ( defined $extra[1] && $extra[1] =~ /(\Q$uniprot_acc\E-[0-9]+)/ ) {
          my $isoform = $1;
          $self->add_to_direct_xrefs({
            stable_id  => $stable_id,
            type       => 'translation',
            acc        => $isoform,
            label      => $isoform,
            dbi        => $dbi,
            source_id  => $isoform_source_id,
            linkage    => 'DIRECT',
            species_id => $species_id
          });
        }
      }

      next DR_LINE if !exists $dependent_sources{$source};

      # create dependent xref structure & store it
      my %dep;
      $dep{SOURCE_NAME} = $source;
      $dep{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
      $dep{SOURCE_ID} = $dependent_sources{$source};
      $dep{ACCESSION} = $dep_acc;

      $dependent_xrefs{ $dep{SOURCE_NAME} }++; # get count of dependent xrefs.
      if ( !defined( $seen{ $source . ":" . $dep_acc } ) ) {
        push @{$xref->{DEPENDENT_XREFS}}, \%dep; # array of hashrefs
        $seen{ $source . ":" . $dep_acc } = 1;
      }

      # EMBL lines also carry a protein id, stored as a second xref.
      if ( $dep =~ /EMBL/ && !( $dep =~ /ChEMBL/ ) ) {
        my $protein_id = $extra[0];
        if ( defined $protein_id
             and ( $protein_id ne "-" )
             and ( !defined( $seen{ $source . ":" . $protein_id } ) ) ) {
          my %dep2;
          $dep2{SOURCE_NAME} = $source;
          $dep2{SOURCE_ID} = $dependent_sources{"protein_id"};
          $dep2{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
          # store accession unversioned
          $dep2{LABEL} = $protein_id;
          my ($prot_acc) = $protein_id =~ /([^.]+)\.[^.]+/;
          $dep2{ACCESSION} = $prot_acc;
          $dependent_xrefs{ $dep2{SOURCE_NAME} }++; # get count of dependent xrefs.
          $seen{ $source . ":" . $protein_id } = 1;
          push @{$xref->{DEPENDENT_XREFS}}, \%dep2; # array of hashrefs
        }
      }
    }

    push @xrefs, $xref;

    # Upload in batches to keep the in-memory list small.
    if ( $count > 1000 ) {
      $self->upload_xref_object_graphs( \@xrefs, $dbi );
      $count = 0;
      undef @xrefs;
    }

  }

  $self->upload_xref_object_graphs( \@xrefs, $dbi ) if scalar(@xrefs) > 0;

  $uniprot_io->close();

  print "Read $num_sp SwissProt xrefs, $num_sptr SPTrEMBL xrefs with protein evidence codes 1-2, and $num_sptr_non_display SPTrEMBL xrefs with protein evidence codes > 2 from $file\n" if($verbose);
  print "Added $num_direct_sp direct SwissProt xrefs and $num_direct_sptr direct SPTrEMBL xrefs\n" if ($verbose);
  print "Skipped $ensembl_derived_protein_count ensembl annotations as Gene names\n";

  print "Added the following dependent xrefs:-\n" if($verbose);
  foreach my $key (keys %dependent_xrefs){
    print $key."\t".$dependent_xrefs{$key}."\n" if($verbose);
  }
  print "End.\n" if ($verbose);

  return;
}
578

579
# Return the accession to store for a gene-name dependent xref.
# The label is accepted as the third argument but plays no part in the
# result: the accession is handed back unchanged.
sub get_name {
  my ($self, $acc, $label) = @_;

  return $acc;
}
586

587
# Build a gene symbol => description lookup from a tab-separated HGNC file.
#
# The file is opened through get_filehandle(), read completely, stripped
# of double quotes and parsed with Text::CSV. The first row provides the
# column names; for every following row the 'Approved symbol' column is
# mapped to the 'Approved name' column.
#
# Arguments : $hgnc_file - path handed to get_filehandle()
# Returns   : hash (list context) of symbol => name; the name is undef
#             when the column is empty. Rows without a symbol are skipped
#             and a file without a header row yields an empty hash.
# Dies      : confess when the file cannot be opened, croak when the CSV
#             parser cannot be created.
sub get_hgnc_descriptions {
  my ($self, $hgnc_file) = @_;
  my %descriptions;

  my $hgnc_fh = $self->get_filehandle($hgnc_file);
  if ( !defined $hgnc_fh ) {confess "Can't open HGNC file '$hgnc_file'\n";}

  # Slurp into its own variable so $hgnc_file keeps holding the path for
  # the error message below, and release the handle once it is read.
  my $content = do { local $/; <$hgnc_fh> };
  $hgnc_fh->close();
  $content = q{} if !defined $content;

  my $csv = Text::CSV->new({
    sep_char       => "\t",
    empty_is_undef => 1,
    binary         => 1,
    auto_diag      => 1
  }) or croak "Cannot use file $hgnc_file: ".Text::CSV->error_diag ();

  # Encode is not loaded at the top of this file, so load it here.
  # NOTE(review): if get_filehandle() returns raw bytes this encodes
  # already-encoded UTF-8 a second time - confirm what layer it applies.
  require Encode;
  $content = Encode::encode("UTF-8", $content);
  $content =~ tr/"//d;

  open my $hgnc_io, '<', \$content or confess "Can't open HGNC file: $!\n";

  # An empty file has no header row: nothing to report.
  my $header = $csv->getline( $hgnc_io );
  if ( !defined $header ) {
    close $hgnc_io;
    return %descriptions;
  }
  $csv->column_names( @{$header} );

  while ( my $data = $csv->getline_hr( $hgnc_io ) ) {
    my $gene_name = $data->{'Approved symbol'};
    # empty_is_undef turns an empty symbol into undef: not usable as key
    next if !defined $gene_name;

    $descriptions{$gene_name} = $data->{'Approved name'};
  }

  close $hgnc_io;

  return %descriptions;
}
620

621
1;
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc