#625445784

Committed 23 Aug 2024 05:53PM UTC coverage: 82.254%. First build
Build # #625445784
Build Type
Pull #716
travis-ci
Pull Request Pull Request #716: Xref Changes and Fixes (release/114)
Run Details
18 of 36 new or added lines in 2 files covered. (50.0%)
32808 of 39886 relevant lines covered (82.25%)
820.52 hits per line
Source File
Press 'n' to go to next uncovered line, 'b' for previous

63.38
/misc-scripts/xref_mapping/XrefParser/UniProtParser.pm
=head1 LICENSE

See the NOTICE file distributed with this work for additional information
regarding copyright ownership.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut

# Parse UniProt (SwissProt & SPTrEMBL) files to create xrefs.
#
# Files actually contain both types of xref, distinguished by ID line;
#
# ID   CYC_PIG                 Reviewed;         104 AA.  Swissprot
# ID   Q3ASY8_CHLCH            Unreviewed;     36805 AA.  SPTrEMBL



package XrefParser::UniProtParser;

use strict;
use warnings;
use Carp;
use POSIX qw(strftime);
use File::Basename;
use Text::CSV;

use base qw( XrefParser::BaseParser );

my $hgnc_file;

# Entry point of the parser.
#
# Takes a hash reference with the keys:
#   source_id  - required (only checked for definedness here)
#   species_id - required; species whose UniProt records are loaded
#   files      - required; array reference, only the first element is parsed
#   rel_file   - optional; file scanned for the Swiss-Prot / TrEMBL release lines
#   verbose    - optional; true to print progress information
#   dbi        - optional; defaults to $self->dbi
#   hgnc_file  - optional; HGNC file used to describe gene-name xrefs
#
# Croaks when one of the required keys is missing. Returns 0 on success.
sub run {

  my ($self, $ref_arg) = @_;
  my $source_id    = $ref_arg->{source_id};
  my $species_id   = $ref_arg->{species_id};
  my $files        = $ref_arg->{files};
  my $release_file = $ref_arg->{rel_file};
  my $verbose      = $ref_arg->{verbose};
  my $dbi          = $ref_arg->{dbi};
  $dbi = $self->dbi unless defined $dbi;

  # File-level lexical, read later by create_xrefs()
  $hgnc_file = $ref_arg->{hgnc_file} || undef;

  if((!defined $source_id) or (!defined $species_id) or (!defined $files)){
    croak "Need to pass source_id, species_id, files and rel_file as pairs";
  }
  $verbose |=0;

  my $file = $files->[0];

  my ( $sp_release, $sptr_release );

  my $sp_source_id =
    $self->get_source_id_for_source_name('Uniprot/SWISSPROT','sequence_mapped', $dbi);
  my $sptr_source_id =
    $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'sequence_mapped', $dbi);

  my $sptr_non_display_source_id =
    $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'protein_evidence_gt_2', $dbi);

  my $sp_direct_source_id =
    $self->get_source_id_for_source_name('Uniprot/SWISSPROT', 'direct', $dbi);
  my $sptr_direct_source_id =
    $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'direct', $dbi);

  # No priority description for this source; pass the handle explicitly so the
  # lookup uses the same database as every other lookup in this sub.
  my $isoform_source_id =
    $self->get_source_id_for_source_name('Uniprot_isoform', undef, $dbi);

  if ($verbose) {
    print "SwissProt source id for $file: $sp_source_id\n";
    print "SpTREMBL source id for $file: $sptr_source_id\n";
    print "SpTREMBL protein_evidence > 2 source id for $file: $sptr_non_display_source_id\n";
    print "SwissProt direct source id for $file: $sp_direct_source_id\n";
    print "SpTREMBL direct source id for $file: $sptr_direct_source_id\n";
  }

  $self->create_xrefs( $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id,
      $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi );

  if ( defined $release_file ) {
    # Parse Swiss-Prot and SpTrEMBL release info from
    # $release_file.
    my $release_io = $self->get_filehandle($release_file);
    while ( defined( my $line = $release_io->getline() ) ) {
      if ( $line =~ m#(UniProtKB/Swiss-Prot Release .*)# ) {
        $sp_release = $1;
        print "Swiss-Prot release is '$sp_release'\n" if($verbose);
      } elsif ( $line =~ m#(UniProtKB/TrEMBL Release .*)# ) {
        $sptr_release = $1;
        print "SpTrEMBL release is '$sptr_release'\n" if($verbose);
      }
    }
    $release_io->close();

    # Set releases; every Swiss-Prot source gets the Swiss-Prot release,
    # every TrEMBL source the TrEMBL release.
    $self->set_release( $sp_source_id,               $sp_release,   $dbi );
    $self->set_release( $sptr_source_id,             $sptr_release, $dbi );
    $self->set_release( $sptr_non_display_source_id, $sptr_release, $dbi );
    $self->set_release( $sp_direct_source_id,        $sp_release,   $dbi );
    $self->set_release( $sptr_direct_source_id,      $sptr_release, $dbi );
  }

  return 0; # successful
}


# --------------------------------------------------------------------------------
# Parse file into xref objects and upload them in batches

# Read the UniProt flat file $file one record at a time (records end with
# "//\n"), build an xref hash for every record whose OX taxonomy id belongs to
# $species_id, and pass the hashes to upload_xref_object_graphs() in batches.
#
# Positional arguments:
#   $sp_source_id               - source id given to "Reviewed" entries
#   $sptr_source_id             - source id given to "Unreviewed" entries with PE < 3
#   $sptr_non_display_source_id - source id given to all other "Unreviewed" entries
#   $species_id                 - species id stored on every xref and used to pick records
#   $file                       - file handed to get_filehandle()
#   $verbose                    - true to print summary statistics
#   $sp_direct_source_id        - source id of direct xrefs for Swiss-Prot entries
#   $sptr_direct_source_id      - source id of direct xrefs for all other entries
#   $isoform_source_id          - source id of isoform direct xrefs
#   $dbi                        - database handle forwarded to the helper methods
#
# Returns nothing. Returns early, without parsing, when $file cannot be opened.
sub create_xrefs {
  my ($self, $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id, $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi ) = @_;

  my $num_sp = 0;
  my $num_sptr = 0;
  my $num_sp_pred = 0;
  my $num_sptr_pred = 0;
  my $num_sptr_non_display = 0;
  my $num_direct_sp = 0;
  my $num_direct_sptr = 0;

  my %dependent_sources = $self->get_xref_sources($dbi);

  # NOTE(review): neither map is read below; the lookups are kept in case
  # callers rely on them being executed - confirm before removing.
  my (%genemap) =
    %{ $self->get_valid_codes( "mim_gene", $species_id, $dbi ) };
  my (%morbidmap) =
    %{ $self->get_valid_codes( "mim_morbid", $species_id, $dbi ) };

  # Extract descriptions from hgnc
  my %hgnc_descriptions;
  if ($hgnc_file) {
    %hgnc_descriptions = $self->get_hgnc_descriptions($hgnc_file);
  }

  my $uniprot_io = $self->get_filehandle($file);
  if ( !defined $uniprot_io ) { return }

  my @xrefs;

  # Read whole UniProt records: each one ends with a "//" line
  local $/ = "//\n";

  # Create a hash of all valid taxon_ids for this species
  my %species2tax = $self->species_id2taxonomy($dbi);
  push @{$species2tax{$species_id}}, $species_id;
  my @tax_ids = @{$species2tax{$species_id}};
  my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids;

  # DR sources that are never taken from the UniProt file. The pattern is
  # matched unanchored against the source name, as the individual checks were.
  my $skipped_source_re = qr{
      RGD            # using RGD file now instead
    | CCDS
    | IPI
    | UCSC
    | SGD
    | HGNC
    | MGI            # we get the mappings directly from the source
    | VGNC           # nomenclature data is imported directly from the source
    | Orphanet       # parsed from Orphanet itself, with descriptions
    | ArrayExpress
    | GenomeRNAi
    | EPD
    | Xenbase
    | Reactome       # UniProt gets these from Reactome, so do we
    | MIM            # MIM, MIM_GENE, MIM_MORBID: imported separately, gene level
    | GeneCards      # imported through the HGNC file
  }x;

  my %dependent_xrefs;
  my $ensembl_derived_protein_count = 0;

  # Counter to process file in batches
  my $count = 0;

  while ( my $record = $uniprot_io->getline() ) {

    # if an OX line exists, only store the xref if the taxonomy ID that the OX
    # line refers to is in the species table
    # due to some records having more than one tax_id, we need to check them
    # all and only proceed if one of them matches.
    #OX   NCBI_TaxID=158878, 158879;
    #OX   NCBI_TaxID=103690;

    my ($ox) = $record =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/;
    my $found = 0;

    if ( defined $ox ) {
      foreach my $taxon_id_from_file (split /\, /, $ox) {
        $taxon_id_from_file =~ s/\s//g;
        if ( exists $taxonomy2species_id{$taxon_id_from_file} ){
          $found = 1;
          $count++;
        }
      }
    }

    next if (!$found); # no taxon_id's match, so skip to next record
    my $xref;

    # set accession (and synonyms if more than one)
    # AC line may have primary accession and possibly several ; separated synonyms
    # May also be more than one AC line
    my ($acc) = $record =~ /(\nAC\s+.+)/s; # will match first AC line and everything else

    my @all_lines = split /\n/, $acc;

    # Check for CC (caution) lines containing certain text
    # If sequence is from Ensembl, do not use
    my $ensembl_derived_protein = 0;
    if ($record =~ /CAUTION: The sequence shown here is derived from an Ensembl/) {
      $ensembl_derived_protein = 1;
      $ensembl_derived_protein_count++;
    }

    # extract ^AC lines only & build list of accessions
    my @accessions;
    foreach my $line (@all_lines) {
      my ($accessions_only) = $line =~ /^AC\s+(.+)/;
      push(@accessions, (split /;\s*/, $accessions_only)) if ($accessions_only);
    }

    if(lc($accessions[0]) eq "unreviewed"){
      print "WARNING: entries with accession of $acc not allowed will be skipped\n";
      next;
    }
    $xref->{INFO_TYPE} = "SEQUENCE_MATCH";
    $xref->{ACCESSION} = $accessions[0];
    # every accession after the primary one is stored as a synonym
    foreach my $secondary (@accessions[1 .. $#accessions]) {
      push(@{$xref->{"SYNONYMS"} }, $secondary);
    }

    # First word after ID is the entry name (unused), second the review status
    my (undef, $sp_type) = $record =~ /ID\s+(\w+)\s+(\w+)/;
    my ($protein_evidence_code) = $record =~ /PE\s+(\d+)/;
    # Capture line with entry version
    # Example: DT   22-APR-2020, entry version 1.
    my ($version) = $record =~ /DT\s+\d+-\w+-\d+, entry version (\d+)/;

    # SwissProt/SPTrEMBL are differentiated by having Reviewed/Unreviewed here
    if ($sp_type =~ /^Reviewed/i) {

      $xref->{SOURCE_ID} = $sp_source_id;
      $num_sp++;
    } elsif ($sp_type =~ /Unreviewed/i) {

      #Use normal source only if it is PE levels 1 & 2
      if (defined($protein_evidence_code) && $protein_evidence_code < 3) {
        $xref->{SOURCE_ID} = $sptr_source_id;
        $num_sptr++;
      } else {
        $xref->{SOURCE_ID} = $sptr_non_display_source_id;
        $num_sptr_non_display++;
      }

    } else {

      next; # ignore if it's neither one nor t'other

    }

    # some straightforward fields
    # the previous $label flag of type BRCA2_HUMAN is not used in Uniprot any more, use accession instead
    $xref->{LABEL} = $accessions[0] ."." . $version;
    $xref->{VERSION} = $version;
    $xref->{SPECIES_ID} = $species_id;
    $xref->{SEQUENCE_TYPE} = 'peptide';
    $xref->{STATUS} = 'experimental';

    # May have multi-line descriptions
    my ($description_and_rest) = $record =~ /(DE\s+.*)/s;
    @all_lines = split /\n/, $description_and_rest;

    # extract ^DE lines only & build cumulative description string
    my $description = "";
    my $name        = "";

    foreach my $line (@all_lines) {

      next if(!($line =~ /^DE/));

      # get the data
      if($line =~ /^DE   RecName: Full=(.*);/){
        $name .= '; ' if $name ne q{}; #separate multiple sub-names with a '; '
        $name .= $1;
      }
      elsif($line =~ /RecName: Full=(.*);/){
        $description .= ' ' if $description ne q{}; #separate the description bit with just a space
        $description .= $1;
      }
      elsif($line =~ /SubName: Full=(.*);/){
        $name .= '; ' if $name ne q{}; #separate multiple sub-names with a '; '
        $name .= $1;
      }

      $description =~ s/^\s*//g;
      $description =~ s/\s*$//g;

      # The description is rebuilt after every DE line; the value left after
      # the last DE line is the one that is stored.
      my $desc = $name.' '.$description;
      $desc =~ s/\s*\{ECO:.*?\}//g; # drop evidence tags
      $xref->{DESCRIPTION} = $desc;

      # Parse the EC_NUMBER line, only for S.cerevisiae for now
      if (($line =~ /EC=/) && ($species_id == 4932)) {

        # Only build the xref when the EC number was really captured; a
        # failed match must not create an xref without an accession.
        if ($line =~ /^DE\s+EC=([^;]+);/) {
          my $EC = $1;
          $EC =~ s/\s*\{ECO:.*?\}//g; # the EC number may carry an evidence tag

          my %depe;
          $depe{LABEL} = $EC;
          $depe{ACCESSION} = $EC;

          $depe{SOURCE_NAME} = "EC_NUMBER";

          $depe{SOURCE_ID} = $dependent_sources{"EC_NUMBER"};
          $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
          push @{$xref->{DEPENDENT_XREFS}}, \%depe;
          $dependent_xrefs{"EC_NUMBER"}++;
        }
      }

    }

    # extract sequence
    my ($seq) = $record =~ /SQ\s+(.+)/s; # /s allows . to match newline
    my $parsed_seq = join '', split /\n/, $seq;
    $parsed_seq =~ s/\/\///g;   # remove trailing end-of-record character
    $parsed_seq =~ s/\s//g;     # remove whitespace
    $parsed_seq =~ s/^.*;//g;   # remove everything before last ;

    $xref->{SEQUENCE} = $parsed_seq;

    # Collect the first run of consecutive GN lines, join them into a single
    # string and split that string into its ';' separated fields.
    my ($gns) = $record =~ /(GN\s+.+)/s;
    my @gn_lines = ();
    if (defined $gns) {
      foreach my $gn_line (split("\n", $gns)) {
        if ($gn_line !~ /^GN/) {last;}

        $gn_line =~ s/^GN\s+//g;
        push(@gn_lines, $gn_line);
      }

      $gns = join('', @gn_lines);
      @gn_lines = split /;/, $gns;
    }

    # Do not allow the addition of UniProt Gene Name dependent Xrefs
    # if the protein was imported from Ensembl. Otherwise we will
    # re-import previously set symbols
    if(! $ensembl_derived_protein) {
      # Hash of the gene name currently being filled in. Name= and Synonyms=
      # arrive as separate fields, so the hash outlives one loop iteration.
      my $depe = {};
      foreach my $gn (@gn_lines){

        if ($gn =~ /Name=([A-Za-z0-9_\-\.\s:]+)/s) { #/s for multi-line entries ; is the delimiter
# Example line
# GN   Name=ctrc {ECO:0000313|Xenbase:XB-GENE-5790348};
          my $name = $1;
          $name =~ s/\s+$//g; # Remove white spaces that are left over at the end if there was an evidence code

          # A record can name several genes; give each one its own hash so a
          # later name does not overwrite an xref that was already stored.
          $depe = {} if exists $depe->{LABEL};

          $depe->{LABEL} = $name; # leave name as is, upper/lower case is relevant in gene names
          $depe->{ACCESSION} = $self->get_name($xref->{ACCESSION},$depe->{LABEL});

          $depe->{SOURCE_NAME} = "Uniprot_gn";
          $depe->{SOURCE_ID} = $dependent_sources{"Uniprot_gn"};
          $depe->{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
          $depe->{DESCRIPTION} = $hgnc_descriptions{$name} if ($hgnc_file && defined($hgnc_descriptions{$name}));
          push @{$xref->{DEPENDENT_XREFS}}, $depe;
          $dependent_xrefs{"Uniprot_gn"}++;
        }

        if($gn =~ /Synonyms=(.*)/s){ # use of /s as synonyms can be across more than one line
# Example line
# GN   Synonyms=cela2a {ECO:0000313|Ensembl:ENSXETP00000014934},
# GN   MGC79767 {ECO:0000313|EMBL:AAH80976.1}
          my $syn = $1;
          $syn =~ s/\{[^}]*\}//g; # Remove each evidence code on its own, keeping the synonyms in between
          $syn =~ s/\n//g;    # Remove return carriages, as entry can span several lines
          $syn =~ s/\s+$//g;  # Remove white spaces that are left over at the end if there was an evidence code
          $syn =~ s/\s+,/,/g;  # Remove white spaces that are left over before the comma if there was an evidence code
          # Continuation lines are joined without a space, so the blank after
          # the comma is optional.
          push (@{$depe->{"SYNONYMS"}}, split(/,\s*/, $syn));
        }
      }
    }

    # dependent xrefs - only store those that are from sources listed in the source table
    my ($deps) = $record =~ /(DR\s+.+)/s; # /s allows . to match newline

    my @dep_lines = ();
    if ( defined $deps ) { @dep_lines = split /\n/, $deps }

    my %seen=();  # per record basis

    foreach my $dep (@dep_lines) {
      #both GO and UniGene have the own sources so ignore those in the uniprot files
      #as the uniprot data should be older
      if($dep =~ /GO/ || $dep =~ /UniGene/){
        next;
      }
      if ($dep =~ /^DR\s+(.+)/) {
        my ($source, $dep_acc, @extra) = split /;\s*/, $1;

        next if $source =~ $skipped_source_re;

# If mapped to Ensembl, add as direct xref
        if ($source eq "Ensembl") {
# Example line:
# DR   Ensembl; ENST00000380152; ENSP00000369497; ENSG00000139618.
# DR   Ensembl; ENST00000372839; ENSP00000361930; ENSG00000166913. [P31946-1]
# $source is Ensembl, $dep_acc is ENST00000380152 and @extra is the rest of the line
# If the UniProt accession is repeated here, it links to a specific isoform
          my %direct;

          my $stable_id = $extra[0];
          $stable_id =~ s/\.[0-9]+//;
          $direct{STABLE_ID} = $stable_id;
          $direct{ENSEMBL_TYPE} = 'Translation';
          $direct{LINKAGE_TYPE} = 'DIRECT';
          if ($xref->{SOURCE_ID} == $sp_source_id) {
            $direct{SOURCE_ID} = $sp_direct_source_id;
            $num_direct_sp++;
          } else {
            $direct{SOURCE_ID} = $sptr_direct_source_id;
            $num_direct_sptr++;
          }
          push @{$xref->{DIRECT_XREFS}}, \%direct;

          # The last field may be missing; the accession is quoted so it is
          # always matched literally.
          if (defined $extra[1] && $extra[1] =~ /(\Q$accessions[0]\E-[0-9]+)/) {
            my $isoform = $1;
            $self->add_to_direct_xrefs({
              stable_id  => $stable_id,
              type       => 'translation',
              acc        => $isoform,
              label      => $isoform,
              dbi        => $dbi,
              source_id  => $isoform_source_id,
              linkage    => 'DIRECT',
              species_id => $species_id
            });
          }
        }
        if (exists $dependent_sources{$source} ) {
          # create dependent xref structure & store it
          my %dep;
          $dep{SOURCE_NAME} = $source;
          $dep{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
          $dep{SOURCE_ID} = $dependent_sources{$source};
          $dep{ACCESSION} = $dep_acc;

          $dependent_xrefs{ $dep{SOURCE_NAME} }++; # get count of depenent xrefs.
          if(!defined($seen{$dep{SOURCE_NAME}.":".$dep{ACCESSION}})){
            push @{$xref->{DEPENDENT_XREFS}}, \%dep; # array of hashrefs
            $seen{$dep{SOURCE_NAME}.":".$dep{ACCESSION}} =1;
          }
          # EMBL lines also carry a protein id in the next field; it is stored
          # as a second dependent xref under the "protein_id" source id.
          if($dep =~ /EMBL/ && !($dep =~ /ChEMBL/)){
            my ($protein_id) = $extra[0];
            if(($protein_id ne "-") and (!defined($seen{$source.":".$protein_id}))){
              my %dep2;
              $dep2{SOURCE_NAME} = $source;
              $dep2{SOURCE_ID} = $dependent_sources{"protein_id"};
              $dep2{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
              # store accession unversioned
              $dep2{LABEL} = $protein_id;
              my ($prot_acc, $prot_version) = $protein_id =~ /([^.]+)\.([^.]+)/;
              $dep2{ACCESSION} = $prot_acc;
              $dependent_xrefs{ $dep2{SOURCE_NAME} }++; # get count of dependent xrefs.
              $seen{$source.":".$protein_id} = 1;
              push @{$xref->{DEPENDENT_XREFS}}, \%dep2; # array of hashrefs
            }
          }
        }
      }
    }

    push @xrefs, $xref;

    # Upload in batches so the whole file is never held in memory
    if ($count > 1000) {
      $self->upload_xref_object_graphs(\@xrefs, $dbi);
      $count = 0;
      undef @xrefs;
    }

  }

  $self->upload_xref_object_graphs(\@xrefs, $dbi) if scalar(@xrefs) > 0;

  $uniprot_io->close();

  print "Read $num_sp SwissProt xrefs, $num_sptr SPTrEMBL xrefs with protein evidence codes 1-2, and $num_sptr_non_display SPTrEMBL xrefs with protein evidence codes > 2 from $file\n" if($verbose);
  print "Added $num_direct_sp direct SwissProt xrefs and $num_direct_sptr direct SPTrEMBL xrefs\n" if ($verbose);
  print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if (($num_sp_pred > 0 || $num_sptr_pred > 0) and $verbose);
  print "Skipped $ensembl_derived_protein_count ensembl annotations as Gene names\n";

  print "Added the following dependent xrefs:-\n" if($verbose);
  foreach my $key (keys %dependent_xrefs){
    print $key."\t".$dependent_xrefs{$key}."\n" if($verbose);
  }
  print "End.\n" if ($verbose);

  return;
}

# Return the accession to store for a gene-name xref.
# This implementation ignores $label and hands back $acc unchanged.
sub get_name {
  my ($self, $acc, $label) = @_;

  return $acc;
}

# Build a map from HGNC gene symbol to gene description.
#
# $hgnc_file is opened through get_filehandle() and read as tab separated
# text whose first line holds the column names; the columns
# 'Approved symbol' and 'Approved name' are used. Double quotes are removed
# from the content before it is parsed.
#
# Returns a hash (as a list) of symbol => description.
# Dies when the file cannot be opened or has no header line.
sub get_hgnc_descriptions {
  my ($self, $hgnc_file) = @_;
  my %descriptions;

  # Encode is not loaded at the top of this file, so load it here
  require Encode;

  my $hgnc_fh = $self->get_filehandle($hgnc_file);
  if ( !defined $hgnc_fh ) {confess "Can't open HGNC file '$hgnc_file'\n";}

  # Slurp into its own variable so $hgnc_file keeps naming the file in the
  # messages below, then release the handle.
  my $hgnc_content = do { local $/; <$hgnc_fh> };
  $hgnc_fh->close();
  $hgnc_content = q{} if !defined $hgnc_content;

  my $input_file = Text::CSV->new({
    sep_char       => "\t",
    empty_is_undef => 1,
    binary         => 1,
    auto_diag      => 1
  }) or croak "Cannot use file $hgnc_file: ".Text::CSV->error_diag ();

  $hgnc_content = Encode::encode("UTF-8", $hgnc_content);
  $hgnc_content =~ s/"//xg;

  # Re-read the cleaned content through an in-memory handle
  open my $hgnc_io, '<', \$hgnc_content or confess "Can't open HGNC file: $!\n";

  my $header = $input_file->getline( $hgnc_io );
  if ( !defined $header ) {
    close $hgnc_io;
    confess "HGNC file '$hgnc_file' has no header line\n";
  }
  $input_file->column_names( @{$header} );

  while ( my $data = $input_file->getline_hr( $hgnc_io ) ) {
    my $gene_name = $data->{'Approved symbol'};

    # empty_is_undef turns an empty symbol into undef; such a row cannot be
    # looked up by name
    next if !defined $gene_name;

    $descriptions{$gene_name} = $data->{'Approved name'};
  }

  close $hgnc_io;

  return %descriptions;
}

1;
Ensembl / ensembl / #625445784

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous