• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

openvax / isovar / 11183330050

04 Oct 2024 04:14PM UTC coverage: 93.036% (-5.4%) from 98.446%
11183330050

push

github

iskandr
Merge branch 'master' of https://github.com/openvax/isovar

2084 of 2240 relevant lines covered (93.04%)

2.79 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.65
/isovar/reference_coding_sequence_key.py
1
# Licensed under the Apache License, Version 2.0 (the "License");
2
# you may not use this file except in compliance with the License.
3
# You may obtain a copy of the License at
4
#
5
#     http://www.apache.org/licenses/LICENSE-2.0
6
#
7
# Unless required by applicable law or agreed to in writing, software
8
# distributed under the License is distributed on an "AS IS" BASIS,
9
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
# See the License for the specific language governing permissions and
11
# limitations under the License.
12

13
from __future__ import print_function, division, absolute_import
3✔
14

15
from .logging import get_logger
3✔
16
from .reference_sequence_key import ReferenceSequenceKey
3✔
17
from .variant_helpers import interbase_range_affected_by_variant_on_transcript
3✔
18

19
# Module-level logger obtained from the package's get_logger helper,
# named after this module.
logger = get_logger(__name__)
3✔
20

21

22
class ReferenceCodingSequenceKey(ReferenceSequenceKey):
    """
    ReferenceCodingSequenceKey includes all the fields of a ReferenceSequenceKey,
    and additionally tracks the reading frame and information about where the
    start codon and 5' UTR are relative to this sequence fragment.

    Instances are normally built through the alternate constructors
    `from_variant_and_transcript` or
    `from_variant_and_transcript_and_sequence_key`, both of which may return
    None when the variant/transcript pair cannot be given a reading frame.
    """

    # additional fields on top of ReferenceSequenceKey
    __slots__ = [
        # if the reference context includes the 5' UTR then
        # this is the offset to the start codon, otherwise it's the
        # offset needed to get the first base of a codon
        "offset_to_first_complete_codon",
        # does this context overlap a start codon?
        "overlaps_start_codon",
        # does this context contain the whole trinucleotide start codon?
        "contains_start_codon",
        # does this context contain any UTR bases?
        "contains_five_prime_utr",
        # translation of complete codons in the reference context
        # before the variant
        "amino_acids_before_variant"
    ]

    def __init__(
            self,
            strand,
            sequence_before_variant_locus,
            sequence_at_variant_locus,
            sequence_after_variant_locus,
            offset_to_first_complete_codon,
            contains_start_codon,
            overlaps_start_codon,
            contains_five_prime_utr,
            amino_acids_before_variant):
        """
        Parameters
        ----------
        strand
        sequence_before_variant_locus
        sequence_at_variant_locus
        sequence_after_variant_locus
            Forwarded unchanged to ReferenceSequenceKey.__init__.

        offset_to_first_complete_codon : int
            Number of bases at the start of the sequence which precede the
            first complete codon (or the start codon, if it is contained).

        contains_start_codon : bool
            Does the sequence contain the whole start codon?

        overlaps_start_codon : bool
            Does the sequence include at least one base of the start codon?

        contains_five_prime_utr : bool
            Does the sequence include any bases before the start codon?

        amino_acids_before_variant
            Translation of the complete codons before the variant; the
            alternate constructors pass a slice of
            `transcript.protein_sequence` (presumably a str -- confirm
            against pyensembl).
        """
        ReferenceSequenceKey.__init__(
            self,
            strand=strand,
            sequence_before_variant_locus=sequence_before_variant_locus,
            sequence_at_variant_locus=sequence_at_variant_locus,
            sequence_after_variant_locus=sequence_after_variant_locus)
        self.offset_to_first_complete_codon = offset_to_first_complete_codon
        self.overlaps_start_codon = overlaps_start_codon
        self.contains_start_codon = contains_start_codon
        self.contains_five_prime_utr = contains_five_prime_utr
        self.amino_acids_before_variant = amino_acids_before_variant

    @classmethod
    def from_variant_and_transcript_and_sequence_key(
            cls, variant, transcript, sequence_key):
        """
        Assuming that the transcript has a coding sequence, take a
        ReferenceSequenceKey (region of the transcript around the variant) and
        return a ReferenceCodingSequenceKey (or None).

        Parameters
        ----------
        variant : varcode.Variant

        transcript : pyensembl.Transcript
            Must expose `start_codon_spliced_offsets`,
            `stop_codon_spliced_offsets`, `protein_sequence` and `name`.

        sequence_key : ReferenceSequenceKey
            Reference sequence context around the variant on this transcript.

        Returns None if the variant starts before the end of the start codon
        (5' UTR or start-loss) or at/after the end of the stop codon (3' UTR).
        """
        # get the interbase range of offsets which capture all reference
        # bases modified by the variant
        variant_start_offset, variant_end_offset = \
            interbase_range_affected_by_variant_on_transcript(
                variant=variant,
                transcript=transcript)

        # smallest spliced offset of the start codon, used as the position
        # of its first base
        start_codon_idx = min(transcript.start_codon_spliced_offsets)

        # skip any variants which occur in the 5' UTR or overlap the start codon
        # since
        #   (1) UTR variants have unclear coding effects and
        #   (2) start-loss variants may result in a new start codon / reading
        #       frame but we're not sure which one!
        if variant_start_offset < start_codon_idx + 3:
            logger.info(
                "Skipping transcript %s for variant %s, must be after start codon",
                transcript.name,
                variant)
            return None

        stop_codon_idx = min(transcript.stop_codon_spliced_offsets)

        # skip variants which affect the 3' UTR of the transcript since
        # they don't have obvious coding effects on the protein sequence
        if variant_start_offset >= stop_codon_idx + 3:
            # log the transcript name, consistent with every other message
            # emitted by this class
            logger.info(
                "Skipping transcript %s for variant %s, occurs in 3' UTR",
                transcript.name,
                variant)
            return None

        n_prefix = len(sequence_key.sequence_before_variant_locus)
        # transcript offset of the first base of the sequence context
        prefix_start_idx = variant_start_offset - n_prefix
        n_bases_between_start_and_variant = variant_start_offset - start_codon_idx
        n_full_codons_before_variant = n_bases_between_start_and_variant // 3

        # if the sequence before the variant contains more bases than the
        # distance to the start codon, then by definition it must contain
        # some untranslated bases
        contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant)
        # allows for the possibility that the first base in the sequence might
        # be the first nucleotide of the start codon
        contains_start_codon = (n_prefix >= n_bases_between_start_and_variant)
        # the sequence context might only include the 2nd or 3rd bases of
        # the start codon
        overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3)

        if contains_start_codon:
            # the first complete codon is the start codon itself, so every
            # full codon between it and the variant is in the context
            offset_to_first_complete_codon = start_codon_idx - prefix_start_idx
            amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant]
        else:
            # the context begins inside the coding sequence: work out how far
            # into a codon its first base is and how many bases to skip to
            # reach the next codon boundary
            reading_frame = (prefix_start_idx - start_codon_idx) % 3
            offset_to_first_complete_codon = reading_frame_to_offset(reading_frame)
            n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3
            # only the last n_codons_in_prefix amino acids before the variant
            # are covered by the sequence context
            amino_acids_before_variant = transcript.protein_sequence[
                n_full_codons_before_variant - n_codons_in_prefix:
                n_full_codons_before_variant]

        return cls(
            strand=sequence_key.strand,
            sequence_before_variant_locus=sequence_key.sequence_before_variant_locus,
            sequence_at_variant_locus=sequence_key.sequence_at_variant_locus,
            sequence_after_variant_locus=sequence_key.sequence_after_variant_locus,
            offset_to_first_complete_codon=offset_to_first_complete_codon,
            contains_start_codon=contains_start_codon,
            overlaps_start_codon=overlaps_start_codon,
            contains_five_prime_utr=contains_five_prime_utr,
            amino_acids_before_variant=amino_acids_before_variant)

    @classmethod
    def from_variant_and_transcript(
            cls,
            variant,
            transcript,
            context_size):
        """
        Extracts the reference sequence around a variant locus on a particular
        transcript and determines the reading frame at the start of that
        sequence context.

        Parameters
        ----------
        variant : varcode.Variant

        transcript : pyensembl.Transcript

        context_size : int
            Passed through to ReferenceSequenceKey.from_variant_and_transcript.

        Returns ReferenceCodingSequenceKey object or None if Transcript lacks
        coding sequence, protein sequence or annotated start/stop codons, if
        no ReferenceSequenceKey could be built, or if the variant falls
        outside the region accepted by
        `from_variant_and_transcript_and_sequence_key`.
        """
        if not transcript.contains_start_codon:
            logger.info(
                "Expected transcript %s for variant %s to have start codon",
                transcript.name,
                variant)
            return None

        if not transcript.contains_stop_codon:
            logger.info(
                "Expected transcript %s for variant %s to have stop codon",
                transcript.name,
                variant)
            return None

        if not transcript.protein_sequence:
            logger.info(
                "Expected transcript %s for variant %s to have protein sequence",
                transcript.name,
                variant)
            return None

        sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
            variant=variant,
            transcript=transcript,
            context_size=context_size)

        if sequence_key is None:
            logger.info(
                "No sequence key for variant %s on transcript %s",
                variant,
                transcript.name)
            return None

        return cls.from_variant_and_transcript_and_sequence_key(
            variant=variant,
            transcript=transcript,
            sequence_key=sequence_key)
206

207

208
def reading_frame_to_offset(reading_frame_at_start_of_sequence):
    """
    Given a reading frame (how many nucleotides into a codon) at the
    start of a cDNA sequence, return the number of nucleotides which need
    to be trimmed to start on a complete codon.

    Parameters
    ----------
    reading_frame_at_start_of_sequence : int
        Must be 0, 1 or 2.

    Returns an int: 0 for frame 0, 2 for frame 1 and 1 for frame 2.

    Raises ValueError if the reading frame is negative or greater than 2.
    """
    if reading_frame_at_start_of_sequence < 0:
        raise ValueError("Reading frame can't be negative: %d" % (
            reading_frame_at_start_of_sequence,))
    elif reading_frame_at_start_of_sequence > 2:
        raise ValueError("Reading frame must be within 0 and 2, not %d" % (
            reading_frame_at_start_of_sequence,))
    # If we're 1 nucleotide into the codon then we need to shift
    # over two more to restore the ORF. Likewise, if we're 2 nucleotides in
    # then we have to shift over one more. The trailing modulo maps a
    # reading frame of 0 (already on a codon boundary) to an offset of 0.
    return (3 - reading_frame_at_start_of_sequence) % 3
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc