11183330050

Committed 04 Oct 2024 04:14PM UTC coverage: 93.036% (-5.4%) from 98.446%

Build # 11183330050

Build Type

push

github

Committed by

iskandr

Commit Message

Merge branch 'master' of https://github.com/openvax/isovar

Run Details

2084 of 2240 relevant lines covered (93.04%)

2.79 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.65

/isovar/reference_coding_sequence_key.py

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function, division, absolute_import

from .logging import get_logger
from .reference_sequence_key import ReferenceSequenceKey
from .variant_helpers import interbase_range_affected_by_variant_on_transcript

logger = get_logger(__name__)


class ReferenceCodingSequenceKey(ReferenceSequenceKey):
    """
    ReferenceCodingSequenceKey includes all the fields of a ReferenceSequenceKey,
    and additionally tracks the reading frame and information about where the
    start codon and 5' UTR are relative to this sequence fragment.

    Instances are normally created through one of the two alternate
    constructors, `from_variant_and_transcript` or
    `from_variant_and_transcript_and_sequence_key`, both of which return
    None when the variant can't be placed in the coding sequence.
    """

    # additional fields on top of ReferenceSequenceKey
    __slots__ = [
        # if the reference context includes the 5' UTR then
        # this is the offset to the start codon, otherwise it's the
        # offset needed to get the first base of a codon
        "offset_to_first_complete_codon",
        # does this context overlap a start codon?
        "overlaps_start_codon",
        # does this context contain the whole trinucleotide start codon?
        "contains_start_codon",
        # does this context contain any UTR bases?
        "contains_five_prime_utr",
        # translation of complete codons in the reference context
        # before the variant
        "amino_acids_before_variant"
    ]

    def __init__(
            self,
            strand,
            sequence_before_variant_locus,
            sequence_at_variant_locus,
            sequence_after_variant_locus,
            offset_to_first_complete_codon,
            contains_start_codon,
            overlaps_start_codon,
            contains_five_prime_utr,
            amino_acids_before_variant):
        """
        Store the coding-sequence fields on top of those handled by
        ReferenceSequenceKey.

        The first four arguments are forwarded unchanged to
        ReferenceSequenceKey.__init__; the remaining five are stored on the
        attributes of the same name (see the comments on __slots__).
        """
        ReferenceSequenceKey.__init__(
            self,
            strand=strand,
            sequence_before_variant_locus=sequence_before_variant_locus,
            sequence_at_variant_locus=sequence_at_variant_locus,
            sequence_after_variant_locus=sequence_after_variant_locus)
        self.offset_to_first_complete_codon = offset_to_first_complete_codon
        self.overlaps_start_codon = overlaps_start_codon
        self.contains_start_codon = contains_start_codon
        self.contains_five_prime_utr = contains_five_prime_utr
        self.amino_acids_before_variant = amino_acids_before_variant

    @classmethod
    def from_variant_and_transcript_and_sequence_key(
            cls, variant, transcript, sequence_key):
        """
        Assuming that the transcript has a coding sequence, take a
        ReferenceSequenceKey (region of the transcript around the variant) and
        return a ReferenceCodingSequenceKey (or None).

        Parameters
        ----------
        variant : varcode.Variant

        transcript : pyensembl.Transcript
            Must have annotated start/stop codon offsets and a protein
            sequence; those attributes are read without being checked here.

        sequence_key : ReferenceSequenceKey
            Reference sequence around the variant on this transcript.

        Returns None if the variant starts before the end of the start codon
        or starts at/after the end of the stop codon.
        """
        # get the interbase range of offsets which capture all reference
        # bases modified by the variant; only the start of that range is
        # needed to position the variant relative to the coding sequence
        variant_start_offset, _ = \
            interbase_range_affected_by_variant_on_transcript(
                variant=variant,
                transcript=transcript)

        start_codon_idx = min(transcript.start_codon_spliced_offsets)

        # skip any variants which occur in the 5' UTR or overlap the start codon
        # since
        #   (1) UTR variants have unclear coding effects and
        #   (2) start-loss variants may result in a new start codon / reading
        #       frame but we're not sure which one!
        if variant_start_offset < start_codon_idx + 3:
            logger.info(
                "Skipping transcript %s for variant %s, must be after start codon",
                transcript.name,
                variant)
            return None

        stop_codon_idx = min(transcript.stop_codon_spliced_offsets)

        # skip variants which affect the 3' UTR of the transcript since
        # they don't have obvious coding effects on the protein sequence
        if variant_start_offset >= stop_codon_idx + 3:
            logger.info(
                "Skipping transcript %s for variant %s, occurs in 3' UTR",
                transcript.name,
                variant)
            return None

        n_prefix = len(sequence_key.sequence_before_variant_locus)
        prefix_start_idx = variant_start_offset - n_prefix
        n_bases_between_start_and_variant = variant_start_offset - start_codon_idx
        n_full_codons_before_variant = n_bases_between_start_and_variant // 3

        # if the sequence before the variant contains more bases than the
        # distance to the start codon, then by definition it must contain
        # some untranslated bases
        contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant)
        # allows for the possibility that the first base in the sequence might
        # be the first nucleotide of the start codon
        contains_start_codon = (n_prefix >= n_bases_between_start_and_variant)
        # the sequence context might only include the 2nd or 3rd bases of
        # the start codon
        overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3)

        if contains_start_codon:
            # the prefix reaches back to (or past) the start codon, so the
            # first complete codon is the start codon itself and every codon
            # between it and the variant is part of the prefix
            offset_to_first_complete_codon = start_codon_idx - prefix_start_idx
            amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant]
        else:
            # the prefix begins somewhere inside the coding sequence, so
            # figure out how far into a codon its first base falls and how
            # many bases must be skipped to reach the next codon boundary
            reading_frame = (prefix_start_idx - start_codon_idx) % 3
            offset_to_first_complete_codon = reading_frame_to_offset(reading_frame)
            # a prefix shorter than the trim offset holds no complete codons;
            # clamp at zero rather than carrying a negative count (the slice
            # below is empty in either case)
            n_codons_in_prefix = max(
                0, (n_prefix - offset_to_first_complete_codon) // 3)
            amino_acids_before_variant = transcript.protein_sequence[
                n_full_codons_before_variant - n_codons_in_prefix:
                n_full_codons_before_variant]

        return cls(
            strand=sequence_key.strand,
            sequence_before_variant_locus=sequence_key.sequence_before_variant_locus,
            sequence_at_variant_locus=sequence_key.sequence_at_variant_locus,
            sequence_after_variant_locus=sequence_key.sequence_after_variant_locus,
            offset_to_first_complete_codon=offset_to_first_complete_codon,
            contains_start_codon=contains_start_codon,
            overlaps_start_codon=overlaps_start_codon,
            contains_five_prime_utr=contains_five_prime_utr,
            amino_acids_before_variant=amino_acids_before_variant)

    @classmethod
    def from_variant_and_transcript(
            cls,
            variant,
            transcript,
            context_size):
        """
        Extracts the reference sequence around a variant locus on a particular
        transcript and determines the reading frame at the start of that
        sequence context.

        Parameters
        ----------
        variant : varcode.Variant

        transcript : pyensembl.Transcript

        context_size : int
            Passed through to ReferenceSequenceKey.from_variant_and_transcript

        Returns ReferenceCodingSequenceKey object or None if the transcript
        lacks an annotated start codon, stop codon or protein sequence, if no
        ReferenceSequenceKey could be created for the variant, or if the
        variant does not fall between the start codon and the end of the
        stop codon.
        """
        if not transcript.contains_start_codon:
            logger.info(
                "Expected transcript %s for variant %s to have start codon",
                transcript.name,
                variant)
            return None

        if not transcript.contains_stop_codon:
            logger.info(
                "Expected transcript %s for variant %s to have stop codon",
                transcript.name,
                variant)
            return None

        if not transcript.protein_sequence:
            logger.info(
                "Expected transcript %s for variant %s to have protein sequence",
                transcript.name,
                variant)
            return None

        sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
            variant=variant,
            transcript=transcript,
            context_size=context_size)

        if sequence_key is None:
            logger.info(
                "No sequence key for variant %s on transcript %s",
                variant,
                transcript.name)
            return None

        return cls.from_variant_and_transcript_and_sequence_key(
            variant=variant,
            transcript=transcript,
            sequence_key=sequence_key)


def reading_frame_to_offset(reading_frame_at_start_of_sequence):
    """
    Convert a reading frame into the number of leading nucleotides to trim.

    The reading frame says how many nucleotides into a codon the first base
    of a cDNA sequence lies; the result is how many bases have to be dropped
    from the front of that sequence so that it begins on a codon boundary.

    Parameters
    ----------

    reading_frame_at_start_of_sequence : int
        Expected to be 0, 1 or 2.

    Returns an int

    Raises ValueError if the reading frame is negative or greater than 2.
    """
    frame = reading_frame_at_start_of_sequence

    # reject anything outside of the three possible codon positions
    if frame < 0:
        message = "Reading frame can't be negative: %d" % (frame,)
        raise ValueError(message)
    if frame > 2:
        message = "Reading frame must be within 0 and 2, not %d" % (frame,)
        raise ValueError(message)

    # Being 1 base into a codon leaves 2 bases until the next codon starts,
    # being 2 bases in leaves 1. Already sitting on a boundary (frame 0)
    # gives 3 here, which the modulo folds back to 0.
    n_bases_to_next_codon = 3 - frame
    return n_bases_to_next_codon % 3

1	# Licensed under the Apache License, Version 2.0 (the "License");
2	# you may not use this file except in compliance with the License.
3	# You may obtain a copy of the License at
4	#
5	# http://www.apache.org/licenses/LICENSE-2.0
6	#
7	# Unless required by applicable law or agreed to in writing, software
8	# distributed under the License is distributed on an "AS IS" BASIS,
9	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10	# See the License for the specific language governing permissions and
11	# limitations under the License.
12
13	from __future__ import print_function, division, absolute_import	3✔
14
15	from .logging import get_logger	3✔
16	from .reference_sequence_key import ReferenceSequenceKey	3✔
17	from .variant_helpers import interbase_range_affected_by_variant_on_transcript	3✔
18
19	logger = get_logger(__name__)	3✔
20
21
22	class ReferenceCodingSequenceKey(ReferenceSequenceKey):	3✔
23	"""
24	ReferenceCodingSequenceKey includes all the fields of a ReferenceSequenceKey,
25	and additionally tracks the reading frame and information about where the
26	start codon and 5' UTR are relative to this sequence fragment.
27	"""
28
29	# additional fields on top of ReferenceSequenceKey
30	__slots__ = [	3✔
31	# if the reference context includes the 5' UTR then
32	# this is the offset to the start codon, otherwise it's the
33	# offset needed to get the first base of a codon
34	"offset_to_first_complete_codon",
35	# does this context overlap a start codon?
36	"overlaps_start_codon",
37	# does this context contain the whole trinucleotide start codon?
38	"contains_start_codon",
39	# does this context contain any UTR bases?
40	"contains_five_prime_utr",
41	# translation of complete codons in the reference context
42	# before the variant
43	"amino_acids_before_variant"
44	]
45
46	def __init__(	3✔
47	self,
48	strand,
49	sequence_before_variant_locus,
50	sequence_at_variant_locus,
51	sequence_after_variant_locus,
52	offset_to_first_complete_codon,
53	contains_start_codon,
54	overlaps_start_codon,
55	contains_five_prime_utr,
56	amino_acids_before_variant):
57	ReferenceSequenceKey.__init__(	3✔
58	self,
59	strand=strand,
60	sequence_before_variant_locus=sequence_before_variant_locus,
61	sequence_at_variant_locus=sequence_at_variant_locus,
62	sequence_after_variant_locus=sequence_after_variant_locus)
63	self.offset_to_first_complete_codon = offset_to_first_complete_codon	3✔
64	self.overlaps_start_codon = overlaps_start_codon	3✔
65	self.contains_start_codon = contains_start_codon	3✔
66	self.contains_five_prime_utr = contains_five_prime_utr	3✔
67	self.amino_acids_before_variant = amino_acids_before_variant	3✔
68
69	@classmethod	3✔
70	def from_variant_and_transcript_and_sequence_key(	3✔
71	cls, variant, transcript, sequence_key):
72	"""
73	Assuming that the transcript has a coding sequence, take a
74	ReferenceSequenceKey (region of the transcript around the variant) and
75	return a ReferenceCodingSequenceKey (or None).
76	"""
77	# get the interbase range of offsets which capture all reference
78	# bases modified by the variant
79	variant_start_offset, variant_end_offset = \	3✔
80	interbase_range_affected_by_variant_on_transcript(
81	variant=variant,
82	transcript=transcript)
83
84	start_codon_idx = min(transcript.start_codon_spliced_offsets)	3✔
85
86	# skip any variants which occur in the 5' UTR or overlap the start codon
87	# since
88	# (1) UTR variants have unclear coding effects and
89	# (2) start-loss variants may result in a new start codon / reading
90	# frame but we're not sure which one!
91	if variant_start_offset < start_codon_idx + 3:	3✔
92	logger.info(	3✔
93	"Skipping transcript %s for variant %s, must be after start codon",
94	transcript.name,
95	variant)
96	return None	3✔
97
98	stop_codon_idx = min(transcript.stop_codon_spliced_offsets)	3✔
99
100	# skip variants which affect the 3' UTR of the transcript since
101	# they don't have obvious coding effects on the protein sequence
102	if variant_start_offset >= stop_codon_idx + 3:	3✔
103	logger.info(	×
104	"Skipping transcript %s for variant %s, occurs in 3' UTR",
105	transcript,
106	variant)
107	return None	×
108
109	n_prefix = len(sequence_key.sequence_before_variant_locus)	3✔
110	prefix_start_idx = variant_start_offset - n_prefix	3✔
111	n_bases_between_start_and_variant = variant_start_offset - start_codon_idx	3✔
112	n_full_codons_before_variant = n_bases_between_start_and_variant // 3	3✔
113
114	# if the sequence before the variant contains more bases than the
115	# distance to the start codon, then by definition it must contain
116	# some untranslated bases
117	contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant)	3✔
118	# allows for the possibility that the first base in the sequence might
119	# be the first nucleotide of the start codon
120	contains_start_codon = (n_prefix >= n_bases_between_start_and_variant)	3✔
121	# the sequence context might only include the 2nd or 3rd bases of
122	# the start codon
123	overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3)	3✔
124
125	if contains_start_codon:	3✔
126	offset_to_first_complete_codon = start_codon_idx - prefix_start_idx	3✔
127	amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant]	3✔
128	else:
129	reading_frame = (prefix_start_idx - start_codon_idx) % 3	3✔
130	offset_to_first_complete_codon = reading_frame_to_offset(reading_frame)	3✔
131	n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3	3✔
132	amino_acids_before_variant = transcript.protein_sequence[	3✔
133	n_full_codons_before_variant - n_codons_in_prefix:
134	n_full_codons_before_variant]
135
136	return cls(	3✔
137	strand=sequence_key.strand,
138	sequence_before_variant_locus=sequence_key.sequence_before_variant_locus,
139	sequence_at_variant_locus=sequence_key.sequence_at_variant_locus,
140	sequence_after_variant_locus=sequence_key.sequence_after_variant_locus,
141	offset_to_first_complete_codon=offset_to_first_complete_codon,
142	contains_start_codon=contains_start_codon,
143	overlaps_start_codon=overlaps_start_codon,
144	contains_five_prime_utr=contains_five_prime_utr,
145	amino_acids_before_variant=amino_acids_before_variant)
146
147	@classmethod	3✔
148	def from_variant_and_transcript(	3✔
149	cls,
150	variant,
151	transcript,
152	context_size):
153	"""
154	Extracts the reference sequence around a variant locus on a particular
155	transcript and determines the reading frame at the start of that
156	sequence context.
157
158	Parameters
159	----------
160	variant : varcode.Variant
161
162	transcript : pyensembl.Transcript
163
164	context_size : int
165
166	Returns SequenceKeyWithReadingFrame object or None if Transcript lacks
167	coding sequence, protein sequence or annotated start/stop codons.
168	"""
169	if not transcript.contains_start_codon:	3✔
170	logger.info(	×
171	"Expected transcript %s for variant %s to have start codon",
172	transcript.name,
173	variant)
174	return None	×
175
176	if not transcript.contains_stop_codon:	3✔
177	logger.info(	×
178	"Expected transcript %s for variant %s to have stop codon",
179	transcript.name,
180	variant)
181	return None	×
182
183	if not transcript.protein_sequence:	3✔
184	logger.info(	×
185	"Expected transript %s for variant %s to have protein sequence",
186	transcript.name,
187	variant)
188	return None	×
189
190	sequence_key = ReferenceSequenceKey.from_variant_and_transcript(	3✔
191	variant=variant,
192	transcript=transcript,
193	context_size=context_size)
194
195	if sequence_key is None:	3✔
196	logger.info(	×
197	"No sequence key for variant %s on transcript %s",
198	variant,
199	transcript.name)
200	return None	×
201
202	return cls.from_variant_and_transcript_and_sequence_key(	3✔
203	variant=variant,
204	transcript=transcript,
205	sequence_key=sequence_key)
206
207
208	def reading_frame_to_offset(reading_frame_at_start_of_sequence):	3✔
209	"""
210	Given a reading frame (how many nucleotides into a codon) at the
211	start of a cDNA sequence, return the number of nucleotides which need
212	to be trimmed to start on a complete codon.
213
214	Parameters
215	----------
216
217	reading_frame_at_start_of_sequence : int
218
219	Returns an int
220	"""
221	if reading_frame_at_start_of_sequence < 0:	3✔
222	raise ValueError("Reading frame can't be negative: %d" % (	×
223	reading_frame_at_start_of_sequence,))
224	elif reading_frame_at_start_of_sequence > 2:	3✔
225	raise ValueError("Reading frame must be within 0 and 2, not %d" % (	×
226	reading_frame_at_start_of_sequence,))
227	# If we're 1 nucleotide into the codon then we need to shift
228	# over two more to restore the ORF. Likewise, if we're 2 nucleotides in
229	# then we have to shift over one more.
230	return (3 - reading_frame_at_start_of_sequence) % 3	3✔

openvax / isovar / 11183330050

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous