8833752981

Committed 25 Apr 2024 01:44PM UTC coverage: 83.019% (-13.0%) from 96.024%

Build # 8833752981

Build Type

push

github

Committed by

iskandr

Commit Message

fixed pyensembl list with lower min version

Run Details

2 of 2 new or added lines in 2 files covered. (100.0%)

211 existing lines in 11 files now uncovered.

1320 of 1590 relevant lines covered (83.02%)

2.49 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.56

/pyensembl/transcript.py

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from memoized_property import memoized_property

from .common import memoize
from .locus_with_genome import LocusWithGenome


class Transcript(LocusWithGenome):
    """
    Transcript encompasses the locus, exons, and sequence of a transcript.

    The sequence is fetched lazily: when we're constructing many
    Transcripts and not using their sequences, this avoids the
    memory/performance overhead of fetching and storing sequences
    from a FASTA file.
    """

    def __init__(
        self,
        transcript_id,
        transcript_name,
        contig,
        start,
        end,
        strand,
        biotype,
        gene_id,
        genome,
        support_level=None,
    ):
        """
        Build a Transcript from its identifiers and locus fields.

        The contig, start, end, strand, biotype and genome arguments are
        forwarded unchanged (as keywords) to LocusWithGenome.__init__.
        transcript_id, transcript_name, gene_id and support_level are
        stored on the instance under the same names.
        """
        # locus-level fields are handled entirely by the parent initializer
        locus_fields = {
            "contig": contig,
            "start": start,
            "end": end,
            "strand": strand,
            "biotype": biotype,
            "genome": genome,
        }
        LocusWithGenome.__init__(self, **locus_fields)

        # transcript-specific fields
        self.transcript_id, self.transcript_name = transcript_id, transcript_name
        self.gene_id = gene_id
        self.support_level = support_level

    @property
    def id(self):
        """
        Alias for transcript_id, kept for backward compatibility.

        Returns the value stored in self.transcript_id.
        """
        return self.transcript_id

    @property
    def name(self):
        """
        Alias for transcript_name, kept for backward compatibility.

        Returns the value stored in self.transcript_name.
        """
        return self.transcript_name

    def __str__(self):
        return (
            "Transcript(transcript_id='%s',"
            " transcript_name='%s',"
            " gene_id='%s',"
            " biotype='%s',"
            " contig='%s',"
            " start=%d,"
            " end=%d, strand='%s', genome='%s')"
        ) % (
            self.transcript_id,
            self.name,
            self.gene_id,
            self.biotype,
            self.contig,
            self.start,
            self.end,
            self.strand,
            self.genome.reference_name,
        )

    def __len__(self):
        """
        Length of a transcript is the sum of its exon lengths
        """
        return sum(len(exon) for exon in self.exons)

    def __eq__(self, other):
        return (
            other.__class__ is Transcript
            and self.id == other.id
            and self.genome == other.genome
        )

    def __hash__(self):
        """
        Hash on the transcript id alone; the genome, which __eq__ also
        compares, does not contribute to the hash.
        """
        return hash(self.id)

    def to_dict(self):
        """
        Return the dictionary produced by LocusWithGenome.to_dict with
        this transcript's id, name, gene id and support level added.
        """
        fields = LocusWithGenome.to_dict(self)
        transcript_fields = (
            ("transcript_id", self.transcript_id),
            ("transcript_name", self.name),
            ("gene_id", self.gene_id),
            ("support_level", self.support_level),
        )
        for key, value in transcript_fields:
            fields[key] = value
        return fields

    @property
    def gene(self):
        """
        Gene of this transcript, looked up on the genome by self.gene_id.
        """
        return self.genome.gene_by_id(self.gene_id)

    @property
    def gene_name(self):
        """
        Name of the gene returned by self.gene.
        """
        return self.gene.name

    @property
    def exons(self):
        # need to look up exon_number alongside ID since each exon may
        # appear in multiple transcripts and have a different exon number
        # in each transcript
        columns = ["exon_number", "exon_id"]
        exon_numbers_and_ids = self.db.query(
            columns, filter_column="transcript_id", filter_value=self.id, feature="exon"
        )

        # fill this list in its correct order (by exon_number) by using
        # the exon_number as a 1-based list offset
        exons = [None] * len(exon_numbers_and_ids)

        for exon_number, exon_id in exon_numbers_and_ids:
            exon = self.genome.exon_by_id(exon_id)
            if exon is None:
                raise ValueError(
                    "Missing exon %s for transcript %s" % (exon_number, self.id)
                )
            exon_number = int(exon_number)
            if exon_number < 1:
                raise ValueError("Invalid exon number: %s" % exon_number)
            elif exon_number > len(exons):
                raise ValueError(
                    "Invalid exon number: %s (max expected = %d)"
                    % (exon_number, len(exons))
                )

            # exon_number is 1-based, convert to list index by subtracting 1
            exon_idx = exon_number - 1
            exons[exon_idx] = exon
        return exons

    # possible annotations associated with transcripts; any feature name
    # outside this set is rejected by _transcript_feature_position_ranges
    _TRANSCRIPT_FEATURES = {"start_codon", "stop_codon", "UTR", "CDS"}

    @memoize
    def _transcript_feature_position_ranges(self, feature, required=True):
        """
        Query the start/end chromosomal position ranges annotated for
        a feature (such as the start codon) of this transcript.

        Parameters
        ----------
        feature : str
            Must be one of the names in _TRANSCRIPT_FEATURES, otherwise
            ValueError is raised.

        required : bool
            When true, an empty query result raises ValueError.

        Returns the rows produced by the database query.
        """
        if feature not in self._TRANSCRIPT_FEATURES:
            raise ValueError("Invalid transcript feature: %s" % feature)

        query_args = {
            "select_column_names": ["start", "end"],
            "filter_column": "transcript_id",
            "filter_value": self.id,
            "feature": feature,
        }
        ranges = self.db.query(**query_args)

        if required and len(ranges) == 0:
            raise ValueError(
                "Transcript %s does not contain feature %s" % (self.id, feature)
            )
        return ranges

    @memoize
    def _transcript_feature_positions(self, feature):
        """
        Get unique positions for feature, raise an error if feature is absent.

        Returns a list with every chromosomal position covered by the
        feature's ranges, in the order the ranges are returned by
        _transcript_feature_position_ranges.

        Raises ValueError if the feature is absent or if the same position
        is covered more than once.
        """
        ranges = self._transcript_feature_position_ranges(feature, required=True)
        results = []
        # positions already collected, kept in a set so the duplicate check
        # stays constant-time even for features spanning many positions
        seen = set()
        # a feature (such as a stop codon) may be split over multiple
        # contiguous ranges. Collect all the nucleotide positions into a
        # single list.
        for start, end in ranges:
            # since ranges are [inclusive, inclusive] and
            # Python ranges are [inclusive, exclusive) we have to increment
            # the end position
            for position in range(start, end + 1):
                if position in seen:
                    raise ValueError(
                        "Repeated position %d for %s" % (position, feature)
                    )
                seen.add(position)
                results.append(position)
        return results

    @memoize
    def _codon_positions(self, feature):
        """
        Chromosomal positions making up a codon feature.

        Parameters
        ----------
        feature : str
            Possible values are "start_codon" or "stop_codon"

        Returns list of three chromosomal positions; raises ValueError
        when the feature does not cover exactly three positions.
        """
        positions = self._transcript_feature_positions(feature)
        n_positions = len(positions)
        if n_positions == 3:
            return positions
        raise ValueError(
            "Expected 3 positions for %s of %s but got %d"
            % (feature, self.id, n_positions)
        )

    @memoized_property
    def contains_start_codon(self):
        """
        True when at least one start_codon range is annotated for this
        transcript.
        """
        # required=False: an absent start codon yields an empty result
        # instead of a ValueError
        n_ranges = len(
            self._transcript_feature_position_ranges("start_codon", required=False)
        )
        return n_ranges > 0

    @memoized_property
    def contains_stop_codon(self):
        """
        True when at least one stop_codon range is annotated for this
        transcript.
        """
        # required=False: an absent stop codon yields an empty result
        # instead of a ValueError
        n_ranges = len(
            self._transcript_feature_position_ranges("stop_codon", required=False)
        )
        return n_ranges > 0

    @memoized_property
    def start_codon_complete(self):
        """
        Does the start codon span 3 genomic positions?

        False whenever looking up the start codon positions raises
        ValueError.
        """
        try:
            self._codon_positions("start_codon")
        except ValueError:
            is_complete = False
        else:
            is_complete = True
        return is_complete

    @memoized_property
    def start_codon_positions(self):
        """
        Chromosomal positions of nucleotides in start codon.

        Delegates to _codon_positions("start_codon"), which raises
        ValueError unless exactly three positions are found.
        """
        return self._codon_positions("start_codon")

    @memoized_property
    def stop_codon_positions(self):
        """
        Chromosomal positions of nucleotides in stop codon.

        Delegates to _codon_positions("stop_codon"), which raises
        ValueError unless exactly three positions are found.
        """
        return self._codon_positions("stop_codon")

    @memoized_property
    def exon_intervals(self):
        """List of (start,end) tuples for each exon of this transcript,
        in the order specified by the 'exon_number' column of the
        exon table.
        """
        rows = self.db.query(
            select_column_names=["exon_number", "start", "end"],
            filter_column="transcript_id",
            filter_value=self.id,
            feature="exon",
        )
        intervals = [None] * len(rows)
        for row in rows:
            exon_number, start, end = row
            # exon numbers are 1-based, list slots 0-based
            # NOTE(review): unlike `exons`, the exon number is not
            # range-checked here, so a number below 1 indexes from the end
            slot = int(exon_number) - 1
            intervals[slot] = (start, end)
        return intervals

    def spliced_offset(self, position):
        """
        Convert from an absolute chromosomal position to the offset into
        this transcript"s spliced mRNA.

        Position must be inside some exon (otherwise raise exception).
        """
        if type(position) is not int:
            raise TypeError(
                "Position argument must be an integer, got %s : %s"
                % (position, type(position))
            )

        if position < self.start or position > self.end:
            raise ValueError(
                "Invalid position: %d (must be between %d and %d)"
                % (position, self.start, self.end)
            )

        # offset from beginning of unspliced transcript (including introns)
        unspliced_offset = self.offset(position)
        total_spliced_offset = 0

        # traverse exons in order of their appearance on the strand
        # Since absolute positions may decrease if on the negative strand,
        # we instead use unspliced offsets to get always increasing indices.
        #
        # Example:
        #
        # Exon Name:                exon 1                exon 2
        # Spliced Offset:           123456                789...
        # Intron vs. Exon: ...iiiiiieeeeeeiiiiiiiiiiiiiiiieeeeeeiiiiiiiiiii...
        for exon in self.exons:
            exon_unspliced_start, exon_unspliced_end = self.offset_range(
                exon.start, exon.end
            )
            # If the relative position is not within this exon, keep a running
            # total of the total exonic length-so-far.
            #
            # Otherwise, if the relative position is within an exon, get its
            # offset into that exon by subtracting the exon"s relative start
            # position from the relative position. Add that to the total exonic
            # length-so-far.
            if exon_unspliced_start <= unspliced_offset <= exon_unspliced_end:
                # all offsets are base 0, can be used as indices into
                # sequence string
                exon_offset = unspliced_offset - exon_unspliced_start
                return total_spliced_offset + exon_offset
            else:
                exon_length = len(exon)  # exon_end_position - exon_start_position + 1
                total_spliced_offset += exon_length
        raise ValueError(
            "Couldn't find position %d on any exon of %s" % (position, self.id)
        )

    @memoized_property
    def start_codon_unspliced_offsets(self):
        """
        Offsets of the start codon nucleotides measured from the start of
        the unspliced pre-mRNA transcript (one offset per position in
        self.start_codon_positions).
        """
        offsets = []
        for codon_position in self.start_codon_positions:
            offsets.append(self.offset(codon_position))
        return offsets

    @memoized_property
    def stop_codon_unspliced_offsets(self):
        """
        Offsets of the stop codon nucleotides measured from the start of
        the unspliced pre-mRNA transcript (one offset per position in
        self.stop_codon_positions).
        """
        offsets = []
        for codon_position in self.stop_codon_positions:
            offsets.append(self.offset(codon_position))
        return offsets

    def _contiguous_offsets(self, offsets):
        """
        Sorts the input list of integer offsets,
        ensures that values are contiguous.
        """
        offsets.sort()
        for i in range(len(offsets) - 1):
            if offsets[i] + 1 != offsets[i + 1]:
                raise ValueError("Offsets not contiguous: %s" % (offsets,))
        return offsets

    @memoized_property
    def start_codon_spliced_offsets(self):
        """
        Offsets of the start codon nucleotides measured from the start of
        the spliced mRNA transcript, sorted and checked for contiguity
        by _contiguous_offsets.
        """
        spliced = []
        for codon_position in self.start_codon_positions:
            spliced.append(self.spliced_offset(codon_position))
        return self._contiguous_offsets(spliced)

    @memoized_property
    def stop_codon_spliced_offsets(self):
        """
        Offsets of the stop codon nucleotides measured from the start of
        the spliced mRNA transcript, sorted and checked for contiguity
        by _contiguous_offsets.
        """
        spliced = []
        for codon_position in self.stop_codon_positions:
            spliced.append(self.spliced_offset(codon_position))
        return self._contiguous_offsets(spliced)

    @memoized_property
    def coding_sequence_position_ranges(self):
        """
        Return absolute chromosome position ranges for CDS fragments
        of this transcript.

        Raises ValueError if the transcript has no CDS entries, since the
        lookup is made with its default required=True.
        """
        return self._transcript_feature_position_ranges("CDS")

    @memoized_property
    def complete(self):
        """
        A transcript counts as complete when it has a start codon spanning
        three positions, a stop codon, and a coding sequence whose length
        is divisible by 3.
        """
        # checks run in the same order as a short-circuiting `and` chain
        if not (self.contains_start_codon and self.start_codon_complete):
            return False
        if not self.contains_stop_codon:
            return False
        coding = self.coding_sequence
        if coding is None:
            return False
        return len(coding) % 3 == 0

    @memoized_property
    def sequence(self):
        """
        Spliced cDNA sequence of transcript
        (includes 5' UTR, coding sequence, and 3' UTR)

        The sequence is looked up in self.genome.transcript_sequences.
        For IDs starting with "ENS" everything from the last "." onwards
        (presumably a version suffix) is dropped before the lookup.
        """
        full_id = self.transcript_id
        lookup_id = full_id.rsplit(".", 1)[0] if full_id.startswith("ENS") else full_id
        return self.genome.transcript_sequences.get(lookup_id)

    @memoized_property
    def first_start_codon_spliced_offset(self):
        """
        Smallest spliced-mRNA offset (introns excluded) among the
        nucleotides of the start codon.
        """
        return min(self.start_codon_spliced_offsets)

    @memoized_property
    def last_stop_codon_spliced_offset(self):
        """
        Largest spliced-mRNA offset (introns excluded) among the
        nucleotides of the stop codon.
        """
        return max(self.stop_codon_spliced_offsets)

    @memoized_property
    def coding_sequence(self):
        """
        cDNA coding sequence, running from the first nucleotide of the
        start codon through the last nucleotide of the stop codon
        (introns excluded).

        Returns None when self.sequence is None.
        """
        transcript_sequence = self.sequence
        if transcript_sequence is None:
            return None

        first = self.first_start_codon_spliced_offset
        last = self.last_stop_codon_spliced_offset

        # Example: with the start codon at offsets [3,4,5] and the stop
        # codon at offsets [20,21,22], first = 3 and last = 22. Slices
        # exclude their end index, hence the + 1.
        stop_exclusive = last + 1

        # pylint: disable=invalid-slice-index
        # TODO(tavi) Figure out why pylint is not happy with this slice
        return transcript_sequence[first:stop_exclusive]

    @memoized_property
    def five_prime_utr_sequence(self):
        """
        cDNA sequence of 5' UTR
        (untranslated region at the beginning of the transcript)

        Returns None when self.sequence is None, matching the behavior
        of coding_sequence, instead of failing on the slice.
        """
        sequence = self.sequence
        if sequence is None:
            return None
        # everything before the first start codon nucleotide
        # pylint: disable=invalid-slice-index
        # TODO(tavi) Figure out why pylint is not happy with this slice
        return sequence[: self.first_start_codon_spliced_offset]

    @memoized_property
    def three_prime_utr_sequence(self):
        """
        cDNA sequence of 3' UTR
        (untranslated region at the end of the transcript)

        Returns None when self.sequence is None, matching the behavior
        of coding_sequence, instead of failing on the slice.
        """
        sequence = self.sequence
        if sequence is None:
            return None
        # everything after the last stop codon nucleotide
        return sequence[self.last_stop_codon_spliced_offset + 1 :]

    @memoized_property
    def protein_id(self):
        """
        Distinct protein_id recorded on this transcript's CDS entries,
        or None when the query returns nothing.
        """
        row = self.db.query_one(
            select_column_names=["protein_id"],
            filter_column="transcript_id",
            filter_value=self.id,
            feature="CDS",
            distinct=True,
            required=False,
        )
        if not row:
            return None
        return row[0]

    @memoized_property
    def protein_sequence(self):
        """
        Protein sequence looked up in self.genome.protein_sequences by
        this transcript's protein_id; None when there is no protein_id.
        """
        protein_id = self.protein_id
        if not protein_id:
            return None
        return self.genome.protein_sequences.get(protein_id)

1	# Licensed under the Apache License, Version 2.0 (the "License");
2	# you may not use this file except in compliance with the License.
3	# You may obtain a copy of the License at
4	#
5	# http://www.apache.org/licenses/LICENSE-2.0
6	#
7	# Unless required by applicable law or agreed to in writing, software
8	# distributed under the License is distributed on an "AS IS" BASIS,
9	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10	# See the License for the specific language governing permissions and
11	# limitations under the License.
12
13	from memoized_property import memoized_property	3✔
14
15	from .common import memoize	3✔
16	from .locus_with_genome import LocusWithGenome	3✔
17
18
19	class Transcript(LocusWithGenome):	3✔
20	"""
21	Transcript encompasses the locus, exons, and sequence of a transcript.
22
23	Lazily fetches sequence in case we"re constructing many Transcripts
24	and not using the sequence, avoid the memory/performance overhead
25	of fetching and storing sequences from a FASTA file.
26	"""
27
28	def __init__(	3✔
29	self,
30	transcript_id,
31	transcript_name,
32	contig,
33	start,
34	end,
35	strand,
36	biotype,
37	gene_id,
38	genome,
39	support_level=None,
40	):
41	LocusWithGenome.__init__(	3✔
42	self,
43	contig=contig,
44	start=start,
45	end=end,
46	strand=strand,
47	biotype=biotype,
48	genome=genome,
49	)
50	self.transcript_id = transcript_id	3✔
51	self.transcript_name = transcript_name	3✔
52	self.gene_id = gene_id	3✔
53	self.support_level = support_level	3✔
54
55	@property	3✔
56	def id(self):	3✔
57	"""
58	Alias for transcript_id necessary for backward compatibility.
59	"""
60	return self.transcript_id	3✔
61
62	@property	3✔
63	def name(self):	3✔
64	"""
65	Alias for transcript_name necessary for backward compatibility.
66	"""
67	return self.transcript_name	3✔
68
69	def __str__(self):	3✔
70	return (	3✔
71	"Transcript(transcript_id='%s',"
72	" transcript_name='%s',"
73	" gene_id='%s',"
74	" biotype='%s',"
75	" contig='%s',"
76	" start=%d,"
77	" end=%d, strand='%s', genome='%s')"
78	) % (
79	self.transcript_id,
80	self.name,
81	self.gene_id,
82	self.biotype,
83	self.contig,
84	self.start,
85	self.end,
86	self.strand,
87	self.genome.reference_name,
88	)
89
90	def __len__(self):	3✔
91	"""
92	Length of a transcript is the sum of its exon lengths
93	"""
94	return sum(len(exon) for exon in self.exons)	3✔
95
96	def __eq__(self, other):	3✔
97	return (	3✔
98	other.__class__ is Transcript
99	and self.id == other.id
100	and self.genome == other.genome
101	)
102
103	def __hash__(self):	3✔
104	return hash(self.id)	3✔
105
106	def to_dict(self):	3✔
107	state_dict = LocusWithGenome.to_dict(self)	3✔
108	state_dict["transcript_id"] = self.transcript_id	3✔
109	state_dict["transcript_name"] = self.name	3✔
110	state_dict["gene_id"] = self.gene_id	3✔
111	state_dict["support_level"] = self.support_level	3✔
112	return state_dict	3✔
113
114	@property	3✔
115	def gene(self):	3✔
116	return self.genome.gene_by_id(self.gene_id)	3✔
117
118	@property	3✔
119	def gene_name(self):	3✔
UNCOV 120	return self.gene.name	×
121
122	@property	3✔
123	def exons(self):	3✔
124	# need to look up exon_number alongside ID since each exon may
125	# appear in multiple transcripts and have a different exon number
126	# in each transcript
127	columns = ["exon_number", "exon_id"]	3✔
128	exon_numbers_and_ids = self.db.query(	3✔
129	columns, filter_column="transcript_id", filter_value=self.id, feature="exon"
130	)
131
132	# fill this list in its correct order (by exon_number) by using
133	# the exon_number as a 1-based list offset
134	exons = [None] * len(exon_numbers_and_ids)	3✔
135
136	for exon_number, exon_id in exon_numbers_and_ids:	3✔
137	exon = self.genome.exon_by_id(exon_id)	3✔
138	if exon is None:	3✔
UNCOV 139	raise ValueError(	×
140	"Missing exon %s for transcript %s" % (exon_number, self.id)
141	)
142	exon_number = int(exon_number)	3✔
143	if exon_number < 1:	3✔
UNCOV 144	raise ValueError("Invalid exon number: %s" % exon_number)	×
145	elif exon_number > len(exons):	3✔
UNCOV 146	raise ValueError(	×
147	"Invalid exon number: %s (max expected = %d)"
148	% (exon_number, len(exons))
149	)
150
151	# exon_number is 1-based, convert to list index by subtracting 1
152	exon_idx = exon_number - 1	3✔
153	exons[exon_idx] = exon	3✔
154	return exons	3✔
155
156	# possible annotations associated with transcripts
157	_TRANSCRIPT_FEATURES = {"start_codon", "stop_codon", "UTR", "CDS"}	3✔
158
159	@memoize	3✔
160	def _transcript_feature_position_ranges(self, feature, required=True):	3✔
161	"""
162	Find start/end chromosomal position range of features
163	(such as start codon) for this transcript.
164	"""
165	if feature not in self._TRANSCRIPT_FEATURES:	3✔
UNCOV 166	raise ValueError("Invalid transcript feature: %s" % feature)	×
167
168	results = self.db.query(	3✔
169	select_column_names=["start", "end"],
170	filter_column="transcript_id",
171	filter_value=self.id,
172	feature=feature,
173	)
174
175	if required and len(results) == 0:	3✔
UNCOV 176	raise ValueError(	×
177	"Transcript %s does not contain feature %s" % (self.id, feature)
178	)
179	return results	3✔
180
181	@memoize	3✔
182	def _transcript_feature_positions(self, feature):	3✔
183	"""
184	Get unique positions for feature, raise an error if feature is absent.
185	"""
186	ranges = self._transcript_feature_position_ranges(feature, required=True)	3✔
187	results = []	3✔
188	# a feature (such as a stop codon), maybe be split over multiple
189	# contiguous ranges. Collect all the nucleotide positions into a
190	# single list.
191	for start, end in ranges:	3✔
192	# since ranges are [inclusive, inclusive] and
193	# Python ranges are [inclusive, exclusive) we have to increment
194	# the end position
195	for position in range(start, end + 1):	3✔
196	if position in results:	3✔
UNCOV 197	raise ValueError(	×
198	"Repeated position %d for %s" % (position, feature)
199	)
200	results.append(position)	3✔
201	return results	3✔
202
203	@memoize	3✔
204	def _codon_positions(self, feature):	3✔
205	"""
206	Parameters
207	----------
208	feature : str
209	Possible values are "start_codon" or "stop_codon"
210
211	Returns list of three chromosomal positions.
212	"""
213	results = self._transcript_feature_positions(feature)	3✔
214	if len(results) != 3:	3✔
UNCOV 215	raise ValueError(	×
216	"Expected 3 positions for %s of %s but got %d"
217	% (feature, self.id, len(results))
218	)
219	return results	3✔
220
221	@memoized_property	3✔
222	def contains_start_codon(self):	3✔
223	"""
224	Does this transcript have an annotated start_codon entry?
225	"""
226	start_codons = self._transcript_feature_position_ranges(	3✔
227	"start_codon", required=False
228	)
229	return len(start_codons) > 0	3✔
230
231	@memoized_property	3✔
232	def contains_stop_codon(self):	3✔
233	"""
234	Does this transcript have an annotated stop_codon entry?
235	"""
236	stop_codons = self._transcript_feature_position_ranges(	3✔
237	"stop_codon", required=False
238	)
239	return len(stop_codons) > 0	3✔
240
241	@memoized_property	3✔
242	def start_codon_complete(self):	3✔
243	"""
244	Does the start codon span 3 genomic positions?
245	"""
246	try:	3✔
247	self._codon_positions("start_codon")	3✔
UNCOV 248	except ValueError:	×
UNCOV 249	return False	×
250	return True	3✔
251
252	@memoized_property	3✔
253	def start_codon_positions(self):	3✔
254	"""
255	Chromosomal positions of nucleotides in start codon.
256	"""
257	return self._codon_positions("start_codon")	3✔
258
259	@memoized_property	3✔
260	def stop_codon_positions(self):	3✔
261	"""
262	Chromosomal positions of nucleotides in stop codon.
263	"""
264	return self._codon_positions("stop_codon")	3✔
265
266	@memoized_property	3✔
267	def exon_intervals(self):	3✔
268	"""List of (start,end) tuples for each exon of this transcript,
269	in the order specified by the 'exon_number' column of the
270	exon table.
271	"""
UNCOV 272	results = self.db.query(	×
273	select_column_names=["exon_number", "start", "end"],
274	filter_column="transcript_id",
275	filter_value=self.id,
276	feature="exon",
277	)
UNCOV 278	sorted_intervals = [None] * len(results)	×
UNCOV 279	for exon_number, start, end in results:	×
UNCOV 280	sorted_intervals[int(exon_number) - 1] = (start, end)	×
UNCOV 281	return sorted_intervals	×
282
283	def spliced_offset(self, position):	3✔
284	"""
285	Convert from an absolute chromosomal position to the offset into
286	this transcript"s spliced mRNA.
287
288	Position must be inside some exon (otherwise raise exception).
289	"""
290	if type(position) is not int:	3✔
UNCOV 291	raise TypeError(	×
292	"Position argument must be an integer, got %s : %s"
293	% (position, type(position))
294	)
295
296	if position < self.start or position > self.end:	3✔
UNCOV 297	raise ValueError(	×
298	"Invalid position: %d (must be between %d and %d)"
299	% (position, self.start, self.end)
300	)
301
302	# offset from beginning of unspliced transcript (including introns)
303	unspliced_offset = self.offset(position)	3✔
304	total_spliced_offset = 0	3✔
305
306	# traverse exons in order of their appearance on the strand
307	# Since absolute positions may decrease if on the negative strand,
308	# we instead use unspliced offsets to get always increasing indices.
309	#
310	# Example:
311	#
312	# Exon Name: exon 1 exon 2
313	# Spliced Offset: 123456 789...
314	# Intron vs. Exon: ...iiiiiieeeeeeiiiiiiiiiiiiiiiieeeeeeiiiiiiiiiii...
315	for exon in self.exons:	3✔
316	exon_unspliced_start, exon_unspliced_end = self.offset_range(	3✔
317	exon.start, exon.end
318	)
319	# If the relative position is not within this exon, keep a running
320	# total of the total exonic length-so-far.
321	#
322	# Otherwise, if the relative position is within an exon, get its
323	# offset into that exon by subtracting the exon"s relative start
324	# position from the relative position. Add that to the total exonic
325	# length-so-far.
326	if exon_unspliced_start <= unspliced_offset <= exon_unspliced_end:	3✔
327	# all offsets are base 0, can be used as indices into
328	# sequence string
329	exon_offset = unspliced_offset - exon_unspliced_start	3✔
330	return total_spliced_offset + exon_offset	3✔
331	else:
332	exon_length = len(exon) # exon_end_position - exon_start_position + 1	3✔
333	total_spliced_offset += exon_length	3✔
UNCOV 334	raise ValueError(	×
335	"Couldn't find position %d on any exon of %s" % (position, self.id)
336	)
337
338	@memoized_property	3✔
339	def start_codon_unspliced_offsets(self):	3✔
340	"""
341	Offsets from start of unspliced pre-mRNA transcript
342	of nucleotides in start codon.
343	"""
UNCOV 344	return [self.offset(position) for position in self.start_codon_positions]	×
345
346	@memoized_property	3✔
347	def stop_codon_unspliced_offsets(self):	3✔
348	"""
349	Offsets from start of unspliced pre-mRNA transcript
350	of nucleotides in stop codon.
351	"""
UNCOV 352	return [self.offset(position) for position in self.stop_codon_positions]	×
353
354	def _contiguous_offsets(self, offsets):	3✔
355	"""
356	Sorts the input list of integer offsets,
357	ensures that values are contiguous.
358	"""
359	offsets.sort()	3✔
360	for i in range(len(offsets) - 1):	3✔
361	if offsets[i] + 1 != offsets[i + 1]:	3✔
UNCOV 362	raise ValueError("Offsets not contiguous: %s" % (offsets,))	×
363	return offsets	3✔
364
365	@memoized_property	3✔
366	def start_codon_spliced_offsets(self):	3✔
367	"""
368	Offsets from start of spliced mRNA transcript
369	of nucleotides in start codon.
370	"""
371	offsets = [	3✔
372	self.spliced_offset(position) for position in self.start_codon_positions
373	]
374	return self._contiguous_offsets(offsets)	3✔
375
376	@memoized_property	3✔
377	def stop_codon_spliced_offsets(self):	3✔
378	"""
379	Offsets from start of spliced mRNA transcript
380	of nucleotides in stop codon.
381	"""
382	offsets = [	3✔
383	self.spliced_offset(position) for position in self.stop_codon_positions
384	]
385	return self._contiguous_offsets(offsets)	3✔
386
387	@memoized_property	3✔
388	def coding_sequence_position_ranges(self):	3✔
389	"""
390	Return absolute chromosome position ranges for CDS fragments
391	of this transcript
392	"""
UNCOV 393	return self._transcript_feature_position_ranges("CDS")	×
394
395	@memoized_property	3✔
396	def complete(self):	3✔
397	"""
398	Consider a transcript complete if it has start and stop codons and
399	a coding sequence whose length is divisible by 3
400	"""
401	return (	3✔
402	self.contains_start_codon
403	and self.start_codon_complete
404	and self.contains_stop_codon
405	and self.coding_sequence is not None
406	and len(self.coding_sequence) % 3 == 0
407	)
408
409	@memoized_property	3✔
410	def sequence(self):	3✔
411	"""
412	Spliced cDNA sequence of transcript
413	(includes 5" UTR, coding sequence, and 3" UTR)
414	"""
415	transcript_id = self.transcript_id	3✔
416	if transcript_id.startswith("ENS"):	3✔
417	transcript_id = transcript_id.rsplit(".", 1)[0]	3✔
418	return self.genome.transcript_sequences.get(transcript_id)	3✔
419
420	@memoized_property	3✔
421	def first_start_codon_spliced_offset(self):	3✔
422	"""
423	Offset of first nucleotide in start codon into the spliced mRNA
424	(excluding introns)
425	"""
426	start_offsets = self.start_codon_spliced_offsets	3✔
427	return min(start_offsets)	3✔
428
429	@memoized_property	3✔
430	def last_stop_codon_spliced_offset(self):	3✔
431	"""
432	Offset of last nucleotide in stop codon into the spliced mRNA
433	(excluding introns)
434	"""
435	stop_offsets = self.stop_codon_spliced_offsets	3✔
436	return max(stop_offsets)	3✔
437
438	@memoized_property	3✔
439	def coding_sequence(self):	3✔
440	"""
441	cDNA coding sequence (from start codon to stop codon, without
442	any introns)
443	"""
444	if self.sequence is None:	3✔
445	return None	×
446
447	start = self.first_start_codon_spliced_offset	3✔
448	end = self.last_stop_codon_spliced_offset	3✔
449
450	# If start codon is the at nucleotide offsets [3,4,5] and
451	# stop codon is at nucleotide offsets [20,21,22]
452	# then start = 3 and end = 22.
453	#
454	# Adding 1 to end since Python uses non-inclusive ends in slices/ranges.
455
456	# pylint: disable=invalid-slice-index
457	# TODO(tavi) Figure out pylint is not happy with this slice
458	return self.sequence[start : end + 1]	3✔
459
460	@memoized_property	3✔
461	def five_prime_utr_sequence(self):	3✔
462	"""
463	cDNA sequence of 5' UTR
464	(untranslated region at the beginning of the transcript)
465	"""
466	# pylint: disable=invalid-slice-index
467	# TODO(tavi) Figure out pylint is not happy with this slice
468	return self.sequence[: self.first_start_codon_spliced_offset]	3✔
469
470	@memoized_property	3✔
471	def three_prime_utr_sequence(self):	3✔
472	"""
473	cDNA sequence of 3' UTR
474	(untranslated region at the end of the transcript)
475	"""
476	return self.sequence[self.last_stop_codon_spliced_offset + 1 :]	3✔
477
478	@memoized_property	3✔
479	def protein_id(self):	3✔
480	result_tuple = self.db.query_one(	3✔
481	select_column_names=["protein_id"],
482	filter_column="transcript_id",
483	filter_value=self.id,
484	feature="CDS",
485	distinct=True,
486	required=False,
487	)
488	if result_tuple:	3✔
489	return result_tuple[0]	3✔
490	else:
491	return None	×
492
493	@memoized_property	3✔
494	def protein_sequence(self):	3✔
495	if self.protein_id:	3✔
496	return self.genome.protein_sequences.get(self.protein_id)	3✔
497	else:
498	return None	×

openvax / pyensembl / 8833752981

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous