• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

openvax / pyensembl / 8833752981

25 Apr 2024 01:44PM UTC coverage: 83.019% (-13.0%) from 96.024%
8833752981

push

github

iskandr
fixed pyensembl list with lower min version

2 of 2 new or added lines in 2 files covered. (100.0%)

211 existing lines in 11 files now uncovered.

1320 of 1590 relevant lines covered (83.02%)

2.49 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.56
/pyensembl/transcript.py
1
# Licensed under the Apache License, Version 2.0 (the "License");
2
# you may not use this file except in compliance with the License.
3
# You may obtain a copy of the License at
4
#
5
#     http://www.apache.org/licenses/LICENSE-2.0
6
#
7
# Unless required by applicable law or agreed to in writing, software
8
# distributed under the License is distributed on an "AS IS" BASIS,
9
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
# See the License for the specific language governing permissions and
11
# limitations under the License.
12

13
from memoized_property import memoized_property
3✔
14

15
from .common import memoize
3✔
16
from .locus_with_genome import LocusWithGenome
3✔
17

18

19
class Transcript(LocusWithGenome):
3✔
20
    """
21
    Transcript encompasses the locus, exons, and sequence of a transcript.
22

23
    Lazily fetches sequence in case we're constructing many Transcripts
24
    and not using the sequence, avoid the memory/performance overhead
25
    of fetching and storing sequences from a FASTA file.
26
    """
27

28
    def __init__(
        self,
        transcript_id,
        transcript_name,
        contig,
        start,
        end,
        strand,
        biotype,
        gene_id,
        genome,
        support_level=None,
    ):
        """
        Build a transcript from its identifiers and locus description.

        The locus arguments (contig, start, end, strand, biotype) and the
        genome are forwarded to LocusWithGenome.__init__; the transcript ID,
        transcript name, gene ID and support level are stored on the instance.
        """
        locus_kwargs = {
            "contig": contig,
            "start": start,
            "end": end,
            "strand": strand,
            "biotype": biotype,
            "genome": genome,
        }
        LocusWithGenome.__init__(self, **locus_kwargs)

        # transcript-specific fields which the base class doesn't know about
        self.transcript_id = transcript_id
        self.transcript_name = transcript_name
        self.gene_id = gene_id
        self.support_level = support_level
54

55
    @property
3✔
56
    def id(self):
3✔
57
        """
58
        Alias for transcript_id necessary for backward compatibility.
59
        """
60
        return self.transcript_id
3✔
61

62
    @property
3✔
63
    def name(self):
3✔
64
        """
65
        Alias for transcript_name necessary for backward compatibility.
66
        """
67
        return self.transcript_name
3✔
68

69
    def __str__(self):
3✔
70
        return (
3✔
71
            "Transcript(transcript_id='%s',"
72
            " transcript_name='%s',"
73
            " gene_id='%s',"
74
            " biotype='%s',"
75
            " contig='%s',"
76
            " start=%d,"
77
            " end=%d, strand='%s', genome='%s')"
78
        ) % (
79
            self.transcript_id,
80
            self.name,
81
            self.gene_id,
82
            self.biotype,
83
            self.contig,
84
            self.start,
85
            self.end,
86
            self.strand,
87
            self.genome.reference_name,
88
        )
89

90
    def __len__(self):
3✔
91
        """
92
        Length of a transcript is the sum of its exon lengths
93
        """
94
        return sum(len(exon) for exon in self.exons)
3✔
95

96
    def __eq__(self, other):
3✔
97
        return (
3✔
98
            other.__class__ is Transcript
99
            and self.id == other.id
100
            and self.genome == other.genome
101
        )
102

103
    def __hash__(self):
3✔
104
        return hash(self.id)
3✔
105

106
    def to_dict(self):
        """
        Return the dictionary produced by LocusWithGenome.to_dict, extended
        with the transcript ID, transcript name, gene ID and support level.
        """
        state_dict = LocusWithGenome.to_dict(self)
        transcript_fields = (
            ("transcript_id", self.transcript_id),
            ("transcript_name", self.name),
            ("gene_id", self.gene_id),
            ("support_level", self.support_level),
        )
        for key, value in transcript_fields:
            state_dict[key] = value
        return state_dict
113

114
    @property
3✔
115
    def gene(self):
3✔
116
        return self.genome.gene_by_id(self.gene_id)
3✔
117

118
    @property
3✔
119
    def gene_name(self):
3✔
UNCOV
120
        return self.gene.name
×
121

122
    @property
3✔
123
    def exons(self):
3✔
124
        # need to look up exon_number alongside ID since each exon may
125
        # appear in multiple transcripts and have a different exon number
126
        # in each transcript
127
        columns = ["exon_number", "exon_id"]
3✔
128
        exon_numbers_and_ids = self.db.query(
3✔
129
            columns, filter_column="transcript_id", filter_value=self.id, feature="exon"
130
        )
131

132
        # fill this list in its correct order (by exon_number) by using
133
        # the exon_number as a 1-based list offset
134
        exons = [None] * len(exon_numbers_and_ids)
3✔
135

136
        for exon_number, exon_id in exon_numbers_and_ids:
3✔
137
            exon = self.genome.exon_by_id(exon_id)
3✔
138
            if exon is None:
3✔
UNCOV
139
                raise ValueError(
×
140
                    "Missing exon %s for transcript %s" % (exon_number, self.id)
141
                )
142
            exon_number = int(exon_number)
3✔
143
            if exon_number < 1:
3✔
UNCOV
144
                raise ValueError("Invalid exon number: %s" % exon_number)
×
145
            elif exon_number > len(exons):
3✔
UNCOV
146
                raise ValueError(
×
147
                    "Invalid exon number: %s (max expected = %d)"
148
                    % (exon_number, len(exons))
149
                )
150

151
            # exon_number is 1-based, convert to list index by subtracting 1
152
            exon_idx = exon_number - 1
3✔
153
            exons[exon_idx] = exon
3✔
154
        return exons
3✔
155

156
    # feature names accepted by _transcript_feature_position_ranges
157
    _TRANSCRIPT_FEATURES = {"start_codon", "stop_codon", "UTR", "CDS"}
3✔
158

159
    @memoize
    def _transcript_feature_position_ranges(self, feature, required=True):
        """
        Query the (start, end) chromosomal ranges of a feature
        (such as a start codon) annotated on this transcript.

        Raises ValueError if the feature name isn't one of
        _TRANSCRIPT_FEATURES, or if `required` is true and the query
        returns no rows.
        """
        if feature not in self._TRANSCRIPT_FEATURES:
            raise ValueError("Invalid transcript feature: %s" % feature)

        query_kwargs = dict(
            select_column_names=["start", "end"],
            filter_column="transcript_id",
            filter_value=self.id,
            feature=feature,
        )
        position_ranges = self.db.query(**query_kwargs)

        if required:
            if len(position_ranges) == 0:
                raise ValueError(
                    "Transcript %s does not contain feature %s" % (self.id, feature)
                )
        return position_ranges
180

181
    @memoize
    def _transcript_feature_positions(self, feature):
        """
        Get unique positions for feature, raise an error if feature is absent.

        Returns a list of chromosomal positions, in the order in which they
        occur in the ranges returned by _transcript_feature_position_ranges.

        Raises ValueError if the feature is absent from this transcript or
        if the same position occurs more than once.
        """
        ranges = self._transcript_feature_position_ranges(feature, required=True)
        results = []
        # membership is checked against a set so that long features (e.g. CDS)
        # don't pay for a linear scan of `results` at every position
        seen = set()
        # a feature (such as a stop codon) may be split over multiple
        # contiguous ranges. Collect all the nucleotide positions into a
        # single list.
        for start, end in ranges:
            # since ranges are [inclusive, inclusive] and
            # Python ranges are [inclusive, exclusive) we have to increment
            # the end position
            for position in range(start, end + 1):
                if position in seen:
                    raise ValueError(
                        "Repeated position %d for %s" % (position, feature)
                    )
                seen.add(position)
                results.append(position)
        return results
202

203
    @memoize
    def _codon_positions(self, feature):
        """
        Chromosomal positions of a codon feature.

        Parameters
        ----------
        feature : str
            Possible values are "start_codon" or "stop_codon"

        Returns list of three chromosomal positions, raises ValueError
        if the feature doesn't cover exactly three positions.
        """
        positions = self._transcript_feature_positions(feature)
        n_positions = len(positions)
        if n_positions == 3:
            return positions
        raise ValueError(
            "Expected 3 positions for %s of %s but got %d"
            % (feature, self.id, n_positions)
        )
220

221
    @memoized_property
    def contains_start_codon(self):
        """
        True if at least one start_codon range is annotated for this
        transcript, False otherwise.
        """
        ranges = self._transcript_feature_position_ranges(
            "start_codon", required=False
        )
        n_ranges = len(ranges)
        return n_ranges > 0
230

231
    @memoized_property
    def contains_stop_codon(self):
        """
        True if at least one stop_codon range is annotated for this
        transcript, False otherwise.
        """
        ranges = self._transcript_feature_position_ranges(
            "stop_codon", required=False
        )
        n_ranges = len(ranges)
        return n_ranges > 0
240

241
    @memoized_property
    def start_codon_complete(self):
        """
        True if looking up the start codon positions succeeds (i.e. the
        start codon is annotated and spans 3 genomic positions), False if
        that lookup raises ValueError.
        """
        try:
            self._codon_positions("start_codon")
        except ValueError:
            return False
        else:
            return True
251

252
    @memoized_property
    def start_codon_positions(self):
        """
        List of the chromosomal positions of the start codon's nucleotides.
        """
        positions = self._codon_positions("start_codon")
        return positions
258

259
    @memoized_property
    def stop_codon_positions(self):
        """
        List of the chromosomal positions of the stop codon's nucleotides.
        """
        positions = self._codon_positions("stop_codon")
        return positions
265

266
    @memoized_property
    def exon_intervals(self):
        """List of (start, end) tuples, one per exon of this transcript,
        placed at index (exon_number - 1) so that the list follows the
        'exon_number' column of the exon table.
        """
        rows = self.db.query(
            select_column_names=["exon_number", "start", "end"],
            filter_column="transcript_id",
            filter_value=self.id,
            feature="exon",
        )
        intervals = [None] * len(rows)
        for raw_exon_number, exon_start, exon_end in rows:
            # exon numbers are 1-based
            index = int(raw_exon_number) - 1
            intervals[index] = (exon_start, exon_end)
        return intervals
282

283
    def spliced_offset(self, position):
3✔
284
        """
285
        Convert from an absolute chromosomal position to the offset into
286
        this transcript"s spliced mRNA.
287

288
        Position must be inside some exon (otherwise raise exception).
289
        """
290
        if type(position) is not int:
3✔
UNCOV
291
            raise TypeError(
×
292
                "Position argument must be an integer, got %s : %s"
293
                % (position, type(position))
294
            )
295

296
        if position < self.start or position > self.end:
3✔
UNCOV
297
            raise ValueError(
×
298
                "Invalid position: %d (must be between %d and %d)"
299
                % (position, self.start, self.end)
300
            )
301

302
        # offset from beginning of unspliced transcript (including introns)
303
        unspliced_offset = self.offset(position)
3✔
304
        total_spliced_offset = 0
3✔
305

306
        # traverse exons in order of their appearance on the strand
307
        # Since absolute positions may decrease if on the negative strand,
308
        # we instead use unspliced offsets to get always increasing indices.
309
        #
310
        # Example:
311
        #
312
        # Exon Name:                exon 1                exon 2
313
        # Spliced Offset:           123456                789...
314
        # Intron vs. Exon: ...iiiiiieeeeeeiiiiiiiiiiiiiiiieeeeeeiiiiiiiiiii...
315
        for exon in self.exons:
3✔
316
            exon_unspliced_start, exon_unspliced_end = self.offset_range(
3✔
317
                exon.start, exon.end
318
            )
319
            # If the relative position is not within this exon, keep a running
320
            # total of the total exonic length-so-far.
321
            #
322
            # Otherwise, if the relative position is within an exon, get its
323
            # offset into that exon by subtracting the exon"s relative start
324
            # position from the relative position. Add that to the total exonic
325
            # length-so-far.
326
            if exon_unspliced_start <= unspliced_offset <= exon_unspliced_end:
3✔
327
                # all offsets are base 0, can be used as indices into
328
                # sequence string
329
                exon_offset = unspliced_offset - exon_unspliced_start
3✔
330
                return total_spliced_offset + exon_offset
3✔
331
            else:
332
                exon_length = len(exon)  # exon_end_position - exon_start_position + 1
3✔
333
                total_spliced_offset += exon_length
3✔
UNCOV
334
        raise ValueError(
×
335
            "Couldn't find position %d on any exon of %s" % (position, self.id)
336
        )
337

338
    @memoized_property
    def start_codon_unspliced_offsets(self):
        """
        For each nucleotide of the start codon, its offset from the start
        of the unspliced pre-mRNA transcript.
        """
        offsets = []
        for position in self.start_codon_positions:
            offsets.append(self.offset(position))
        return offsets
345

346
    @memoized_property
    def stop_codon_unspliced_offsets(self):
        """
        For each nucleotide of the stop codon, its offset from the start
        of the unspliced pre-mRNA transcript.
        """
        offsets = []
        for position in self.stop_codon_positions:
            offsets.append(self.offset(position))
        return offsets
353

354
    def _contiguous_offsets(self, offsets):
3✔
355
        """
356
        Sorts the input list of integer offsets,
357
        ensures that values are contiguous.
358
        """
359
        offsets.sort()
3✔
360
        for i in range(len(offsets) - 1):
3✔
361
            if offsets[i] + 1 != offsets[i + 1]:
3✔
UNCOV
362
                raise ValueError("Offsets not contiguous: %s" % (offsets,))
×
363
        return offsets
3✔
364

365
    @memoized_property
    def start_codon_spliced_offsets(self):
        """
        For each nucleotide of the start codon, its offset from the start
        of the spliced mRNA transcript. The offsets are checked for
        contiguity by _contiguous_offsets before being returned.
        """
        spliced = []
        for position in self.start_codon_positions:
            spliced.append(self.spliced_offset(position))
        return self._contiguous_offsets(spliced)
375

376
    @memoized_property
    def stop_codon_spliced_offsets(self):
        """
        For each nucleotide of the stop codon, its offset from the start
        of the spliced mRNA transcript. The offsets are checked for
        contiguity by _contiguous_offsets before being returned.
        """
        spliced = []
        for position in self.stop_codon_positions:
            spliced.append(self.spliced_offset(position))
        return self._contiguous_offsets(spliced)
386

387
    @memoized_property
    def coding_sequence_position_ranges(self):
        """
        Chromosomal (start, end) ranges of the CDS fragments of this
        transcript. Raises ValueError if no CDS feature is annotated.
        """
        cds_ranges = self._transcript_feature_position_ranges("CDS")
        return cds_ranges
394

395
    @memoized_property
    def complete(self):
        """
        A transcript counts as complete when it has an annotated start codon
        spanning 3 positions, an annotated stop codon, and a coding sequence
        whose length is a multiple of 3.
        """
        # the checks run in the same order as the original boolean chain so
        # that later (more expensive) lookups are skipped once one fails
        if not self.contains_start_codon:
            return False
        if not self.start_codon_complete:
            return False
        if not self.contains_stop_codon:
            return False
        coding_sequence = self.coding_sequence
        if coding_sequence is None:
            return False
        return len(coding_sequence) % 3 == 0
408

409
    @memoized_property
    def sequence(self):
        """
        Spliced cDNA sequence of transcript
        (includes 5' UTR, coding sequence, and 3' UTR), looked up in the
        genome's transcript_sequences.
        """
        lookup_id = self.transcript_id
        # for IDs starting with "ENS", drop whatever follows the last "."
        # before the lookup
        is_ensembl_id = lookup_id.startswith("ENS")
        if is_ensembl_id:
            lookup_id = lookup_id.rsplit(".", 1)[0]
        transcript_sequences = self.genome.transcript_sequences
        return transcript_sequences.get(lookup_id)
419

420
    @memoized_property
    def first_start_codon_spliced_offset(self):
        """
        Smallest spliced-mRNA offset (introns excluded) among the
        nucleotides of the start codon.
        """
        return min(self.start_codon_spliced_offsets)
428

429
    @memoized_property
    def last_stop_codon_spliced_offset(self):
        """
        Largest spliced-mRNA offset (introns excluded) among the
        nucleotides of the stop codon.
        """
        return max(self.stop_codon_spliced_offsets)
437

438
    @memoized_property
    def coding_sequence(self):
        """
        Slice of the spliced cDNA sequence running from the first nucleotide
        of the start codon through the last nucleotide of the stop codon.
        Returns None when the transcript has no sequence.
        """
        full_sequence = self.sequence
        if full_sequence is None:
            return None

        first_offset = self.first_start_codon_spliced_offset
        last_offset = self.last_stop_codon_spliced_offset

        # Example: with the start codon at nucleotide offsets [3,4,5] and the
        # stop codon at [20,21,22] we get first_offset = 3, last_offset = 22.
        # Slices exclude their end index, hence the + 1 below.

        # pylint: disable=invalid-slice-index
        # TODO(tavi) Figure out why pylint is not happy with this slice
        return full_sequence[first_offset : last_offset + 1]
459

460
    @memoized_property
    def five_prime_utr_sequence(self):
        """
        cDNA sequence of 5' UTR
        (untranslated region at the beginning of the transcript)

        Returns None when the transcript has no sequence, mirroring
        coding_sequence, instead of failing while slicing None.
        """
        sequence = self.sequence
        if sequence is None:
            return None
        # pylint: disable=invalid-slice-index
        # TODO(tavi) Figure out why pylint is not happy with this slice
        return sequence[: self.first_start_codon_spliced_offset]
469

470
    @memoized_property
    def three_prime_utr_sequence(self):
        """
        cDNA sequence of 3' UTR
        (untranslated region at the end of the transcript)

        Returns None when the transcript has no sequence, mirroring
        coding_sequence, instead of failing while slicing None.
        """
        sequence = self.sequence
        if sequence is None:
            return None
        # the UTR starts right after the last nucleotide of the stop codon
        return sequence[self.last_stop_codon_spliced_offset + 1 :]
477

478
    @memoized_property
    def protein_id(self):
        """
        Protein ID found on the CDS entries of this transcript, or None
        if the query returns no result.
        """
        row = self.db.query_one(
            select_column_names=["protein_id"],
            filter_column="transcript_id",
            filter_value=self.id,
            feature="CDS",
            distinct=True,
            required=False,
        )
        if not row:
            return None
        return row[0]
492

493
    @memoized_property
    def protein_sequence(self):
        """
        Sequence stored in the genome's protein_sequences under this
        transcript's protein ID, or None if there is no protein ID.
        """
        if not self.protein_id:
            return None
        return self.genome.protein_sequences.get(self.protein_id)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc