• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

openvax / pyensembl / 25776025193

13 May 2026 03:14AM UTC coverage: 84.971% (+0.06%) from 84.907%
25776025193

push

github

web-flow
Fix #335 (part 2): tolerate versioned IDs in protein/transcript FASTA lookups (#350)

GENCODE GTFs embed the assembly version directly in the protein_id and
transcript_id attributes (e.g. protein_id "ENSP00000123456.3"), while
pyensembl's FASTA parser strips ENS.N suffixes from headers. The literal
lookup with the versioned ID therefore missed, and Transcript.protein_sequence
returned None for GENCODE FASTAs even when the sequence was present.

Adds sequence_data.sequence_lookup_with_ens_fallback() which tries the
literal id first (handles both unversioned Ensembl GTFs and any future
FASTA that preserves versions) and falls back to the version-stripped
form for ENS.N identifiers.

Wires the helper into:
- Transcript.sequence (previously stripped unconditionally; now tries
  versioned first then strips)
- Transcript.protein_sequence (previously missing for GENCODE GTFs)
- Genome.transcript_sequence(transcript_id)
- Genome.protein_sequence(protein_id)

Bump version to 2.9.6.

16 of 18 new or added lines in 4 files covered. (88.89%)

1 existing line in 1 file now uncovered.

1600 of 1883 relevant lines covered (84.97%)

3.4 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.04
/pyensembl/transcript.py
1
# Licensed under the Apache License, Version 2.0 (the "License");
2
# you may not use this file except in compliance with the License.
3
# You may obtain a copy of the License at
4
#
5
#     http://www.apache.org/licenses/LICENSE-2.0
6
#
7
# Unless required by applicable law or agreed to in writing, software
8
# distributed under the License is distributed on an "AS IS" BASIS,
9
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
# See the License for the specific language governing permissions and
11
# limitations under the License.
12

13
from memoized_property import memoized_property
4✔
14

15
from .common import memoize, merge_intervals
4✔
16
from .exon import Exon
4✔
17
from .locus_with_genome import LocusWithGenome
4✔
18
from .sequence_data import sequence_lookup_with_ens_fallback
4✔
19

20

21
# Back-compat alias for callers that imported the private helper.
22
_merge_ranges = merge_intervals
4✔
23

24

25
class Transcript(LocusWithGenome):
4✔
26
    """
27
    Transcript encompasses the locus, exons, and sequence of a transcript.
28

29
    Lazily fetches sequence in case we"re constructing many Transcripts
30
    and not using the sequence, avoid the memory/performance overhead
31
    of fetching and storing sequences from a FASTA file.
32
    """
33

34
    def __init__(
4✔
35
        self,
36
        transcript_id,
37
        transcript_name,
38
        contig,
39
        start,
40
        end,
41
        strand,
42
        biotype,
43
        gene_id,
44
        genome,
45
        support_level=None,
46
    ):
47
        LocusWithGenome.__init__(
4✔
48
            self,
49
            contig=contig,
50
            start=start,
51
            end=end,
52
            strand=strand,
53
            biotype=biotype,
54
            genome=genome,
55
        )
56
        self.transcript_id = transcript_id
4✔
57
        self.transcript_name = transcript_name
4✔
58
        self.gene_id = gene_id
4✔
59
        self.support_level = support_level
4✔
60

61
    @property
4✔
62
    def id(self):
4✔
63
        """
64
        Alias for transcript_id necessary for backward compatibility.
65
        """
66
        return self.transcript_id
4✔
67

68
    @property
4✔
69
    def name(self):
4✔
70
        """
71
        Alias for transcript_name necessary for backward compatibility.
72
        """
73
        return self.transcript_name
4✔
74

75
    def __str__(self):
4✔
76
        return (
4✔
77
            "Transcript(transcript_id='%s',"
78
            " transcript_name='%s',"
79
            " gene_id='%s',"
80
            " biotype='%s',"
81
            " contig='%s',"
82
            " start=%d,"
83
            " end=%d, strand='%s', genome='%s')"
84
        ) % (
85
            self.transcript_id,
86
            self.name,
87
            self.gene_id,
88
            self.biotype,
89
            self.contig,
90
            self.start,
91
            self.end,
92
            self.strand,
93
            self.genome.reference_name,
94
        )
95

96
    def __len__(self):
4✔
97
        """
98
        Length of a transcript is the sum of its exon lengths
99
        """
100
        return sum(len(exon) for exon in self.exons)
4✔
101

102
    def __eq__(self, other):
4✔
103
        return (
4✔
104
            other.__class__ is Transcript
105
            and self.id == other.id
106
            and self.genome == other.genome
107
        )
108

109
    def __hash__(self):
4✔
110
        return hash(self.id)
4✔
111

112
    def to_dict(self):
4✔
113
        state_dict = LocusWithGenome.to_dict(self)
4✔
114
        state_dict["transcript_id"] = self.transcript_id
4✔
115
        state_dict["transcript_name"] = self.name
4✔
116
        state_dict["gene_id"] = self.gene_id
4✔
117
        state_dict["support_level"] = self.support_level
4✔
118
        return state_dict
4✔
119

120
    @property
4✔
121
    def gene(self):
4✔
122
        return self.genome.gene_by_id(self.gene_id)
4✔
123

124
    @property
4✔
125
    def gene_name(self):
4✔
126
        return self.gene.name
4✔
127

128
    @property
4✔
129
    def exons(self):
4✔
130
        # need to look up exon_number alongside ID since each exon may
131
        # appear in multiple transcripts and have a different exon number
132
        # in each transcript.
133
        # Older or non-Ensembl GTFs may omit the exon_id attribute, in
134
        # which case we build Exon objects directly from the exon row
135
        # and synthesize a stable per-transcript ID.
136
        has_exon_id = self.db.column_exists("exon", "exon_id")
4✔
137
        if has_exon_id:
4✔
138
            columns = ["exon_number", "exon_id"]
4✔
139
        else:
140
            columns = ["exon_number", "seqname", "start", "end", "strand"]
4✔
141
        rows = self.db.query(
4✔
142
            columns, filter_column="transcript_id", filter_value=self.id, feature="exon"
143
        )
144

145
        # fill this list in its correct order (by exon_number) by using
146
        # the exon_number as a 1-based list offset
147
        exons = [None] * len(rows)
4✔
148

149
        for row in rows:
4✔
150
            exon_number = int(row[0])
4✔
151
            if exon_number < 1:
4✔
152
                raise ValueError("Invalid exon number: %s" % exon_number)
×
153
            elif exon_number > len(exons):
4✔
154
                raise ValueError(
×
155
                    "Invalid exon number: %s (max expected = %d)"
156
                    % (exon_number, len(exons))
157
                )
158

159
            if has_exon_id:
4✔
160
                exon_id = row[1]
4✔
161
                exon = self.genome.exon_by_id(exon_id)
4✔
162
                if exon is None:
4✔
163
                    raise ValueError(
×
164
                        "Missing exon %s for transcript %s" % (exon_number, self.id)
165
                    )
166
            else:
167
                _, seqname, start, end, strand = row
4✔
168
                exon = Exon(
4✔
169
                    exon_id="%s_exon_%d" % (self.id, exon_number),
170
                    contig=seqname,
171
                    start=start,
172
                    end=end,
173
                    strand=strand,
174
                    gene_name=self.gene_name,
175
                    gene_id=self.gene_id,
176
                )
177

178
            # exon_number is 1-based, convert to list index by subtracting 1
179
            exons[exon_number - 1] = exon
4✔
180
        return exons
4✔
181

182
    # possible annotations associated with transcripts
183
    _TRANSCRIPT_FEATURES = {"start_codon", "stop_codon", "UTR", "CDS"}
4✔
184

185
    @memoize
4✔
186
    def _transcript_feature_position_ranges(self, feature, required=True):
4✔
187
        """
188
        Find start/end chromosomal position range of features
189
        (such as start codon) for this transcript.
190
        """
191
        if feature not in self._TRANSCRIPT_FEATURES:
4✔
192
            raise ValueError("Invalid transcript feature: %s" % feature)
×
193

194
        results = self.db.query(
4✔
195
            select_column_names=["start", "end"],
196
            filter_column="transcript_id",
197
            filter_value=self.id,
198
            feature=feature,
199
        )
200

201
        if required and len(results) == 0:
4✔
202
            raise ValueError(
×
203
                "Transcript %s does not contain feature %s" % (self.id, feature)
204
            )
205
        return results
4✔
206

207
    @memoize
4✔
208
    def _transcript_feature_positions(self, feature):
4✔
209
        """
210
        Get unique positions for feature, raise an error if feature is absent.
211
        """
212
        ranges = self._transcript_feature_position_ranges(feature, required=True)
4✔
213
        results = []
4✔
214
        # a feature (such as a stop codon), maybe be split over multiple
215
        # contiguous ranges. Collect all the nucleotide positions into a
216
        # single list.
217
        for start, end in ranges:
4✔
218
            # since ranges are [inclusive, inclusive] and
219
            # Python ranges are [inclusive, exclusive) we have to increment
220
            # the end position
221
            for position in range(start, end + 1):
4✔
222
                if position in results:
4✔
223
                    raise ValueError(
×
224
                        "Repeated position %d for %s" % (position, feature)
225
                    )
226
                results.append(position)
4✔
227
        return results
4✔
228

229
    @memoize
4✔
230
    def _codon_positions(self, feature):
4✔
231
        """
232
        Parameters
233
        ----------
234
        feature : str
235
            Possible values are "start_codon" or "stop_codon"
236

237
        Returns list of three chromosomal positions.
238
        """
239
        results = self._transcript_feature_positions(feature)
4✔
240
        if len(results) != 3:
4✔
241
            raise ValueError(
×
242
                "Expected 3 positions for %s of %s but got %d"
243
                % (feature, self.id, len(results))
244
            )
245
        return results
4✔
246

247
    @memoized_property
4✔
248
    def contains_start_codon(self):
4✔
249
        """
250
        Does this transcript have an annotated start_codon entry?
251
        """
252
        start_codons = self._transcript_feature_position_ranges(
4✔
253
            "start_codon", required=False
254
        )
255
        return len(start_codons) > 0
4✔
256

257
    @memoized_property
4✔
258
    def contains_stop_codon(self):
4✔
259
        """
260
        Does this transcript have an annotated stop_codon entry?
261
        """
262
        stop_codons = self._transcript_feature_position_ranges(
4✔
263
            "stop_codon", required=False
264
        )
265
        return len(stop_codons) > 0
4✔
266

267
    @memoized_property
4✔
268
    def start_codon_complete(self):
4✔
269
        """
270
        Does the start codon span 3 genomic positions?
271
        """
272
        try:
4✔
273
            self._codon_positions("start_codon")
4✔
274
        except ValueError:
×
275
            return False
×
276
        return True
4✔
277

278
    @memoized_property
4✔
279
    def start_codon_positions(self):
4✔
280
        """
281
        Chromosomal positions of nucleotides in start codon.
282
        """
283
        return self._codon_positions("start_codon")
4✔
284

285
    @memoized_property
4✔
286
    def stop_codon_positions(self):
4✔
287
        """
288
        Chromosomal positions of nucleotides in stop codon.
289
        """
290
        return self._codon_positions("stop_codon")
4✔
291

292
    @memoized_property
4✔
293
    def exon_intervals(self):
4✔
294
        """List of (start,end) tuples for each exon of this transcript,
295
        in the order specified by the 'exon_number' column of the
296
        exon table.
297
        """
298
        results = self.db.query(
×
299
            select_column_names=["exon_number", "start", "end"],
300
            filter_column="transcript_id",
301
            filter_value=self.id,
302
            feature="exon",
303
        )
304
        sorted_intervals = [None] * len(results)
×
305
        for exon_number, start, end in results:
×
306
            sorted_intervals[int(exon_number) - 1] = (start, end)
×
307
        return sorted_intervals
×
308

309
    def spliced_offset(self, position):
4✔
310
        """
311
        Convert from an absolute chromosomal position to the offset into
312
        this transcript"s spliced mRNA.
313

314
        Position must be inside some exon (otherwise raise exception).
315
        """
316
        if type(position) is not int:
4✔
317
            raise TypeError(
×
318
                "Position argument must be an integer, got %s : %s"
319
                % (position, type(position))
320
            )
321

322
        if position < self.start or position > self.end:
4✔
323
            raise ValueError(
×
324
                "Invalid position: %d (must be between %d and %d)"
325
                % (position, self.start, self.end)
326
            )
327

328
        # offset from beginning of unspliced transcript (including introns)
329
        unspliced_offset = self.offset(position)
4✔
330
        total_spliced_offset = 0
4✔
331

332
        # traverse exons in order of their appearance on the strand
333
        # Since absolute positions may decrease if on the negative strand,
334
        # we instead use unspliced offsets to get always increasing indices.
335
        #
336
        # Example:
337
        #
338
        # Exon Name:                exon 1                exon 2
339
        # Spliced Offset:           123456                789...
340
        # Intron vs. Exon: ...iiiiiieeeeeeiiiiiiiiiiiiiiiieeeeeeiiiiiiiiiii...
341
        for exon in self.exons:
4✔
342
            exon_unspliced_start, exon_unspliced_end = self.offset_range(
4✔
343
                exon.start, exon.end
344
            )
345
            # If the relative position is not within this exon, keep a running
346
            # total of the total exonic length-so-far.
347
            #
348
            # Otherwise, if the relative position is within an exon, get its
349
            # offset into that exon by subtracting the exon"s relative start
350
            # position from the relative position. Add that to the total exonic
351
            # length-so-far.
352
            if exon_unspliced_start <= unspliced_offset <= exon_unspliced_end:
4✔
353
                # all offsets are base 0, can be used as indices into
354
                # sequence string
355
                exon_offset = unspliced_offset - exon_unspliced_start
4✔
356
                return total_spliced_offset + exon_offset
4✔
357
            else:
358
                exon_length = len(exon)  # exon_end_position - exon_start_position + 1
4✔
359
                total_spliced_offset += exon_length
4✔
360
        raise ValueError(
×
361
            "Couldn't find position %d on any exon of %s" % (position, self.id)
362
        )
363

364
    @memoized_property
4✔
365
    def start_codon_unspliced_offsets(self):
4✔
366
        """
367
        Offsets from start of unspliced pre-mRNA transcript
368
        of nucleotides in start codon.
369
        """
370
        return [self.offset(position) for position in self.start_codon_positions]
×
371

372
    @memoized_property
4✔
373
    def stop_codon_unspliced_offsets(self):
4✔
374
        """
375
        Offsets from start of unspliced pre-mRNA transcript
376
        of nucleotides in stop codon.
377
        """
378
        return [self.offset(position) for position in self.stop_codon_positions]
×
379

380
    def _contiguous_offsets(self, offsets):
4✔
381
        """
382
        Sorts the input list of integer offsets,
383
        ensures that values are contiguous.
384
        """
385
        offsets.sort()
4✔
386
        for i in range(len(offsets) - 1):
4✔
387
            if offsets[i] + 1 != offsets[i + 1]:
4✔
388
                raise ValueError("Offsets not contiguous: %s" % (offsets,))
×
389
        return offsets
4✔
390

391
    @memoized_property
4✔
392
    def start_codon_spliced_offsets(self):
4✔
393
        """
394
        Offsets from start of spliced mRNA transcript
395
        of nucleotides in start codon.
396
        """
397
        offsets = [
4✔
398
            self.spliced_offset(position) for position in self.start_codon_positions
399
        ]
400
        return self._contiguous_offsets(offsets)
4✔
401

402
    @memoized_property
4✔
403
    def stop_codon_spliced_offsets(self):
4✔
404
        """
405
        Offsets from start of spliced mRNA transcript
406
        of nucleotides in stop codon.
407
        """
408
        offsets = [
4✔
409
            self.spliced_offset(position) for position in self.stop_codon_positions
410
        ]
411
        return self._contiguous_offsets(offsets)
4✔
412

413
    @memoized_property
4✔
414
    def coding_sequence_position_ranges(self):
4✔
415
        """
416
        Return absolute chromosome position ranges for CDS fragments
417
        of this transcript, including the stop codon (which Ensembl
418
        encodes as a separate feature from the CDS).
419
        """
420
        ranges = list(self._transcript_feature_position_ranges("CDS"))
4✔
421
        if self.contains_stop_codon:
4✔
422
            ranges.extend(
4✔
423
                self._transcript_feature_position_ranges("stop_codon", required=False)
424
            )
425
        return merge_intervals(ranges)
4✔
426

427
    @memoized_property
4✔
428
    def complete(self):
4✔
429
        """
430
        Consider a transcript complete if it has start and stop codons and
431
        a coding sequence whose length is divisible by 3
432
        """
433
        return (
4✔
434
            self.contains_start_codon
435
            and self.start_codon_complete
436
            and self.contains_stop_codon
437
            and self.coding_sequence is not None
438
            and len(self.coding_sequence) % 3 == 0
439
        )
440

441
    @memoized_property
4✔
442
    def sequence(self):
4✔
443
        """
444
        Spliced cDNA sequence of transcript
445
        (includes 5" UTR, coding sequence, and 3" UTR)
446
        """
447
        return sequence_lookup_with_ens_fallback(
4✔
448
            self.genome.transcript_sequences, self.transcript_id
449
        )
450

451
    @memoized_property
4✔
452
    def first_start_codon_spliced_offset(self):
4✔
453
        """
454
        Offset of first nucleotide in start codon into the spliced mRNA
455
        (excluding introns)
456
        """
457
        start_offsets = self.start_codon_spliced_offsets
4✔
458
        return min(start_offsets)
4✔
459

460
    @memoized_property
4✔
461
    def last_stop_codon_spliced_offset(self):
4✔
462
        """
463
        Offset of last nucleotide in stop codon into the spliced mRNA
464
        (excluding introns)
465
        """
466
        stop_offsets = self.stop_codon_spliced_offsets
4✔
467
        return max(stop_offsets)
4✔
468

469
    @memoized_property
4✔
470
    def coding_sequence(self):
4✔
471
        """
472
        cDNA coding sequence (from start codon to stop codon, without
473
        any introns)
474
        """
475
        if self.sequence is None:
4✔
476
            return None
×
477

478
        # Some GTF annotations (e.g. fragments in Ensembl Plants) leave a
479
        # protein-coding transcript without a start_codon or stop_codon
480
        # feature. Return None rather than crashing when either endpoint of
481
        # the CDS cannot be located.
482
        if not self.contains_start_codon or not self.contains_stop_codon:
4✔
483
            return None
4✔
484

485
        start = self.first_start_codon_spliced_offset
4✔
486
        end = self.last_stop_codon_spliced_offset
4✔
487

488
        # If start codon is the at nucleotide offsets [3,4,5] and
489
        # stop codon is at nucleotide offsets  [20,21,22]
490
        # then start = 3 and end = 22.
491
        #
492
        # Adding 1 to end since Python uses non-inclusive ends in slices/ranges.
493

494
        # pylint: disable=invalid-slice-index
495
        # TODO(tavi) Figure out pylint is not happy with this slice
496
        return self.sequence[start : end + 1]
4✔
497

498
    @memoized_property
4✔
499
    def five_prime_utr_sequence(self):
4✔
500
        """
501
        cDNA sequence of 5' UTR
502
        (untranslated region at the beginning of the transcript)
503
        """
504
        if self.sequence is None or not self.contains_start_codon:
4✔
505
            return None
4✔
506
        # pylint: disable=invalid-slice-index
507
        # TODO(tavi) Figure out pylint is not happy with this slice
508
        return self.sequence[: self.first_start_codon_spliced_offset]
4✔
509

510
    @memoized_property
4✔
511
    def three_prime_utr_sequence(self):
4✔
512
        """
513
        cDNA sequence of 3' UTR
514
        (untranslated region at the end of the transcript)
515
        """
516
        if self.sequence is None or not self.contains_stop_codon:
4✔
517
            return None
4✔
518
        return self.sequence[self.last_stop_codon_spliced_offset + 1 :]
4✔
519

520
    @memoized_property
4✔
521
    def transcript_version(self):
4✔
522
        """
523
        Ensembl annotation version for this transcript (int) or None if the
524
        GTF did not provide a transcript_version attribute.
525
        """
526
        if not self.db.column_exists("transcript", "transcript_version"):
4✔
527
            return None
4✔
528
        result = self.db.query_one(
4✔
529
            select_column_names=["transcript_version"],
530
            filter_column="transcript_id",
531
            filter_value=self.id,
532
            feature="transcript",
533
            required=False,
534
        )
535
        if not result or not result[0]:
4✔
536
            return None
×
537
        return int(result[0])
4✔
538

539
    @property
4✔
540
    def version(self):
4✔
541
        """Alias for :attr:`transcript_version`."""
542
        return self.transcript_version
4✔
543

544
    @property
4✔
545
    def versioned_transcript_id(self):
4✔
546
        """``transcript_id.transcript_version`` when available, else ``transcript_id``."""
547
        if self.transcript_version is None:
4✔
548
            return self.transcript_id
4✔
549
        return "%s.%d" % (self.transcript_id, self.transcript_version)
4✔
550

551
    @property
4✔
552
    def versioned_id(self):
4✔
553
        """Alias for :attr:`versioned_transcript_id`."""
554
        return self.versioned_transcript_id
4✔
555

556
    @memoized_property
4✔
557
    def protein_id(self):
4✔
558
        result_tuple = self.db.query_one(
4✔
559
            select_column_names=["protein_id"],
560
            filter_column="transcript_id",
561
            filter_value=self.id,
562
            feature="CDS",
563
            distinct=True,
564
            required=False,
565
        )
566
        if result_tuple:
4✔
567
            return result_tuple[0]
4✔
568
        else:
569
            return None
4✔
570

571
    @memoized_property
4✔
572
    def protein(self):
4✔
573
        """
574
        :class:`Protein` view object for this transcript's encoded protein,
575
        or ``None`` if this transcript is non-coding.
576
        """
577
        from .protein import Protein
4✔
578

579
        if not self.protein_id:
4✔
580
            return None
4✔
581
        protein_version = None
4✔
582
        if self.db.column_exists("CDS", "protein_version"):
4✔
583
            result = self.db.query_one(
4✔
584
                select_column_names=["protein_version"],
585
                filter_column="transcript_id",
586
                filter_value=self.id,
587
                feature="CDS",
588
                distinct=True,
589
                required=False,
590
            )
591
            if result and result[0]:
4✔
592
                protein_version = int(result[0])
4✔
593
        return Protein(
4✔
594
            protein_id=self.protein_id, protein_version=protein_version
595
        )
596

597
    @memoized_property
4✔
598
    def protein_sequence(self):
4✔
599
        if not self.protein_id:
4✔
UNCOV
600
            return None
×
601
        return sequence_lookup_with_ens_fallback(
4✔
602
            self.genome.protein_sequences, self.protein_id
603
        )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc