• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

rmcar17 / cogent3 / 17852834425

19 Sep 2025 08:22AM UTC coverage: 90.681% (+0.009%) from 90.672%
17852834425

push

github

rmcar17
TST: Convert dict views to lists

28257 of 31161 relevant lines covered (90.68%)

5.44 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.51
/src/cogent3/core/sequence.py
1
"""Contains classes that represent biological sequence data.
2

3
Notes
4
-----
5
These should be created via MolType.make_seq()
6
"""
7

8
from __future__ import annotations
6✔
9

10
import contextlib
6✔
11
import json
6✔
12
import re
6✔
13
import warnings
6✔
14
from abc import ABC, abstractmethod
6✔
15
from collections import defaultdict
6✔
16
from functools import total_ordering
6✔
17
from operator import eq, ne
6✔
18
from random import shuffle
6✔
19
from typing import TYPE_CHECKING, Any, SupportsIndex, cast
6✔
20

21
import numba
6✔
22
import numpy
6✔
23
import numpy.typing as npt
6✔
24
from typing_extensions import Self
6✔
25

26
from cogent3._version import __version__
6✔
27
from cogent3.core import alphabet as c3_alphabet
6✔
28
from cogent3.core import genetic_code as c3_genetic_code
6✔
29
from cogent3.core import moltype as c3_moltype
6✔
30
from cogent3.core.annotation import Feature
6✔
31
from cogent3.core.annotation_db import (
6✔
32
    AnnotatableMixin,
33
    FeatureDataType,
34
    SqliteAnnotationDbMixin,
35
    SupportsFeatures,
36
)
37
from cogent3.core.info import Info as InfoClass
6✔
38
from cogent3.core.location import (
6✔
39
    FeatureMap,
40
    IndelMap,
41
    LostSpan,
42
    Span,
43
    Strand,
44
    _input_vals_neg_step,
45
    _input_vals_pos_step,
46
    _LostSpan,
47
)
48
from cogent3.format.fasta import seqs_to_fasta
6✔
49
from cogent3.maths.stats.contingency import CategoryCounts, TestResult
6✔
50
from cogent3.maths.stats.number import CategoryCounter
6✔
51
from cogent3.util.deserialise import register_deserialiser
6✔
52
from cogent3.util.dict_array import DictArray
6✔
53
from cogent3.util.misc import (
6✔
54
    DistanceFromMatrix,
55
    get_object_provenance,
56
    get_setting_from_environ,
57
    is_float,
58
    is_int,
59
)
60
from cogent3.util.transform import for_seq, per_shortest
6✔
61

62
if TYPE_CHECKING:
63
    from collections.abc import Callable, Iterable, Iterator, Mapping
64

65
    from cogent3.core.alignment import Aligned
66
    from cogent3.draw.drawable import Drawable, Shape
67

68
NumpyIntArrayType = npt.NDArray[numpy.integer]
6✔
69

70

71
# standard distance functions: left  because generally useful
72
frac_same = for_seq(f=eq, aggregator=sum, normalizer=per_shortest)
6✔
73
frac_diff = for_seq(f=ne, aggregator=sum, normalizer=per_shortest)
6✔
74

75

76
def _moltype_seq_from_rich_dict(
6✔
77
    data: dict[str, str | bytes | NumpyIntArrayType | dict[str, str]],
78
) -> tuple[c3_moltype.MolType[Any], str | bytes | NumpyIntArrayType]:
79
    """returns moltype and seq and mutates data so it can serve as kwargs to Sequence constructor"""
80
    data.pop("type")
6✔
81
    data.pop("version")
6✔
82
    data.pop("annotation_db", None)
6✔
83
    data.pop("annotation_offset", 0)
6✔
84

85
    moltype: c3_moltype.MolTypeLiteral | c3_moltype.MolType[Any] = cast(
6✔
86
        "c3_moltype.MolTypeLiteral", data.pop("moltype")
87
    )
88
    moltype = c3_moltype.get_moltype(moltype)
6✔
89

90
    seq = cast("str | bytes | NumpyIntArrayType", data.pop("seq"))
6✔
91
    return moltype, seq
6✔
92

93

94
@numba.jit(cache=True, nogil=True)
95
def count_kmers(
96
    seq: npt.NDArray[numpy.uint8],
97
    num_states: int,
98
    k: int,
99
    dtype: npt.DTypeLike = numpy.uint64,
100
) -> NumpyIntArrayType:  # pragma: no cover
101
    """return freqs of valid k-mers using 1D indices
102

103
    Parameters
104
    ----------
105
    seq
106
        numpy array of uint8, assumed that canonical characters have
107
        indexes which are all < num_states
108
    num_states
109
        defines range of possible ints at a position
110
    k
111
        k-mer size
112
    dtype
113
        numpy dtype for the returned array
114
    """
115
    coeffs = c3_alphabet.coord_conversion_coeffs(num_states, k)
116
    kfreqs = numpy.zeros(num_states**k, dtype=dtype)
117
    if len(seq) < k:
118
        return kfreqs
119

120
    skip_until = 0
121
    for i in range(k):
122
        if seq[i] >= num_states:
123
            skip_until = i + 1
124

125
    idx = -1  # -1 means an invalid index
126
    biggest_coeff = coeffs[0]
127

128
    for i in range(len(seq) - k + 1):
129
        gained_char = seq[i + k - 1]
130
        if gained_char >= num_states:
131
            # we reset the kmer index to invalid
132
            # until we get a kmer with only canonical chars
133
            idx = -1
134
            skip_until = i + k
135

136
        if i < skip_until:
137
            continue
138

139
        if idx < 0:
140
            idx = (seq[i : i + k] * coeffs).sum()
141
            kfreqs[idx] += 1
142
            continue
143

144
        dropped_char = seq[i - 1]
145
        idx = (idx - dropped_char * biggest_coeff) * num_states + gained_char
146
        kfreqs[idx] += 1
147

148
    return kfreqs
149

150

151
# This is a design note about the annotation_offset attribute on sequences
152
#  and the related offset attribute on the sequence view.
153

154
# The SeqView object is responsible for performing slices and keeping track of
155
#  those slice operations relative to a parent. This is done via the start, stop
156
#  and step attributes.
157

158
# The annotation_offset attribute is intended to facilitate relating sequences
159
#  to annotations stored in coordinates of an annotated parent sequence. Consider
160
#  for example a gene that lies on a chromosome. If a user has an instance of that
161
#  gene's sequence in a sequence object, then the annotation_offset would be the
162
#  start position of that gene on the plus strand of the original chromosome.
163

164
# It's the case that an annotation_offset can only be specified by the user as
165
#  an argument to the sequence constructor. The design decision was made to
166
#  have the responsibility for providing the coordinates on the parent lie
167
#  with the SeqView class. These values are provided by the parent_start,
168
#  parent_stop and absolute_position methods.
169

170
# The state flow is in the constructor for the sequence class. There is
171
#  an assignment of the annotation_offset to the provided sequence_view.
172

173
# The only time the offset value is not at the SeqView level is the
174
#  sequence object is being serialised.
175

176

177
@total_ordering
6✔
178
class Sequence(AnnotatableMixin):
6✔
179
    """Holds the standard Sequence object. Immutable.
180

181
    Notes
182
    -----
183
    Sequences should be constructed by a MolType instance.
184
    """
185

186
    __slots__ = (
6✔
187
        "_annotation_db",
188
        "_repr_policy",
189
        "_seq",
190
        "info",
191
        "moltype",
192
        "name",
193
    )
194

195
    def __init__(
6✔
196
        self,
197
        moltype: c3_moltype.MolType[Any],
198
        seq: str | bytes | NumpyIntArrayType | SeqViewABC,
199
        *,
200
        name: str | None = None,
201
        info: dict[str, Any] | InfoClass | None = None,
202
        annotation_offset: int = 0,
203
        annotation_db: SupportsFeatures | None = None,
204
    ) -> None:
205
        """Initialize a sequence.
206

207
        Parameters
208
        ----------
209
        moltype
210
            MolType instance
211
        seq
212
            the raw sequence string, default is ''
213
        name
214
            the sequence name
215
        info
216
            Info object or dict
217
        annotation_offset
218
            integer indicating start position relative to annotations
219
        annotation_db
220
            optional annotation database
221

222
        Notes
223
        -----
224
        If a user attempts to add a feature and the annotation_db is None,
225
        a default BasicAnnotationDb instance will be created and used.
226
        """
227
        self.moltype = moltype
6✔
228
        self.name = name
6✔
229
        self._seq = _coerce_to_seqview(
6✔
230
            seq,
231
            name,
232
            self.moltype.most_degen_alphabet(),
233
            annotation_offset,
234
        )
235

236
        self.info = InfoClass(**(info or {}))
6✔
237
        self._repr_policy = {"num_pos": 60}
6✔
238
        self._annotation_db: list[SupportsFeatures] = self._init_annot_db_value(
6✔
239
            annotation_db
240
        )
241

242
    def __str__(self) -> str:
6✔
243
        result = numpy.array(self)
6✔
244
        return self.moltype.most_degen_alphabet().from_indices(result)
6✔
245

246
    def __bytes__(self) -> bytes:
6✔
247
        return str(self).encode("utf8")
6✔
248

249
    def __array__(
6✔
250
        self,
251
        dtype: type[numpy.integer] | None = None,
252
        copy: bool | None = None,
253
    ) -> NumpyIntArrayType:
254
        # using the array_value attribute means we can have
255
        # a aligned or seq data view here and the outcome will be the
256
        # same -- just the sequence is returned
257
        result = self._seq.array_value
6✔
258
        if self._seq.slice_record.is_reversed:
6✔
259
            with contextlib.suppress(TypeError):
6✔
260
                result = self.moltype.complement(result)
6✔
261
        return result
6✔
262

263
    def to_array(self, apply_transforms: bool = True) -> NumpyIntArrayType:
6✔
264
        """returns the numpy array
265

266
        Parameters
267
        ----------
268
        apply_transforms
269
            if True, applies any reverse complement operation
270

271
        Notes
272
        -----
273
        Use this method with apply_transforms=False if you are
274
        creating data for storage in a SeqData instance.
275
        """
276
        if apply_transforms:
6✔
277
            return numpy.array(self)
6✔
278

279
        arr = self._seq.array_value
6✔
280
        if self._seq.is_reversed:
6✔
281
            # the reversal will have been applied in the SeqView
282
            # array_value method, so we undo that here.
283
            arr = arr[::-1]
6✔
284

285
        return arr
6✔
286

287
    def to_fasta(
6✔
288
        self,
289
        make_seqlabel: Callable[[Sequence], str] | None = None,
290
        block_size: int = 60,
291
    ) -> str:
292
        """Return string of self in FASTA format, no trailing newline
293

294
        Parameters
295
        ----------
296
        make_seqlabel
297
            callback function that takes the seq object and
298
            returns a label str
299

300
        """
301

302
        label = "0"
6✔
303

304
        if make_seqlabel is not None:
6✔
305
            label = make_seqlabel(self)
×
306
        elif hasattr(self, "label") and self.label:
6✔
307
            label = self.label
×
308
        elif hasattr(self, "name") and self.name:
6✔
309
            label = self.name
6✔
310
        return seqs_to_fasta({label: str(self)}, block_size=block_size)
6✔
311

312
    def to_phylip(self, name_len: int = 28, label_len: int = 30) -> str:
6✔
313
        """Return string of self in one line for PHYLIP, no newline.
314

315
        Default: max name length is 28, label length is 30.
316
        """
317
        return str(self.name)[:name_len].ljust(label_len) + str(self)
6✔
318

319
    def to_rich_dict(
6✔
320
        self,
321
        exclude_annotations: bool = True,
322
    ) -> dict[str, str | dict[str, str]]:
323
        """returns {'name': name, 'seq': sequence, 'moltype': moltype.label}
324

325
        Notes
326
        -----
327
        Deserialisation of the sequence object will not include the annotation_db
328
        even if exclude_annotations=False.
329
        """
330
        info: InfoClass | dict[str, Any] = {} if self.info is None else self.info
6✔
331
        if info.get("Refs", None) is not None and "Refs" in info:
6✔
332
            info.pop("Refs")
6✔
333

334
        data: dict[str, Any] = {
6✔
335
            "name": self.name,
336
            "seq": str(self),
337
            "moltype": self.moltype.label,
338
            "info": info or None,
339
            "type": get_object_provenance(self),
340
            "version": __version__,
341
        }
342
        if hasattr(self, "annotation_offset"):
6✔
343
            offset = int(self._seq.slice_record.parent_start)
6✔
344
            data |= {"annotation_offset": offset}
6✔
345

346
        if (
6✔
347
            hasattr(self, "annotation_db")
348
            and self.annotation_db
349
            and not exclude_annotations
350
        ):
351
            data["annotation_db"] = self.annotation_db.to_rich_dict()
×
352

353
        return data
6✔
354

355
    @classmethod
6✔
356
    def from_rich_dict(cls, data: dict[str, Any]) -> Sequence:
6✔
357
        """create a Sequence object from a rich dict"""
358
        if isinstance(data["seq"], dict):
6✔
359
            # this is a rich dict from the old type sequences
360
            data["seq"] = data.pop("seq")["init_args"]["seq"]
6✔
361

362
        moltype, seq = _moltype_seq_from_rich_dict(data)
6✔
363

364
        return cls(moltype=moltype, seq=seq, **data)
6✔
365

366
    def to_json(self) -> str:
6✔
367
        """returns a json formatted string"""
368
        return json.dumps(self.to_rich_dict())
6✔
369

370
    def count(self, item: str) -> int:
6✔
371
        """count() delegates to self._seq."""
372
        return str(self).count(item)
×
373

374
    def counts(
6✔
375
        self,
376
        motif_length: int = 1,
377
        include_ambiguity: bool = False,
378
        allow_gap: bool = False,
379
        warn: bool = False,
380
    ) -> CategoryCounter[str | bytes]:
381
        """returns dict of counts of motifs
382

383
        only non-overlapping motifs are counted.
384

385
        Parameters
386
        ----------
387
        motif_length
388
            number of elements per character.
389
        include_ambiguity
390
            if True, motifs containing ambiguous characters
391
            from the seq moltype are included. No expansion of those is attempted.
392
        allow_gaps
393
            if True, motifs containing a gap character are included.
394
        warn
395
            warns if motif_length > 1 and alignment trimmed to produce
396
            motif columns
397
        """
398
        data = numpy.array(self)
6✔
399
        if not len(data):
6✔
400
            return CategoryCounter()
6✔
401

402
        if motif_length != 1:
6✔
403
            if warn and len(data) % motif_length != 0:
6✔
404
                warnings.warn(
×
405
                    f"{self.name} length not divisible by {motif_length}, truncating",
406
                    stacklevel=2,
407
                )
408
            limit = (len(data) // motif_length) * motif_length
6✔
409
            data = data[:limit]
6✔
410
            data = data.reshape(-1, motif_length)
6✔
411

412
        unique_values, counts = numpy.unique(data, axis=0, return_counts=True)
6✔
413
        if motif_length == 1:
6✔
414
            unique_values = unique_values[:, None]
6✔
415

416
        indices = numpy.zeros(unique_values.shape[0], dtype=bool)
6✔
417
        if not allow_gap:
6✔
418
            indices |= numpy.apply_along_axis(
6✔
419
                self.moltype.is_gapped, axis=1, arr=unique_values
420
            )
421

422
        if not include_ambiguity:
6✔
423
            indices |= numpy.apply_along_axis(
6✔
424
                self.moltype.is_degenerate, axis=1, arr=unique_values
425
            )
426

427
        unique_values = unique_values[~indices]
6✔
428
        counts = counts[~indices]
6✔
429

430
        result: dict[str | bytes, int] = {}
6✔
431
        alpha = self.moltype.most_degen_alphabet()
6✔
432
        # The following approach is used because of bytes moltype.
433
        # We can't use the from_indices method on the bytes moltype
434
        # as that creates a string, but individual elements are
435
        # bytes, not all of which can be decoded. Thus we get an
436
        # empty instance and use join.
437
        monomer_type: str | bytes = type(alpha[0])()
6✔
438
        for motif, count in zip(unique_values, counts, strict=True):
6✔
439
            key = monomer_type.join([alpha[i] for i in motif])
6✔
440
            result[key] = int(count)
6✔
441

442
        return CategoryCounter(result)
6✔
443

444
    def count_ambiguous(self) -> int:
6✔
445
        """Returns the number of ambiguous characters in the sequence."""
446
        data = numpy.array(self)
6✔
447
        gap_index = cast("int", self.moltype.most_degen_alphabet().gap_index)
6✔
448
        return int(numpy.sum(data > gap_index))
6✔
449

450
    def count_kmers(self, k: int = 1) -> NumpyIntArrayType:
6✔
451
        """return array of counts of all possible kmers of length k
452

453
        Notes
454
        -----
455
        Only states in moltype.alphabet are allowed in a kmer (the canonical
456
        states). To get the order of kmers as strings, use
457
        self.moltype.alphabet.get_kmer_alphabet(k)
458
        """
459
        seqarray = cast(
6✔
460
            "npt.NDArray[numpy.uint8]",
461
            numpy.array(self),
462
        )
463
        return count_kmers(seqarray, len(self.moltype.alphabet), k)
6✔
464

465
    def __lt__(self, other: Sequence) -> bool:
6✔
466
        """compares based on the sequence string."""
467
        return str(self) < str(other)
×
468

469
    def __eq__(self, other: object) -> bool:
6✔
470
        """compares based on the sequence string."""
471
        return str(self) == str(other)
6✔
472

473
    def __ne__(self, other: object) -> bool:
6✔
474
        """compares based on the sequence string."""
475
        return str(self) != str(other)
6✔
476

477
    def __hash__(self) -> int:
6✔
478
        """__hash__ behaves like the sequence string for dict lookup."""
479
        return hash(str(self))
×
480

481
    def __contains__(self, other: object) -> bool:
6✔
482
        """__contains__ checks whether other is in the sequence string."""
483
        if not isinstance(other, str):
6✔
484
            msg = f"Must use type str for __contains__, got {type(other)}."
×
485
            raise TypeError(msg)
×
486
        return other in str(self)
6✔
487

488
    def shuffle(self) -> Self:
6✔
489
        """returns a randomized copy of the Sequence object"""
490
        randomized_copy_list = list(self)
6✔
491
        shuffle(randomized_copy_list)
6✔
492
        return self.__class__(
6✔
493
            moltype=self.moltype,
494
            seq="".join(randomized_copy_list),
495
            info=self.info,
496
        )
497

498
    def strip_degenerate(self) -> Self:
6✔
499
        """Removes degenerate bases by stripping them out of the sequence."""
500
        return self.__class__(
6✔
501
            moltype=self.moltype,
502
            seq=self.moltype.strip_degenerate(bytes(self)),
503
            info=self.info,
504
        )
505

506
    def strip_bad(self) -> Self:
6✔
507
        """Removes any symbols not in the alphabet."""
508
        return self.__class__(
6✔
509
            moltype=self.moltype,
510
            seq=self.moltype.strip_bad(str(self)),
511
            info=self.info,
512
        )
513

514
    def strip_bad_and_gaps(self) -> Self:
6✔
515
        """Removes any symbols not in the alphabet, and any gaps. As the missing
516
        character could be a gap, this method will remove it as well."""
517
        return self.__class__(
6✔
518
            moltype=self.moltype,
519
            seq=self.moltype.strip_bad_and_gaps(str(self)),
520
            info=self.info,
521
        )
522

523
    def is_gapped(self) -> bool:
6✔
524
        """Returns True if sequence contains gaps."""
525
        return self.moltype.is_gapped(str(self))
6✔
526

527
    def is_degenerate(self) -> bool:
6✔
528
        """Returns True if sequence contains degenerate characters."""
529
        return self.moltype.is_degenerate(str(self))
6✔
530

531
    def is_valid(self) -> bool:
6✔
532
        """Returns True if sequence contains no items absent from alphabet."""
533
        return self.moltype.is_valid(numpy.array(self))
6✔
534

535
    def is_strict(self) -> bool:
6✔
536
        """Returns True if sequence contains only monomers."""
537
        return self.moltype.alphabet.is_valid(numpy.array(self))
6✔
538

539
    def disambiguate(self, method: str = "strip") -> Self:
6✔
540
        """Returns a non-degenerate sequence from a degenerate one.
541

542
        Parameters
543
        ----------
544
        seq
545
            the sequence to be disambiguated
546
        method
547
            how to disambiguate the sequence, one of "strip", "random"
548
            strip: deletes any characters not in monomers or gaps
549
            random: assigns the possibilities at random, using equal frequencies
550
        """
551
        return self.__class__(
6✔
552
            moltype=self.moltype,
553
            seq=self.moltype.disambiguate(str(self), method),
554
            info=self.info,
555
        )
556

557
    def degap(self) -> Self:
6✔
558
        """Deletes all gap characters from sequence."""
559
        result = self.__class__(
6✔
560
            moltype=self.moltype,
561
            seq=self.moltype.degap(bytes(self)),
562
            name=self.name,
563
            info=self.info,
564
        )
565
        result.annotation_db = self.annotation_db
6✔
566
        return result
6✔
567

568
    def gap_indices(self) -> NumpyIntArrayType:
6✔
569
        """Returns array of the indices of all gaps in the sequence"""
570
        return numpy.where(self.gap_vector())[0]
6✔
571

572
    def gap_vector(self) -> list[bool]:
6✔
573
        """Returns vector of True or False according to which pos are gaps or missing."""
574
        degen_gapped_alphabet = cast(
6✔
575
            "c3_alphabet.CharAlphabet[Any]", self.moltype.degen_gapped_alphabet
576
        )
577
        return (
6✔
578
            (numpy.array(self) == degen_gapped_alphabet.gap_index)
579
            | (numpy.array(self) == degen_gapped_alphabet.missing_index)
580
        ).tolist()
581

582
    def count_gaps(self) -> int:
6✔
583
        """Counts the gaps in the specified sequence."""
584
        return self.moltype.count_gaps(bytes(self))
6✔
585

586
    def count_degenerate(self) -> int:
6✔
587
        """Counts the degenerate bases in the specified sequence.
588

589
        Notes
590
        -----
591
        gap and missing characters are counted as degenerate.
592
        """
593
        # design: refactor
594
        # should gap and missing characters be counted as degenerate?
595
        return self.moltype.count_degenerate(bytes(self))
6✔
596

597
    def count_variants(self) -> int:
6✔
598
        """Counts number of possible sequences matching the sequence, given
599
        any ambiguous characters in the sequence.
600

601
        Notes
602
        -----
603
        Uses self.ambiguitues to decide how many possibilities there are at
604
        each position in the sequence and calculates the permutations.
605
        """
606
        return self.moltype.count_variants(str(self))
6✔
607

608
    def mw(self, method: str = "random", delta: float | None = None) -> float:
6✔
609
        """Returns the molecular weight of (one strand of) the sequence.
610

611
        Parameters
612
        ----------
613
        method
614
            If the sequence is ambiguous, uses method (random or strip) to
615
            disambiguate the sequence.
616
        delta
617
            If delta is passed in, adds delta per strand. Default is None, which
618
            uses the alphabet default. Typically, this adds 18 Da for terminal
619
            water. However, note that the default nucleic acid weight assumes
620
            5' monophosphate and 3' OH: pass in delta=18.0 if you want 5' OH as
621
            well.
622

623
        Notes
624
        -----
625
        this method only calculates the MW of the coding strand. If you want
626
        the MW of the reverse strand, add self.rc().mw(). DO NOT just multiply
627
        the MW by 2: the results may not be accurate due to strand bias, e.g.
628
        in mitochondrial genomes.
629
        """
630
        return self.moltype.mw(str(self), method, delta)
6✔
631

632
    def can_match(self, other: Self) -> bool:
6✔
633
        """Returns True if every pos in self could match same pos in other.
634

635
        Truncates at length of shorter sequence.
636
        gaps are only allowed to match other gaps.
637
        """
638
        return self.moltype.can_match(str(self), str(other))
6✔
639

640
    def diff(self, other: Self) -> float:
6✔
641
        """Returns number of differences between self and other.
642

643
        Notes
644
        -----
645
        Truncates at the length of the shorter sequence.
646
        """
647
        return self.distance(other)
6✔
648

649
    def distance(
6✔
650
        self,
651
        other: Self,
652
        function: Callable[[str, str], float] | None = None,
653
    ) -> float:
654
        """Returns distance between self and other using function(i,j).
655

656
        Parameters
657
        ----------
658
        other
659
            a sequence to compare to self
660
        function
661
            takes two seq residues and returns a number. To turn a 2D matrix into
662
            a function, use cogent3.util.miscs.DistanceFromMatrix(matrix).
663

664
        Notes
665
        -----
666
        Truncates at the length of the shorter sequence.
667

668
        The function acts on two _elements_ of the sequences, not the two
669
        sequences themselves (i.e. the behavior will be the same for every
670
        position in the sequences, such as identity scoring or a function
671
        derived from a distance matrix as suggested above). One limitation of
672
        this approach is that the distance function cannot use properties of
673
        the sequences themselves: for example, it cannot use the lengths of the
674
        sequences to normalize the scores as percent similarities or percent
675
        differences.
676

677
        If you want functions that act on the two sequences themselves, there
678
        is no particular advantage in making these functions methods of the
679
        first sequences by passing them in as parameters like the function
680
        in this method. It makes more sense to use them as standalone functions.
681
        The factory function cogent3.util.transform.for_seq is useful for
682
        converting per-element functions into per-sequence functions, since it
683
        takes as parameters a per-element scoring function, a score aggregation
684
        function, and a normalization function (which itself takes the two
685
        sequences as parameters), returning a single function that combines
686
        these functions and that acts on two complete sequences.
687
        """
688
        if function is None:
6✔
689
            # use identity scoring function
690
            def function(a: str, b: str) -> bool:
6✔
691
                return a != b
6✔
692

693
        distance: float = 0
6✔
694
        for first, second in zip(self, other, strict=False):
6✔
695
            distance += function(first, second)
6✔
696
        return distance
6✔
697

698
    def matrix_distance(
6✔
699
        self,
700
        other: Self,
701
        matrix: Mapping[str, Mapping[str, float]],
702
    ) -> float:
703
        """Returns distance between self and other using a score matrix.
704

705
        Warnings
706
        --------
707
        The matrix must explicitly contain scores for the case where
708
        a position is the same in self and other (e.g. for a distance matrix,
709
        an identity between U and U might have a score of 0). The reason the
710
        scores for the 'diagonals' need to be passed explicitly is that for
711
        some kinds of distance matrices, e.g. log-odds matrices, the 'diagonal'
712
        scores differ from each other. If these elements are missing, this
713
        function will raise a KeyError at the first position that the two
714
        sequences are identical.
715
        """
716
        return self.distance(other, DistanceFromMatrix(matrix))
6✔
717

718
    def frac_same(self, other: Self) -> float:
6✔
719
        """Returns fraction of positions where self and other are the same.
720

721
        Notes
722
        -----
723
        Truncates at length of shorter sequence. Will return  0 if one sequence
724
        is empty.
725
        """
726
        return frac_same(self, other)
6✔
727

728
    def frac_diff(self, other: Self) -> float:
6✔
729
        """Returns fraction of positions where self and other differ.
730

731
        Notes
732
        -----
733
        Truncates at length of shorter sequence. Will return  0 if one sequence
734
        is empty.
735
        """
736
        return frac_diff(self, other)
6✔
737

738
    def frac_same_gaps(self, other: Self) -> float:
6✔
739
        """Returns fraction of positions where self and other share gap states.
740

741
        In other words, if self and other are both all gaps, or both all
742
        non-gaps, or both have gaps in the same places, frac_same_gaps will
743
        return 1.0. If self is all gaps and other has no gaps, frac_same_gaps
744
        will return 0.0. Returns 0 if one sequence is empty.
745

746
        Uses self's gap characters for both sequences.
747
        """
748
        if not self or not other:
6✔
749
            return 0.0
6✔
750

751
        is_gap = self.moltype.gaps.__contains__
6✔
752
        return sum(
6✔
753
            [is_gap(i) == is_gap(j) for i, j in zip(self, other, strict=False)],
754
        ) / min(
755
            len(self),
756
            len(other),
757
        )
758

759
    def frac_diff_gaps(self, other: Self) -> float:
6✔
760
        """Returns frac. of positions where self and other's gap states differ.
761

762
        In other words, if self and other are both all gaps, or both all
763
        non-gaps, or both have gaps in the same places, frac_diff_gaps will
764
        return 0.0. If self is all gaps and other has no gaps, frac_diff_gaps
765
        will return 1.0.
766

767
        Returns 0 if one sequence is empty.
768

769
        Uses self's gap characters for both sequences.
770
        """
771
        if not self or not other:
6✔
772
            return 0.0
6✔
773
        return 1.0 - self.frac_same_gaps(other)
6✔
774

775
    def frac_same_non_gaps(self, other: Self) -> float:
6✔
776
        """Returns fraction of non-gap positions where self matches other.
777

778
        Doesn't count any position where self or other has a gap.
779
        Truncates at the length of the shorter sequence.
780

781
        Returns 0 if one sequence is empty.
782
        """
783
        # refactor: simplify
784
        # refactor: array - make use of self._seq.array_value
785
        if not self or not other:
6✔
786
            return 0.0
6✔
787

788
        is_gap = self.moltype.gaps.__contains__
6✔
789
        count = 0
6✔
790
        identities = 0
6✔
791
        for i, j in zip(self, other, strict=False):
6✔
792
            if is_gap(i) or is_gap(j):
6✔
793
                continue
6✔
794
            count += 1
6✔
795
            if i == j:
6✔
796
                identities += 1
6✔
797

798
        if count:
6✔
799
            return identities / count
6✔
800
        # there were no positions that weren't gaps
801
        return 0
6✔
802

803
    def frac_diff_non_gaps(self, other: Self) -> float:
6✔
804
        """Returns fraction of non-gap positions where self differs from other.
805

806
        Doesn't count any position where self or other has a gap.
807
        Truncates at the length of the shorter sequence.
808

809
        Returns 0 if one sequence is empty. Note that this means that
810
        frac_diff_non_gaps is _not_ the same as 1 - frac_same_non_gaps, since both
811
        return 0 if one sequence is empty.
812
        """
813
        if not self or not other:
6✔
814
            return 0.0
6✔
815

816
        is_gap = self.moltype.gaps.__contains__
6✔
817
        count = 0
6✔
818
        diffs = 0
6✔
819
        for i, j in zip(self, other, strict=False):
6✔
820
            if is_gap(i) or is_gap(j):
6✔
821
                continue
6✔
822
            count += 1
6✔
823
            if i != j:
6✔
824
                diffs += 1
6✔
825

826
        if count:
6✔
827
            return diffs / count
6✔
828
        # there were no positions that weren't gaps
829
        return 0
6✔
830

831
    def frac_similar(
6✔
832
        self,
833
        other: Self,
834
        similar_pairs: dict[tuple[str, str], Any],
835
    ) -> float:
836
        """Returns fraction of positions where self[i] is similar to other[i].
837

838
        similar_pairs must be a dict such that d[(i,j)] exists if i and j are
839
        to be counted as similar. Use PairsFromGroups in cogent3.util.misc to
840
        construct such a dict from a list of lists of similar residues.
841

842
        Truncates at the length of the shorter sequence.
843

844
        Note: current implementation re-creates the distance function each
845
        time, so may be expensive compared to creating the distance function
846
        using for_seq separately.
847

848
        Returns 0 if one sequence is empty.
849
        """
850
        if not self or not other:
6✔
851
            return 0.0
6✔
852

853
        def is_in_similar_pairs(x: str, y: str) -> bool:
6✔
854
            return (x, y) in similar_pairs
6✔
855

856
        return for_seq(f=is_in_similar_pairs, normalizer=per_shortest)(
6✔
857
            self,
858
            other,
859
        )
860

861
    def with_termini_unknown(self) -> Self:
6✔
862
        """Returns copy of sequence with terminal gaps remapped as missing."""
863
        gaps = self.gap_vector()
6✔
864
        first_nongap = last_nongap = None
6✔
865
        for i, state in enumerate(gaps):
6✔
866
            if not state:
6✔
867
                if first_nongap is None:
6✔
868
                    first_nongap = i
6✔
869
                last_nongap = i
6✔
870
        missing = cast("str", self.moltype.missing)
6✔
871
        if first_nongap is None:
6✔
872
            return self.__class__(
6✔
873
                moltype=self.moltype,
874
                seq=missing * len(self),
875
                info=self.info,
876
            )
877
        last_nongap = cast("int", last_nongap)
6✔
878

879
        prefix = missing * first_nongap
6✔
880
        mid = str(self)[first_nongap : last_nongap + 1]
6✔
881
        suffix = missing * (len(self) - last_nongap - 1)
6✔
882
        return self.__class__(
6✔
883
            moltype=self.moltype,
884
            seq=prefix + mid + suffix,
885
            info=self.info,
886
        )
887

888
    def _repr_html_(self) -> str:
6✔
889
        settings = self._repr_policy.copy()
6✔
890
        env_vals = get_setting_from_environ(
6✔
891
            "COGENT3_ALIGNMENT_REPR_POLICY",
892
            {"num_pos": int},
893
        )
894
        settings.update(env_vals)
6✔
895
        return self.to_html(limit=settings["num_pos"])
6✔
896

897
    def to_html(
6✔
898
        self,
899
        wrap: int = 60,
900
        limit: int | None = None,
901
        colors: Mapping[str, str] | None = None,
902
        font_size: int = 12,
903
        font_family: str = "Lucida Console",
904
    ) -> str:
905
        """returns html with embedded styles for sequence colouring
906

907
        Parameters
908
        ----------
909
        wrap
910
            maximum number of printed bases, defaults to
911
            alignment length
912
        limit
913
            truncate alignment to this length
914
        colors
915
            dict of {char: color} to use for coloring
916
        font_size
917
            in points. Affects labels and sequence and line spacing
918
            (proportional to value)
919
        font_family
920
            string denoting font family
921

922
        To display in jupyter notebook:
923

924
            >>> from IPython.core.display import HTML
925
            >>> HTML(aln.to_html())
926
        """
927
        css, styles = self.moltype.get_css_style(
6✔
928
            colors=colors,
929
            font_size=font_size,
930
            font_family=font_family,
931
        )
932

933
        seq = str(self)
6✔
934
        seq = seq if limit is None else seq[:limit]
6✔
935
        seqlen = len(seq)
6✔
936
        if gaps := self.moltype.gaps:
6✔
937
            non_hyphen = "".join(gaps - {"-"})
6✔
938
            # make sure hyphen at end of negated character group
939
            # so it's not interpreted as a character range
940
            chars = f"{non_hyphen}-" if "-" in gaps else non_hyphen
6✔
941
            # looking for gaps at the the start of the seq
942
            start_gap = re.search(f"^[{chars}]+", "".join(seq))
6✔
943
            # looking for gaps at the end of the seq
944
            end_gap = re.search(f"[{gaps}]+$", "".join(seq))
6✔
945

946
            start = 0 if start_gap is None else start_gap.end()
6✔
947
            end = len(seq) if end_gap is None else end_gap.start()
6✔
948
        else:
949
            start = 0
6✔
950
            end = len(seq)
6✔
951

952
        seq_style: list[str] = []
6✔
953
        template = '<span class="%s">%%s</span>'
6✔
954
        styled_seq: list[str] = []
6✔
955
        for i in range(seqlen):
6✔
956
            char = seq[i]
6✔
957
            if i < start or i >= end:
6✔
958
                style = f"terminal_ambig_{self.moltype.label}"
6✔
959
            else:
960
                style = styles[char]
6✔
961

962
            seq_style.append(template % style)
6✔
963
            styled_seq.append(seq_style[-1] % char)
6✔
964

965
        # make a html table
966
        seq_array = numpy.array(styled_seq, dtype="O")
6✔
967
        table = ["<table>"]
6✔
968
        seq_ = "<td>%s</td>"
6✔
969
        label_ = '<td class="label">%s</td>'
6✔
970
        num_row_ = '<tr class="num_row"><td></td><td><b>{:,d}</b></td></tr>'
6✔
971
        for i in range(0, seqlen, wrap):
6✔
972
            table.append(num_row_.format(i))
6✔
973
            seqblock = seq_array[i : i + wrap].tolist()
6✔
974
            seqblock = "".join(seqblock)
6✔
975
            row = "".join([label_ % self.name, seq_ % seqblock])
6✔
976
            table.append(f"<tr>{row}</tr>")
6✔
977
        table.append("</table>")
6✔
978
        class_name = self.__class__.__name__
6✔
979
        if limit and limit < len(self):
6✔
980
            summary = f"{class_name}, length={len(self):,} (truncated to {limit if limit else len(self)})"
6✔
981
        else:
982
            summary = f"{class_name}, length={len(self):,}"
6✔
983

984
        text = [
6✔
985
            "<style>",
986
            ".c3seq table {margin: 10px 0;}",
987
            ".c3seq td { border: none !important; text-align: left !important; }",
988
            ".c3seq tr:not(.num_row) td span {margin: 0 2px;}",
989
            ".c3seq tr:nth-child(even) {background: #f7f7f7;}",
990
            ".c3seq .num_row {background-color:rgba(161, 195, 209, 0.5) !important; border-top: solid 1px black; }",
991
            f".c3seq .label {{ font-size: {font_size}pt ; text-align: right !important; "
992
            "color: black !important; padding: 0 4px; }}",
993
            "\n".join([f".c3seq {style}" for style in css]),
994
            "</style>",
995
            '<div class="c3seq">',
996
            "\n".join(table),
997
            f"<p><i>{summary}</i></p>",
998
            "</div>",
999
        ]
1000
        return "\n".join(text)
6✔
1001

1002
    def __add__(self, other: Self) -> Self:
6✔
1003
        """Adds two sequences (other can be a string as well)."""
1004
        if hasattr(other, "moltype") and self.moltype != other.moltype:
6✔
1005
            msg = f"MolTypes don't match: ({self.moltype},{other.moltype})"
×
1006
            raise ValueError(
×
1007
                msg,
1008
            )
1009
        other_seq = str(other)
6✔
1010

1011
        if not self.moltype.most_degen_alphabet().is_valid(str(other)):
6✔
1012
            msg = (
6✔
1013
                f"Invalid sequence characters in other for moltype={self.moltype.label}"
1014
            )
1015
            raise c3_alphabet.AlphabetError(
6✔
1016
                msg,
1017
            )
1018

1019
        # If two sequences with the same name are being added together the name should not be None
1020
        if type(other) is type(self):
6✔
1021
            name = self.name if self.name == other.name else None
6✔
1022
        else:
1023
            name = None
6✔
1024

1025
        return self.__class__(
6✔
1026
            moltype=self.moltype,
1027
            seq=str(self) + other_seq,
1028
            name=name,
1029
        )
1030

1031
    @property
6✔
1032
    def annotation_offset(self) -> int:
6✔
1033
        """
1034
        The offset between annotation coordinates and sequence coordinates.
1035

1036
        The offset can be used to adjust annotation coordinates to match the position
1037
        of the given Sequence within a larger genomic context. For example, if the
1038
        Annotations are with respect to a chromosome, and the sequence represents
1039
        a gene that is 100 bp from the start of a chromosome, the offset can be set to
1040
        100 to ensure that the gene's annotations are aligned with the appropriate
1041
        genomic positions.
1042

1043

1044
        Returns:
1045
            int: The offset between annotation coordinates and sequence coordinates.
1046
        """
1047
        parent_offset = self._seq.parent_offset
6✔
1048
        slice_start = self._seq.slice_record.parent_start
6✔
1049
        return parent_offset + slice_start
6✔
1050

1051
    def get_features(
6✔
1052
        self,
1053
        *,
1054
        biotype: str | None = None,
1055
        name: str | None = None,
1056
        start: int | None = None,
1057
        stop: int | None = None,
1058
        allow_partial: bool = False,
1059
        **kwargs: Any,
1060
    ) -> Iterator[Feature[Sequence]]:
1061
        """yields Feature instances
1062

1063
        Parameters
1064
        ----------
1065
        biotype
1066
            biotype of the feature
1067
        name
1068
            name of the feature
1069
        start, stop
1070
            start, stop positions to search between, relative to offset
1071
            of this sequence. If not provided, entire span of sequence is used.
1072
        kwargs
1073
            keyword arguments passed to annotation_db.get_features_matching()
1074

1075
        Notes
1076
        -----
1077
        When dealing with a nucleic acid moltype, the returned features will
1078
        yield a sequence segment that is consistently oriented irrespective
1079
        of strand of the current instance.
1080
        """
1081

1082
        if self._annotation_db is None:
6✔
1083
            return None
×
1084

1085
        start = start or 0
6✔
1086
        stop = stop or len(self)
6✔
1087

1088
        # handle negative start / stop
1089
        start = start + len(self) if start < 0 else start
6✔
1090
        stop = stop + len(self) if stop < 0 else stop
6✔
1091

1092
        start, stop = (start, stop) if start < stop else (stop, start)
6✔
1093
        stop = min(
6✔
1094
            len(self), stop
1095
        )  # otherwise index error from absolute_position method
1096

1097
        # we set include_boundary=False because start is inclusive indexing,
1098
        # i,e., the start cannot be equal to the length of the view
1099
        (
6✔
1100
            # parent_id can differ from self.name if there was a
1101
            # rename operation
1102
            parent_id,
1103
            *_,
1104
            _,
1105
        ) = self.parent_coordinates()
1106
        parent_offset = self._seq.parent_offset
6✔
1107
        sr = self._seq.slice_record
6✔
1108
        query_start = (
6✔
1109
            sr.absolute_position(
1110
                start,
1111
                include_boundary=False,
1112
            )
1113
            + parent_offset
1114
        )
1115
        # we set include_boundary=True because stop is exclusive indexing,
1116
        # i,e., the stop can be equal to the length of the view
1117
        query_stop = (
6✔
1118
            sr.absolute_position(
1119
                stop,
1120
                include_boundary=True,
1121
            )
1122
            + parent_offset
1123
        )
1124

1125
        query_start, query_stop = (
6✔
1126
            min(query_stop, query_start),
1127
            max(query_stop, query_start),
1128
        )
1129
        query_start = max(query_start, 0)
6✔
1130
        # in the underlying db, features are always plus strand oriented
1131
        # (need to check that for user defined features)
1132
        # a '-' strand feature in the db may be [(2, 4), (6, 8)]
1133
        # so we would take 7-6, 3-2 and complement it
1134
        # if the current sequence is reverse complemented
1135
        # then a '+' feature from the db needs to be nucleic reversed
1136
        # and a '-' feature from the db needs to be nucleic reversed too
1137

1138
        # making this more complicated is self.make_feature() assumes
1139
        # all coordinates are with respect to the current orientation
1140
        # so self.make_feature(data(spans=[(2,4)])) has different meaning if
1141
        # self is rc'ed, it would correspond to len(self)-2, etc...
1142
        # To piggy-back on that method we need to convert our feature spans
1143
        # into the current orientation. HOWEVER, we also have the reversed
1144
        # flag which comes back from the db
1145
        kwargs |= {"allow_partial": allow_partial}
6✔
1146
        for feature in self.annotation_db.get_features_matching(
6✔
1147
            seqid=parent_id,
1148
            name=name,
1149
            biotype=biotype,
1150
            start=query_start,
1151
            stop=query_stop,
1152
            **kwargs,
1153
        ):
1154
            # spans need to be converted from absolute to relative positions
1155
            # DO NOT do adjustment in make_feature since that's user facing,
1156
            # and we expect them to make a feature manually wrt to their
1157
            # current view
1158
            spans = numpy.array(feature["spans"], dtype=int)
6✔
1159
            for i, v in enumerate(spans.ravel()):
6✔
1160
                rel_pos = sr.relative_position(v) - parent_offset
6✔
1161
                spans.ravel()[i] = rel_pos
6✔
1162

1163
            if sr.is_reversed:
6✔
1164
                # see above comment
1165
                spans = len(self) - spans
6✔
1166

1167
            feature["spans"] = spans.tolist()
6✔
1168
            yield self.make_feature(feature)
6✔
1169

1170
    def make_feature(self, feature: FeatureDataType, *args: Any) -> Feature[Sequence]:
6✔
1171
        """
1172
        return an Feature instance from feature data
1173

1174
        Parameters
1175
        ----------
1176
        feature
1177
            dict of key data to make an Feature instance
1178

1179
        Notes
1180
        -----
1181
        Unlike add_feature(), this method does not add the feature to the
1182
        database.
1183
        We assume that spans represent the coordinates for this instance!
1184
        """
1185
        feature_dict: dict[str, Any] = dict(feature)
6✔
1186
        seq_rced = self._seq.is_reversed
6✔
1187
        spans = feature_dict.pop("spans", None)
6✔
1188
        revd = Strand.from_value(feature_dict.pop("strand", None)) is Strand.MINUS
6✔
1189
        feature_dict["strand"] = (
6✔
1190
            Strand.PLUS.value if revd == seq_rced else Strand.MINUS.value
1191
        )
1192

1193
        vals = numpy.array(spans)
6✔
1194
        pre = abs(vals.min()) if vals.min() < 0 else 0
6✔
1195
        post = abs(vals.max() - len(self)) if vals.max() > len(self) else 0
6✔
1196

1197
        # we find the spans > len(self)
1198
        new_spans: list[tuple[int, int]] = []
6✔
1199
        for coord in vals:
6✔
1200
            new = coord[:]
6✔
1201
            if coord.min() < 0 < coord.max():
6✔
1202
                new = coord[:]
6✔
1203
                new[new < 0] = 0
6✔
1204
            elif coord.min() < len(self) < coord.max():
6✔
1205
                new[new > len(self)] = len(self)
6✔
1206
            elif coord[0] == coord[1] or coord.min() > len(self) or coord.max() < 0:
6✔
1207
                # would really to adjust pre and post to be up to the next span
1208
                continue
6✔
1209
            new_spans.append(new.tolist())
6✔
1210

1211
        fmap = FeatureMap.from_locations(locations=new_spans, parent_length=len(self))
6✔
1212
        if pre or post:
6✔
1213
            # create a lost span to represent the segment missing from
1214
            # the instance
1215
            spans = list(fmap.iter_spans())
6✔
1216
            if pre:
6✔
1217
                spans.insert(0, LostSpan(pre))
6✔
1218
            if post:
6✔
1219
                spans.append(LostSpan(post))
6✔
1220
            fmap = FeatureMap(spans=spans, parent_length=len(self))
6✔
1221

1222
        if seq_rced:
6✔
1223
            fmap = fmap.nucleic_reversed()
6✔
1224

1225
        feature_dict.pop("on_alignment", None)
6✔
1226
        feature_dict.pop("seqid", None)
6✔
1227
        return Feature(
6✔
1228
            parent=self, seqid=cast("str", self.name), map=fmap, **feature_dict
1229
        )
1230

1231
    def add_feature(
6✔
1232
        self,
1233
        *,
1234
        biotype: str,
1235
        name: str,
1236
        spans: list[tuple[int, int]],
1237
        parent_id: str | None = None,
1238
        strand: str | None = None,
1239
        on_alignment: bool = False,
1240
        seqid: str | None = None,
1241
    ) -> Feature[Sequence]:
1242
        """
1243
        add a feature to annotation_db
1244

1245
        Parameters
1246
        ----------
1247
        biotype
1248
            biological type
1249
        name
1250
            name of the feature
1251
        spans
1252
            coordinates for this sequence
1253
        parent_id
1254
            name of the feature parent
1255
        strand
1256
            '+' or '-', defaults to '+'
1257
        on_alignment
1258
            whether the feature spans are alignment coordinates
1259
        seqid
1260
            ignored since the feature is added to this sequence
1261

1262
        Returns
1263
        -------
1264
        Feature instance
1265
        """
1266
        local_vars = locals()
6✔
1267
        local_vars["strand"] = Strand.from_value(strand).value
6✔
1268

1269
        feature_data = FeatureDataType(
6✔
1270
            seqid=cast("str", self.name),
1271
            **{n: v for n, v in local_vars.items() if n not in ("self", "seqid")},  # type: ignore[typeddict-item]
1272
        )
1273

1274
        self.annotation_db.add_feature(**cast("dict[str, Any]", feature_data))
6✔
1275
        for discard in ("on_alignment", "parent_id"):
6✔
1276
            feature_data.pop(discard)
6✔
1277
        return self.make_feature(feature_data)
6✔
1278

1279
    def to_moltype(
6✔
1280
        self, moltype: c3_moltype.MolTypeLiteral | c3_moltype.MolType[Any]
1281
    ) -> Sequence:
1282
        """returns copy of self with moltype seq
1283

1284
        Parameters
1285
        ----------
1286
        moltype
1287
            molecular type
1288

1289
        Notes
1290
        -----
1291
        This method cannot convert between nucleic acids and proteins. Use
1292
        get_translation() for that.
1293

1294
        When applied to a sequence in a SequenceCollection, the resulting
1295
        sequence will no longer be part of the collection.
1296
        """
1297

1298
        if not moltype:
6✔
1299
            msg = f"unknown moltype '{moltype}'"
6✔
1300
            raise ValueError(msg)
6✔
1301

1302
        moltype = c3_moltype.get_moltype(moltype)
6✔
1303

1304
        if moltype is self.moltype:
6✔
1305
            return self
6✔
1306

1307
        seq = numpy.array(self)
6✔
1308
        self_alpha = self.moltype.most_degen_alphabet()
6✔
1309
        other_alpha = moltype.most_degen_alphabet()
6✔
1310
        if len(self.moltype.alphabet) != len(moltype.alphabet):
6✔
1311
            # use alphabet converter
1312
            seq = self_alpha.convert_seq_array_to(
6✔
1313
                seq=seq,
1314
                alphabet=other_alpha,
1315
                check_valid=False,
1316
            )
1317

1318
        if not other_alpha.is_valid(seq):
6✔
1319
            msg = (
6✔
1320
                f"Changing from old moltype={self.moltype.label!r} to new "
1321
                f"moltype={moltype.label!r} is not valid for this data"
1322
            )
1323
            raise c3_moltype.MolTypeError(
6✔
1324
                msg,
1325
            )
1326
        sv = SeqView(
6✔
1327
            parent=seq,
1328
            parent_len=len(seq),
1329
            alphabet=moltype.most_degen_alphabet(),
1330
        )
1331
        return moltype.make_sequence(
6✔
1332
            seq=sv,
1333
            name=self.name,
1334
            check_seq=False,
1335
            info=self.info,
1336
            annotation_db=self.annotation_db,
1337
        )
1338

1339
    def copy_annotations(self, seq_db: SqliteAnnotationDbMixin) -> None:
6✔
1340
        """copy annotations into attached annotation db
1341

1342
        Parameters
1343
        ----------
1344
        seq_db
1345
            compatible annotation db
1346

1347
        Notes
1348
        -----
1349
        Only copies annotations for records with seqid equal to self.name
1350
        """
1351
        if not isinstance(seq_db, SupportsFeatures):
6✔
1352
            msg = f"type {type(seq_db)} does not match SupportsFeatures interface"
×
1353
            raise TypeError(
×
1354
                msg,
1355
            )
1356

1357
        if not seq_db.num_matches(seqid=self.name):
6✔
1358
            return
×
1359

1360
        if self._annotation_db and not self.annotation_db.compatible(seq_db):
6✔
1361
            msg = f"type {type(seq_db)} != {type(self.annotation_db)}"
×
1362
            raise TypeError(msg)
×
1363

1364
        self.annotation_db.update(seq_db, seqids=self.name)
6✔
1365

1366
    def copy(
6✔
1367
        self,
1368
        exclude_annotations: bool = False,
1369
        sliced: bool = True,
1370
    ) -> Self:
1371
        """returns a copy of self
1372

1373
        Parameters
1374
        ----------
1375
        sliced
1376
            Slices underlying sequence with start/end of self coordinates. The offset
1377
            is retained.
1378
        exclude_annotations
1379
            drops annotation_db when True
1380
        """
1381
        # when slicing a seqview with sliced=True, seqview discards the
1382
        # offset attribute. Sequence then needs to be provided to its
1383
        # constructor from its current state.
1384
        offset = self.annotation_offset if sliced else 0
6✔
1385
        data = self._seq.copy(sliced=sliced)
6✔
1386
        return self.__class__(
6✔
1387
            moltype=self.moltype,
1388
            seq=data,
1389
            name=self.name,
1390
            info=self.info,
1391
            annotation_offset=offset,
1392
            annotation_db=None if exclude_annotations else self.annotation_db,
1393
        )
1394

1395
    def with_masked_annotations(
6✔
1396
        self,
1397
        biotypes: str | Iterable[str],
1398
        mask_char: str | None = None,
1399
        shadow: bool = False,
1400
    ) -> Self:
1401
        """returns a sequence with annot_types regions replaced by mask_char
1402
        if shadow is False, otherwise all other regions are masked.
1403

1404
        Parameters
1405
        ----------
1406
        annot_types
1407
            annotation type(s)
1408
        mask_char
1409
            must be a character valid for the seq MolType. The
1410
            default value is the most ambiguous character, eg. '?' for DNA
1411
        shadow
1412
            whether to mask the annotated regions, or everything but
1413
            the annotated regions
1414
        """
1415
        if mask_char is None:
6✔
1416
            assert self.moltype.ambiguities is not None
6✔
1417
            mask_char = (
6✔
1418
                self.moltype.missing
1419
                or max(self.moltype.ambiguities.items(), key=lambda x: len(x[1]))[0]
1420
            )
1421
        assert mask_char in self.moltype.most_degen_alphabet(), (
6✔
1422
            f"Invalid mask_char {mask_char}"
1423
        )
1424

1425
        annotations: list[Feature[Sequence]] = []
6✔
1426
        biotypes = [biotypes] if isinstance(biotypes, str) else biotypes
6✔
1427
        for annot_type in biotypes:
6✔
1428
            annotations += list(
6✔
1429
                self.get_features(biotype=annot_type, allow_partial=True),
1430
            )
1431
        if not annotations:
6✔
1432
            return self
6✔
1433

1434
        region = annotations[0].union(annotations[1:])
6✔
1435

1436
        if shadow:
6✔
1437
            region = region.shadow()
6✔
1438

1439
        i = 0
6✔
1440
        segments: list[str] = []
6✔
1441
        coords = region.map.get_coordinates()
6✔
1442
        for b, e in coords:
6✔
1443
            segments.extend((str(self[i:b]), mask_char * (e - b)))
6✔
1444
            i = e
6✔
1445
        segments.append(str(self[i:]))
6✔
1446

1447
        new = self.__class__(
6✔
1448
            moltype=self.moltype,
1449
            seq="".join(segments),
1450
            name=self.name,
1451
            info=self.info,
1452
        )
1453
        new.annotation_db = self.annotation_db
6✔
1454
        return new
6✔
1455

1456
    def gapped_by_map_segment_iter(
6✔
1457
        self,
1458
        segment_map: IndelMap | FeatureMap,
1459
        allow_gaps: bool = True,
1460
        recode_gaps: bool = False,
1461
    ) -> Iterator[str]:
1462
        if not allow_gaps and not segment_map.complete:
6✔
1463
            msg = f"gap(s) in map {segment_map}"
6✔
1464
            raise ValueError(msg)
6✔
1465

1466
        for span in segment_map.iter_spans():
6✔
1467
            if span.lost:
6✔
1468
                span = cast("_LostSpan", span)
6✔
1469
                unknown = "?" if span.terminal or recode_gaps else "-"
6✔
1470
                seg = unknown * span.length
6✔
1471
            else:
1472
                span = cast("Span", span)
6✔
1473
                seg = str(self[span.start : span.end])
6✔
1474

1475
            yield seg
6✔
1476

1477
    def gapped_by_map_motif_iter(
6✔
1478
        self,
1479
        segment_map: IndelMap,
1480
    ) -> Iterator[str]:
1481
        for segment in self.gapped_by_map_segment_iter(segment_map):
×
1482
            yield from segment
×
1483

1484
    def gapped_by_map(
6✔
1485
        self,
1486
        segment_map: IndelMap,
1487
        recode_gaps: bool = False,
1488
    ) -> Self:
1489
        segments = self.gapped_by_map_segment_iter(segment_map, True, recode_gaps)
6✔
1490
        return self.__class__(
6✔
1491
            moltype=self.moltype,
1492
            seq="".join(segments),
1493
            name=self.name,
1494
            info=self.info,
1495
        )
1496

1497
    def _mapped(self, segment_map: FeatureMap) -> Self:
6✔
1498
        # Called by generic __getitem__
1499
        seq: SeqViewABC | str
1500
        if segment_map.num_spans == 1:
6✔
1501
            seq = self._seq[segment_map.start : segment_map.end]
6✔
1502
        else:
1503
            segments = self.gapped_by_map_segment_iter(segment_map, allow_gaps=False)
6✔
1504
            seq = "".join(segments)
6✔
1505

1506
        return self.__class__(
6✔
1507
            moltype=self.moltype,
1508
            seq=seq,
1509
            name=self.name,
1510
            info=self.info,
1511
        )
1512

1513
    def __repr__(self) -> str:
6✔
1514
        myclass = f"{self.__class__.__name__}"
6✔
1515
        myclass = myclass.split(".")[-1]
6✔
1516
        seq = f"{str(self)[:7]}... {len(self):,}" if len(self) > 10 else str(self)
6✔
1517
        return f"{myclass}({seq})"
6✔
1518

1519
    def __getitem__(
6✔
1520
        self, index: Feature[Sequence] | FeatureMap | slice | SupportsIndex
1521
    ) -> Self:
1522
        preserve_offset = False
6✔
1523
        if isinstance(index, Feature):
6✔
1524
            if index.parent is not self:
6✔
1525
                msg = "cannot slice Feature not bound to self"
×
1526
                raise ValueError(msg)
×
1527
            return cast("Self", index.get_slice())
6✔
1528

1529
        if isinstance(index, FeatureMap):
6✔
1530
            new = self._mapped(index)
6✔
1531
            # annotations have no meaning if disjoint slicing segments
1532
            preserve_offset = index.num_spans == 1
6✔
1533
        elif isinstance(index, slice) or is_int(index):
6✔
1534
            new = self.__class__(
6✔
1535
                moltype=self.moltype,
1536
                seq=self._seq[index],
1537
                name=self.name,
1538
                info=self.info,
1539
            )
1540
            stride = getattr(index, "step", 1) or 1
6✔
1541
            preserve_offset = stride > 0
6✔
1542
        else:
1543
            msg = f"Cannot slice using a {type(index)}"
6✔
1544
            raise TypeError(msg)
6✔
1545

1546
        if isinstance(index, list | tuple):
6✔
1547
            msg = "cannot slice using list or tuple"
×
1548
            raise TypeError(msg)
×
1549

1550
        if self._annotation_db and preserve_offset:
6✔
1551
            new.replace_annotation_db(self.annotation_db, check=False)
6✔
1552

1553
        if is_float(index):
6✔
1554
            msg = "cannot slice using float"
×
1555
            raise TypeError(msg)
×
1556

1557
        if hasattr(self, "_repr_policy"):
6✔
1558
            new._repr_policy.update(self._repr_policy)
6✔
1559

1560
        return new
6✔
1561

1562
    def __iter__(self) -> Iterator[str]:
6✔
1563
        yield from iter(str(self))
6✔
1564

1565
    def get_name(self) -> str | None:
6✔
1566
        """Return the sequence name -- should just use name instead."""
1567
        return self.name
6✔
1568

1569
    def __len__(self) -> int:
6✔
1570
        return len(self._seq)
6✔
1571

1572
    def get_type(self) -> str:
6✔
1573
        """Return the sequence type as moltype label."""
1574
        return self.moltype.label
6✔
1575

1576
    def resolved_ambiguities(self) -> list[set[str]]:
6✔
1577
        """Returns a list of sets of strings."""
1578
        ambigs = cast("dict[str, frozenset[str]]", self.moltype.ambiguities)
6✔
1579
        return [set(ambigs.get(motif, motif)) for motif in str(self)]
6✔
1580

1581
    def iter_kmers(self, k: int, strict: bool = True) -> Iterator[str]:
6✔
1582
        """generates all overlapping k-mers.
1583
        When strict is True, the characters in the k-mer must be
1584
        a subset of the canonical characters for the moltype"""
1585
        if k <= 0:
6✔
1586
            msg = f"k must be an int > 0, not {k}"
6✔
1587
            raise ValueError(msg)
6✔
1588

1589
        if not isinstance(k, int):
6✔
1590
            msg = f"k must be an int, not {k}"
6✔
1591
            raise ValueError(msg)
6✔
1592

1593
        md = self.moltype.most_degen_alphabet()
6✔
1594
        gap_index = md.gap_index
6✔
1595
        arr_to_str = md.from_indices
6✔
1596
        if gap_index is None:
6✔
1597
            gap_index = len(self.moltype)
6✔
1598
        seq = numpy.array(self)
6✔
1599
        for i in range(len(seq) - k + 1):
6✔
1600
            kmer = seq[i : i + k]
6✔
1601
            if not strict or kmer.max() < gap_index:
6✔
1602
                yield arr_to_str(kmer)
6✔
1603

1604
    def get_kmers(self, k: int, strict: bool = True) -> list[str]:
6✔
1605
        """return all overlapping k-mers"""
1606
        return list(self.iter_kmers(k, strict))
6✔
1607

1608
    def sliding_windows(
6✔
1609
        self,
1610
        window: int,
1611
        step: int,
1612
        start: int | None = None,
1613
        end: int | None = None,
1614
    ) -> Iterator[Self]:
1615
        """Generator function that yield new sequence objects
1616
        of a given length at a given interval.
1617

1618
        Parameters
1619
        ----------
1620
        window
1621
            The length of the returned sequence
1622
        step
1623
            The interval between the start of the returned
1624
            sequence objects
1625
        start
1626
            first window start position
1627
        end
1628
            last window start position
1629

1630
        """
1631
        if start is None:
×
1632
            start = 0
×
1633
        if end is None:
×
1634
            end = len(self) - window + 1
×
1635
        end = min(len(self) - window + 1, end)
×
1636
        if start < end and len(self) - end >= window - 1:
×
1637
            for pos in range(start, end, step):
×
1638
                yield self[pos : pos + window]
×
1639

1640
    def get_in_motif_size(
6✔
1641
        self, motif_length: int = 1, warn: bool = False
1642
    ) -> list[str] | str:
1643
        """returns sequence as list of non-overlapping motifs
1644

1645
        Parameters
1646
        ----------
1647
        motif_length
1648
            length of the motifs
1649
        warn
1650
            whether to notify of an incomplete terminal motif
1651
        """
1652
        seq: SeqViewABC | str = self._seq
6✔
1653
        if isinstance(seq, SeqViewABC):
6✔
1654
            seq = str(self)
6✔
1655
        if motif_length == 1:
6✔
1656
            return seq
6✔
1657

1658
        length = len(seq)
6✔
1659
        remainder = length % motif_length
6✔
1660
        if remainder and warn:
6✔
1661
            warnings.warn(
×
1662
                f'Dropped remainder "{seq[-remainder:]}" from end of sequence',
1663
                stacklevel=2,
1664
            )
1665
        return [
6✔
1666
            seq[i : i + motif_length]
1667
            for i in range(0, length - remainder, motif_length)
1668
        ]
1669

1670
    def parse_out_gaps(self) -> tuple[IndelMap, Self]:
6✔
1671
        """returns Map corresponding to gap locations and ungapped Sequence"""
1672
        gap = re.compile(f"[{re.escape(cast('str', self.moltype.gap))}]+")
6✔
1673
        seq_str = str(self)
6✔
1674
        gap_pos_list: list[int] = []
6✔
1675
        cum_lengths_list: list[int] = []
6✔
1676
        for match in gap.finditer(seq_str):
6✔
1677
            pos = match.start()
6✔
1678
            gap_pos_list.append(pos)
6✔
1679
            cum_lengths_list.append(match.end() - pos)
6✔
1680

1681
        gap_pos = numpy.array(gap_pos_list)
6✔
1682
        cum_lengths = numpy.array(cum_lengths_list).cumsum()
6✔
1683
        gap_pos[1:] = gap_pos[1:] - cum_lengths[:-1]
6✔
1684

1685
        seq = self.__class__(
6✔
1686
            moltype=self.moltype,
1687
            seq=gap.sub("", seq_str),
1688
            name=self.get_name(),
1689
            info=self.info,
1690
        )
1691
        indel_map = IndelMap(
6✔
1692
            gap_pos=gap_pos,
1693
            cum_gap_lengths=cum_lengths,
1694
            parent_length=len(seq),
1695
        )
1696
        seq.annotation_db = self.annotation_db
6✔
1697
        return indel_map, seq
6✔
1698

1699
    def is_annotated(
6✔
1700
        self,
1701
        biotype: str | tuple[str] | None = None,
1702
    ) -> bool:
1703
        """returns True if sequence parent name has any annotations
1704

1705
        Parameters
1706
        ----------
1707
        biotype
1708
            amend condition to return True only if the sequence is
1709
            annotated with one of provided biotypes.
1710
        """
1711
        if not self._annotation_db:
6✔
1712
            return False
6✔
1713
        with contextlib.suppress(AttributeError):
6✔
1714
            return (
6✔
1715
                self.annotation_db.num_matches(seqid=self._seq.seqid, biotype=biotype)
1716
                != 0
1717
            )
1718
        return False
×
1719

1720
    def annotate_matches_to(
6✔
1721
        self,
1722
        pattern: str,
1723
        biotype: str,
1724
        name: str,
1725
        allow_multiple: bool = False,
1726
    ) -> list[Feature[Sequence]]:
1727
        """Adds an annotation at sequence positions matching pattern.
1728

1729
        Parameters
1730
        ----------
1731
        pattern
1732
            The search string for which annotations are made. IUPAC ambiguities
1733
            are converted to regex on sequences with the appropriate MolType.
1734
        biotype
1735
            The type of the annotation (e.g. "domain").
1736
        name
1737
            The name of the annotation.
1738
        allow_multiple
1739
            If True, allows multiple occurrences of the input pattern. Otherwise,
1740
            only the first match is used.
1741

1742
        Returns
1743
        -------
1744
        Returns a list of Feature instances.
1745
        """
1746
        with contextlib.suppress(ValueError):
6✔
1747
            # assume ValueError is due to already being a regex
1748
            pattern = self.moltype.to_regex(seq=pattern)
6✔
1749

1750
        pos = [m.span() for m in re.finditer(pattern, str(self))]
6✔
1751
        if not pos:
6✔
1752
            return []
6✔
1753

1754
        num_match = len(pos) if allow_multiple else 1
6✔
1755
        return [
6✔
1756
            self.add_feature(
1757
                biotype=biotype,
1758
                name=f"{name}:{i}" if allow_multiple else name,
1759
                spans=[pos[i]],
1760
            )
1761
            for i in range(num_match)
1762
        ]
1763

1764
    def get_drawables(
6✔
1765
        self,
1766
        *,
1767
        biotype: str | Iterable[str] | None = None,
1768
    ) -> dict[str, list[Shape]]:
1769
        """returns a dict of drawables, keyed by type
1770

1771
        Parameters
1772
        ----------
1773
        biotype
1774
            passed to get_features(biotype). Can be a single biotype or
1775
            series. Only features matching this will be included.
1776
        """
1777
        # make sure the drawables are unique by adding to a set
1778
        features = set(self.get_features(biotype=biotype, allow_partial=True))
6✔
1779
        result: dict[str, list[Shape]] = defaultdict(list)
6✔
1780
        for f in features:
6✔
1781
            result[f.biotype].append(f.get_drawable())
6✔
1782
        return result
6✔
1783

1784
    def get_drawable(
6✔
1785
        self,
1786
        *,
1787
        biotype: str | Iterable[str] | None = None,
1788
        width: float = 600,
1789
        vertical: bool = False,
1790
    ) -> Drawable | None:
1791
        """make a figure from sequence features
1792

1793
        Parameters
1794
        ----------
1795
        biotype
1796
            passed to get_features(biotype). Can be a single biotype or
1797
            series. Only features matching this will be included.
1798
        width
1799
            width in pixels
1800
        vertical
1801
            rotates the drawable
1802

1803
        Returns
1804
        -------
1805
        a Drawable instance
1806

1807
        Notes
1808
        -----
1809
        If provided, the biotype is used for plot order.
1810
        """
1811
        from cogent3.draw.drawable import Drawable
6✔
1812

1813
        drawables = self.get_drawables(biotype=biotype)
6✔
1814
        if not drawables:
6✔
1815
            return None
6✔
1816

1817
        biotype = list(drawables) if biotype is None else biotype
6✔
1818
        biotypes = (biotype,) if isinstance(biotype, str) else biotype
6✔
1819

1820
        # we order by tracks
1821
        top: float = 0
6✔
1822
        space = 0.25
6✔
1823
        annotes: list[Shape] = []
6✔
1824
        annott = None
6✔
1825
        for feature_type in biotypes:
6✔
1826
            new_bottom = top + space
6✔
1827
            for i, annott in enumerate(drawables[feature_type]):
6✔
1828
                annott.shift(y=new_bottom - annott.bottom)
6✔
1829
                if i > 0:
6✔
1830
                    annott._showlegend = False
×
1831
                annotes.append(annott)
6✔
1832

1833
            top = cast("Shape", annott).top
6✔
1834

1835
        top += space
6✔
1836
        height = max((top / len(self)) * width, 300)
6✔
1837
        xaxis: dict[str, Any] = {
6✔
1838
            "range": [0, len(self)],
1839
            "zeroline": False,
1840
            "showline": True,
1841
        }
1842
        yaxis: dict[str, Any] = {
6✔
1843
            "range": [0, top],
1844
            "visible": False,
1845
            "zeroline": True,
1846
            "showline": True,
1847
        }
1848

1849
        if vertical:
6✔
1850
            all_traces = [t.T.as_trace() for t in annotes]
6✔
1851
            width, height = height, width
6✔
1852
            xaxis, yaxis = yaxis, xaxis
6✔
1853
        else:
1854
            all_traces = [t.as_trace() for t in annotes]
6✔
1855

1856
        drawer = Drawable(
6✔
1857
            title=self.name,
1858
            traces=all_traces,
1859
            width=width,
1860
            height=height,
1861
        )
1862
        drawer.layout.update(xaxis=xaxis, yaxis=yaxis)
6✔
1863
        return drawer
6✔
1864

1865
    def parent_coordinates(
6✔
1866
        self, apply_offset: bool = False, **kwargs: Any
1867
    ) -> tuple[str, int, int, int]:
1868
        """returns seqid, start, stop, strand of this sequence on its parent
1869

1870
        Parameters
1871
        ----------
1872
        apply_offset
1873
            if True, adds annotation offset from parent
1874

1875
        Notes
1876
        -----
1877
        seqid is the identifier of the parent. Returned coordinates are with
1878
        respect to the plus strand, irrespective of whether the sequence has
1879
        been reversed complemented or not.
1880

1881
        Returns
1882
        -------
1883
        seqid, start, end, strand of this sequence on the parent. strand is either
1884
        -1 or 1.
1885
        """
1886
        strand = Strand.MINUS.value if self._seq.is_reversed else Strand.PLUS.value
6✔
1887
        seqid, start, stop, _ = self._seq.parent_coords(apply_offset=apply_offset)
6✔
1888
        return seqid, start, stop, strand
6✔
1889

1890
    def sample(
6✔
1891
        self,
1892
        *,
1893
        n: int | None = None,
1894
        with_replacement: bool = False,
1895
        motif_length: int = 1,
1896
        randint: Callable[
1897
            [int, int | None, int | None], NumpyIntArrayType
1898
        ] = numpy.random.randint,
1899
        permutation: Callable[[int], NumpyIntArrayType] = numpy.random.permutation,
1900
    ) -> Self:
1901
        """Returns random sample of positions from self, e.g. to bootstrap.
1902

1903
        Parameters
1904
        ----------
1905
        n
1906
            number of positions to sample. If None, all positions are sampled.
1907
        with_replacement
1908
            if True, samples with replacement.
1909
        motif_length
1910
            number of positions to sample as a single motif. Starting point
1911
            of each sampled motif is modulo motif_length in the original sequence.
1912
        randint
1913
            random number generator, default is numpy.randint
1914
        permutation
1915
            function to generate a random permutation of positions, default is
1916
            numpy.permutation
1917

1918
        Notes
1919
        -----
1920
        By default (resampling all positions without replacement), generates
1921
        a permutation of the positions of the alignment.
1922
        """
1923
        population_size = len(self) // motif_length
6✔
1924
        if not with_replacement and n and n > population_size:
6✔
1925
            msg = f"cannot sample without replacement when {n=} > {population_size=}"
6✔
1926
            raise ValueError(msg)
6✔
1927

1928
        n = n or population_size
6✔
1929

1930
        if with_replacement:
6✔
1931
            locations = randint(0, population_size, n)
×
1932
        else:
1933
            locations = permutation(population_size)[:n]
6✔
1934

1935
        if motif_length == 1:
6✔
1936
            positions = locations
6✔
1937
        else:
1938
            positions = numpy.empty(n * motif_length, dtype=int)
6✔
1939
            for i, loc in enumerate(locations):
6✔
1940
                positions[i * motif_length : (i + 1) * motif_length] = range(
6✔
1941
                    loc * motif_length,
1942
                    (loc + 1) * motif_length,
1943
                )
1944

1945
        sampled = numpy.array(self).take(positions)
6✔
1946
        return cast(
6✔
1947
            "Self",
1948
            self.moltype.make_sequence(
1949
                seq=sampled, name=f"{self.name}-randomised", check_seq=False
1950
            ),
1951
        )
1952

1953

1954
class ProteinSequence(Sequence):
6✔
1955
    """Holds the standard Protein sequence."""
1956

1957
    # constructed by PROTEIN moltype
1958

1959

1960
class ByteSequence(Sequence):
6✔
1961
    """Holds the bytes sequence."""
1962

1963
    # constructed by BYTES moltype
1964

1965

1966
class ProteinWithStopSequence(Sequence):
6✔
1967
    """Holds the standard Protein sequence, allows for stop codon."""
1968

1969
    # constructed by PROTEIN_WITH_STOP moltype
1970

1971

1972
class NucleicAcidSequenceMixin(Sequence):
6✔
1973
    """Mixin class for DNA and RNA sequences."""
1974

1975
    def can_pair(self, other: Self) -> bool:
6✔
1976
        """Returns True if self and other could pair.
1977

1978
        other
1979
            sequence, must be able to be cast to string.
1980

1981
        Notes
1982
        -----
1983
        Pairing occurs in reverse order, i.e. last position of other with
1984
        first position of self, etc.
1985

1986
        Truncates at length of shorter sequence.
1987

1988
        Gaps are only allowed to pair with other gaps.
1989

1990
        Weak pairs (like GU) are considered as possible pairs.
1991
        """
1992
        return self.moltype.can_pair(str(self), str(other))
6✔
1993

1994
    def can_mispair(self, other: Self) -> bool:
6✔
1995
        """Returns True if any position in self could mispair with other.
1996

1997
        Notes
1998
        -----
1999
        Pairing occurs in reverse order, i.e. last position of other with
2000
        first position of self, etc.
2001

2002
        Truncates at length of shorter sequence.
2003

2004
        Gaps are always counted as possible mispairs, as are weak pairs like GU.
2005
        """
2006
        return self.moltype.can_mispair(str(self), str(other))
6✔
2007

2008
    def must_pair(self, other: Self) -> bool:
6✔
2009
        """Returns True if all positions in self must pair with other.
2010

2011
        Notes
2012
        -----
2013
        Pairing occurs in reverse order, i.e. last position of other with
2014
        first position of self, etc.
2015
        """
2016
        return not self.moltype.can_mispair(str(self), str(other))
6✔
2017

2018
    def complement(self) -> Self:
6✔
2019
        """Returns complement of self, using data from MolType.
2020

2021
        Always tries to return same type as item: if item looks like a dict,
2022
        will return list of keys.
2023
        """
2024
        return self.__class__(
6✔
2025
            moltype=self.moltype,
2026
            seq=self.moltype.complement(bytes(self)),
2027
            info=self.info,
2028
        )
2029

2030
    def reverse_complement(self) -> Self:
6✔
2031
        """Converts a nucleic acid sequence to its reverse complement.
2032
        Synonymn for rc."""
2033
        return self.rc()
×
2034

2035
    def rc(self) -> Self:
6✔
2036
        """Converts a nucleic acid sequence to its reverse complement."""
2037
        return self.__class__(
6✔
2038
            moltype=self.moltype,
2039
            seq=self._seq[::-1],
2040
            name=self.name,
2041
            info=self.info,
2042
            annotation_db=self.annotation_db,
2043
        )
2044

2045
    def has_terminal_stop(
6✔
2046
        self,
2047
        gc: c3_genetic_code.GeneticCode | int = 1,
2048
        strict: bool = False,
2049
    ) -> bool:
2050
        """Return True if the sequence has a terminal stop codon.
2051

2052
        Parameters
2053
        ----------
2054
        gc
2055
            valid input to c3_genetic_code.get_code(), a genetic code object, number
2056
            or name
2057
        strict
2058
            If True, raises an exception if length not divisible by 3
2059
        """
2060
        gc = c3_genetic_code.get_code(gc)
6✔
2061
        _, s = self.parse_out_gaps()
6✔
2062

2063
        divisible_by_3 = len(s) % 3 == 0
6✔
2064
        if divisible_by_3:
6✔
2065
            end3 = str(s[-3:])
6✔
2066
            return gc.is_stop(end3)
6✔
2067

2068
        if strict:
6✔
2069
            msg = f"{self.name!r} length not divisible by 3"
6✔
2070
            raise c3_alphabet.AlphabetError(msg)
6✔
2071

2072
        return False
6✔
2073

2074
    def trim_stop_codon(
6✔
2075
        self,
2076
        gc: c3_genetic_code.GeneticCode | int = 1,
2077
        strict: bool = False,
2078
    ) -> Self:
2079
        """Removes a terminal stop codon from the sequence
2080

2081
        Parameters
2082
        ----------
2083
        gc
2084
            valid input to c3_genetic_code.get_code(), a genetic code object, number
2085
            or name
2086
        strict
2087
            If True, raises an exception if length not divisible by 3
2088

2089
        Notes
2090
        -----
2091
        If sequence contains gap characters, the result preserves the sequence
2092
        length by adding gap characters at the end.
2093
        """
2094
        if not self.has_terminal_stop(gc=gc, strict=strict):
6✔
2095
            return self
6✔
2096

2097
        gc = c3_genetic_code.get_code(gc)
6✔
2098
        m, s = self.parse_out_gaps()
6✔
2099

2100
        divisible_by_3 = len(s) % 3 == 0
6✔
2101
        if not divisible_by_3:
6✔
2102
            return self
×
2103

2104
        end = str(s[-3:])
6✔
2105

2106
        if not gc.is_stop(end):
6✔
2107
            return self
×
2108

2109
        if not m.num_gaps:
6✔
2110
            # has zero length if no gaps
2111
            return self[:-3]
6✔
2112

2113
        # determine terminal gap needed to fill in the sequence
2114
        str_s = str(self)
6✔
2115
        gaps = "".join(self.moltype.gaps)
6✔
2116
        pattern = f"({'|'.join(gc['*'])})[{gaps}]*$"
6✔
2117
        terminal_stop = re.compile(pattern)
6✔
2118
        if match := terminal_stop.search(str_s):
6✔
2119
            diff = len(str_s) - match.start()
6✔
2120
            str_s = terminal_stop.sub("-" * diff, str_s)
6✔
2121

2122
        result = self.__class__(
6✔
2123
            moltype=self.moltype,
2124
            seq=str_s,
2125
            name=self.name,
2126
            info=self.info,
2127
        )
2128
        result.annotation_db = self.annotation_db
6✔
2129
        return result
6✔
2130

2131
    def get_translation(
6✔
2132
        self,
2133
        gc: c3_genetic_code.GeneticCode | int = 1,
2134
        incomplete_ok: bool = False,
2135
        include_stop: bool = False,
2136
        trim_stop: bool = True,
2137
    ) -> ProteinSequence:
2138
        """translate to amino acid sequence
2139

2140
        Parameters
2141
        ----------
2142
        gc
2143
            valid input to cogent3.get_code(), a genetic code object, number
2144
            or name
2145
        incomplete_ok
2146
            codons that are mixes of nucleotide and gaps converted to '-'.
2147
            codons containing ambiguous nucleotides are translated as 'X'.
2148
            raises a AlphabetError if False
2149
        include_stop
2150
            allows stop codons in translation
2151
        trim_stop
2152
            trims a terminal stop codon if it exists
2153

2154
        Returns
2155
        -------
2156
        sequence of PROTEIN moltype
2157

2158
        Raises
2159
        ------
2160
        AlphabetError if include_stop is False and a stop codon occurs
2161
        """
2162
        if not self.moltype.is_nucleic:
6✔
2163
            msg = f"moltype must be a DNA/RNA, not {self.moltype.name!r}"
×
2164
            raise c3_moltype.MolTypeError(
×
2165
                msg,
2166
            )
2167

2168
        protein = c3_moltype.get_moltype(
6✔
2169
            "protein_with_stop" if include_stop else "protein",
2170
        )
2171
        gc = c3_genetic_code.get_code(gc)
6✔
2172

2173
        if trim_stop:
6✔
2174
            seq = self.trim_stop_codon(gc=gc, strict=not incomplete_ok)
6✔
2175
        else:
2176
            seq = self
6✔
2177

2178
        # since we are realising the view, reverse complementing will be
2179
        # dealt with, so rc=False
2180
        pep = gc.translate(numpy.array(seq), rc=False, incomplete_ok=incomplete_ok)
6✔
2181

2182
        if not include_stop and "*" in pep:
6✔
2183
            msg = f"{self.name!r} has a stop codon in the translation"
6✔
2184
            raise c3_alphabet.AlphabetError(msg)
6✔
2185

2186
        if not incomplete_ok and "X" in pep:
6✔
2187
            msg = (
6✔
2188
                f"{self.name!r} has an incomplete codon or contains an ambiguity, set incomplete_ok=True to "
2189
                "allow translation"
2190
            )
2191
            raise c3_alphabet.AlphabetError(
6✔
2192
                msg,
2193
            )
2194
        return cast("ProteinSequence", protein.make_sequence(seq=pep, name=self.name))
6✔
2195

2196
    def to_rna(self) -> Sequence:
6✔
2197
        """Returns copy of self as RNA."""
2198
        return self.to_moltype("rna")
6✔
2199

2200
    def to_dna(self) -> Sequence:
6✔
2201
        """Returns copy of self as DNA."""
2202
        return self.to_moltype("dna")
6✔
2203

2204
    def strand_symmetry(self, motif_length: int = 1) -> TestResult:
6✔
2205
        """returns G-test for strand symmetry"""
2206
        counts = self.counts(motif_length=motif_length)
6✔
2207
        ssym_pairs = self.moltype.strand_symmetric_motifs(motif_length=motif_length)
6✔
2208

2209
        obs: list[NumpyIntArrayType] = []
6✔
2210
        motifs: list[str] = []
6✔
2211
        for plus, minus in sorted(ssym_pairs):
6✔
2212
            row = numpy.array([counts[plus], counts[minus]], dtype=int)
6✔
2213
            if row.max() == 0:  # we ignore motifs missing on both strands
6✔
2214
                continue
6✔
2215

2216
            obs.append(row)
6✔
2217
            motifs.append(plus)
6✔
2218

2219
        d_array = DictArray.from_array_names(numpy.array(obs), motifs, ["+", "-"])
6✔
2220
        cat = CategoryCounts(d_array)
6✔
2221
        return cat.G_fit()
6✔
2222

2223

2224
class DnaSequence(NucleicAcidSequenceMixin):
6✔
2225
    """Holds the standard DNA sequence."""
2226

2227
    # constructed by DNA moltype
2228

2229

2230
class RnaSequence(NucleicAcidSequenceMixin):
6✔
2231
    """Holds the standard RNA sequence."""
2232

2233
    # constructed by RNA moltype
2234

2235

2236
class SliceRecordABC(ABC):
6✔
2237
    """Abstract base class for recording the history of operations to be applied
2238
    to some underlying data. Provides slicing functionality for the underlying data.
2239

2240
    Parameters
2241
    ----------
2242
    start
2243
        start of the slice (inclusive indexing)
2244
    stop
2245
        stop of the slice (exclusive indexing)
2246
    step
2247
        step of the slice
2248
    offset
2249
        can be set with any additional offset that exists before the start of
2250
        the underlying data
2251
    parent_len
2252
        length of the underlying data (not including offset)
2253
    """
2254

2255
    __slots__ = ("_offset", "start", "step", "stop")
6✔
2256

2257
    @abstractmethod
6✔
2258
    def __init__(
6✔
2259
        self,
2260
        *,
2261
        parent_len: int,
2262
        start: int | None = None,
2263
        stop: int | None = None,
2264
        step: int | None = None,
2265
        offset: int = 0,
2266
    ) -> None:
2267
        self.start: int
×
2268
        self.stop: int
×
2269
        self.step: int
×
2270

2271
    @abstractmethod
2272
    def __eq__(self, other: object) -> bool: ...
2273

2274
    @abstractmethod
2275
    def __ne__(self, other: object) -> bool: ...
2276
    @property
2277
    @abstractmethod
2278
    def parent_len(self) -> int: ...
2279

2280
    @abstractmethod
6✔
2281
    def _get_init_kwargs(self) -> dict[str, Any]:
6✔
2282
        """return required arguments for construction that are unique to the
2283
        subclass"""
2284
        ...
2285

2286
    @abstractmethod
2287
    def copy(self) -> Self: ...
2288

2289
    # refactor: design
2290
    # can we remove the need for this method on the ABC and inheriting
2291
    @property
2292
    @abstractmethod
2293
    def _zero_slice(self) -> Self: ...
2294

2295
    @property
6✔
2296
    def offset(self) -> int:
6✔
2297
        return self._offset
6✔
2298

2299
    @offset.setter
6✔
2300
    def offset(self, value: int) -> None:
6✔
2301
        value = value or 0
6✔
2302
        self._offset = int(value)
6✔
2303

2304
    @property
6✔
2305
    def plus_start(self) -> int:
6✔
2306
        """start on plus strand"""
2307
        if self.is_reversed:
6✔
2308
            # all indices are negative, so the stop becomes the start. To convert
2309
            # to positive indices, we add the length of the parent sequence to
2310
            # the stop. Note that when abs(self.step) > 1, we instead calculate
2311
            # the "true stop", i.e., the index that immediately follows (in the
2312
            # direction of the step) the last selected index in the slice. We then
2313
            # add the abs(self.step) to account for the fact that the stop uses
2314
            # exclusive indexing, and the start is inclusive.
2315

2316
            assert self.stop < 0, "expected stop on reverse strand SeqView < 0"
6✔
2317
            stop = self.stop if self.step == -1 else self.start + len(self) * self.step
6✔
2318
            start = self.parent_len + stop + abs(self.step)
6✔
2319
        else:
2320
            start = self.start
6✔
2321
        return start
6✔
2322

2323
    @property
6✔
2324
    def plus_stop(self) -> int:
6✔
2325
        """stop on plus strand"""
2326
        if self.is_reversed:
6✔
2327
            # self.start becomes the stop, self.start will be negative
2328
            assert self.start < 0, "expected start on reverse strand SeqView < 0"
6✔
2329
            stop = self.start + self.parent_len + 1
6✔
2330
        else:
2331
            # we ensure that the plus_stop is the index that immediately follows
2332
            # the last selected index in the slice.
2333
            stop = (
6✔
2334
                self.stop
2335
                if self.step == 1
2336
                else (self.start + len(self) * self.step - self.step) + 1
2337
            )
2338
        return stop
6✔
2339

2340
    @property
6✔
2341
    def plus_step(self) -> int:
6✔
2342
        """step on plus strand"""
2343
        return abs(self.step)
6✔
2344

2345
    @property
6✔
2346
    def parent_start(self) -> int:
6✔
2347
        """returns the start on the parent plus strand
2348

2349
        Returns
2350
        -------
2351
        offset + start, taking into account whether reversed. Result
2352
        is positive.
2353

2354
        Notes
2355
        -----
2356
        This should NOT be used for slicing on the parent object.
2357
        """
2358
        return self.offset + self.plus_start
6✔
2359

2360
    @property
6✔
2361
    def is_reversed(self) -> bool:
6✔
2362
        return self.step < 0
6✔
2363

2364
    @property
6✔
2365
    def parent_stop(self) -> int:
6✔
2366
        """returns the stop on the parent plus strand
2367

2368
        Returns
2369
        -------
2370
        offset + stop, taking into account whether reversed. Result
2371
        is positive.
2372

2373
        Notes
2374
        -----
2375
        This should NOT be used for slicing on the parent object.
2376
        """
2377
        return self.offset + self.plus_stop
6✔
2378

2379
    def absolute_position(self, rel_index: int, include_boundary: bool = False) -> int:
6✔
2380
        """Converts an index relative to the current view to be with respect
2381
        to the coordinates of the original "Python sequence".
2382

2383
        Parameters
2384
        ----------
2385
        rel_index
2386
            relative position with respect to the current view
2387
        include_boundary
2388
            whether considering index as part of a range
2389

2390
        Returns
2391
        -------
2392
        the absolute index with respect to the coordinates of the original
2393
        sequence (including offset if present).
2394
        """
2395
        if not self:
6✔
2396
            return 0
6✔
2397

2398
        if rel_index < 0:
6✔
2399
            msg = "only positive indexing supported!"
6✔
2400
            raise IndexError(msg)
6✔
2401

2402
        # _get_index return the absolute position relative to the underlying sequence
2403
        seq_index, _, _ = self._get_index(rel_index, include_boundary=include_boundary)
6✔
2404

2405
        offset = self.offset
6✔
2406
        return (
6✔
2407
            offset + self.parent_len + seq_index + 1
2408
            if self.is_reversed
2409
            else offset + seq_index
2410
        )
2411

2412
    def relative_position(self, abs_index: int, stop: bool = False) -> int:
6✔
2413
        """converts an index on the original "Python sequence" into an index
2414
        on this "view"
2415

2416
        Notes
2417
        -----
2418
        The returned value DOES NOT reflect python indexing. Importantly,
2419
        negative values represent positions that precede the current view.
2420
        """
2421
        if not self:
6✔
2422
            return 0
6✔
2423

2424
        if abs_index < 0:
6✔
2425
            msg = "Index must be +ve and relative to the + strand"
6✔
2426
            raise IndexError(msg)
6✔
2427

2428
        if self.is_reversed:
6✔
2429
            offset = self.offset
6✔
2430

2431
            if (
6✔
2432
                tmp := (self.parent_len - abs_index + offset + self.start + 1)
2433
            ) % self.step == 0 or stop:
2434
                rel_pos = tmp // abs(self.step)
6✔
2435
            else:
2436
                rel_pos = (tmp // abs(self.step)) + 1
×
2437

2438
        else:
2439
            offset = self.offset + self.start
6✔
2440

2441
            if (tmp := abs_index - offset) % self.step == 0 or stop:
6✔
2442
                rel_pos = tmp // self.step
6✔
2443
            else:
2444
                rel_pos = (tmp // self.step) + 1
6✔
2445

2446
        return rel_pos
6✔
2447

2448
    def __len__(self) -> int:
6✔
2449
        return abs((self.start - self.stop) // self.step)
6✔
2450

2451
    def __getitem__(self, segment: SupportsIndex | slice) -> Self:
6✔
2452
        kwargs = self._get_init_kwargs()
6✔
2453

2454
        if isinstance(segment, SupportsIndex):
6✔
2455
            start, stop, step = self._get_index(segment)
6✔
2456
            return self.__class__(
6✔
2457
                start=start,
2458
                stop=stop,
2459
                step=step,
2460
                offset=self.offset,
2461
                parent_len=self.parent_len,
2462
                **kwargs,
2463
            )
2464

2465
        if segment.start is segment.stop is segment.step is None:
6✔
2466
            return self.copy()
6✔
2467

2468
        if len(self) == 0:
6✔
2469
            return self
6✔
2470

2471
        if segment.start is not None and segment.start == segment.stop:
6✔
2472
            return self._zero_slice
6✔
2473

2474
        slice_step = 1 if segment.step is None else segment.step
6✔
2475

2476
        if slice_step > 0:
6✔
2477
            return self._get_slice(segment, slice_step, **kwargs)
6✔
2478
        if slice_step < 0:
6✔
2479
            return self._get_reverse_slice(segment, slice_step, **kwargs)
6✔
2480
        msg = f"{self.__class__.__name__} cannot be sliced with a step of 0"
6✔
2481
        raise ValueError(
6✔
2482
            msg,
2483
        )
2484

2485
    def _get_index(
6✔
2486
        self,
2487
        val: SupportsIndex,
2488
        include_boundary: bool = False,
2489
    ) -> tuple[int, int, int]:
2490
        if len(self) == 0:
6✔
2491
            raise IndexError(val)
6✔
2492

2493
        val = int(val)
6✔
2494
        if (
6✔
2495
            (val > 0 and include_boundary and val > len(self))
2496
            or (val > 0 and not include_boundary and val >= len(self))
2497
            or (val < 0 and include_boundary and abs(val) > (len(self) + 1))
2498
            or (val < 0 and not include_boundary and abs(val) > len(self))
2499
        ):
2500
            raise IndexError(val)
6✔
2501

2502
        if self.step > 0:
6✔
2503
            if val >= 0:
6✔
2504
                val = self.start + val * self.step
6✔
2505
            else:
2506
                val = self.start + len(self) * self.step + val * abs(self.step)
6✔
2507

2508
            return val, val + 1, 1
6✔
2509

2510
        if self.step < 0:
6✔
2511
            if val >= 0:
6✔
2512
                val = self.start + val * self.step
6✔
2513
            else:
2514
                val = self.start + len(self) * self.step + val * self.step
6✔
2515

2516
            return val, val - 1, -1
6✔
2517
        msg = f"Invalid step: {self.step}"
×
2518
        raise ValueError(msg)
×
2519

2520
    def _get_slice(self, segment: slice, step: int, **kwargs: Any) -> Self:
6✔
2521
        slice_start = segment.start if segment.start is not None else 0
6✔
2522
        slice_stop = segment.stop if segment.stop is not None else len(self)
6✔
2523

2524
        if self.step > 0:
6✔
2525
            return self._get_forward_slice_from_forward_seqview_(
6✔
2526
                slice_start,
2527
                slice_stop,
2528
                step,
2529
                **kwargs,
2530
            )
2531

2532
        if self.step < 0:
6✔
2533
            return self._get_forward_slice_from_reverse_seqview_(
6✔
2534
                slice_start,
2535
                slice_stop,
2536
                step,
2537
                **kwargs,
2538
            )
2539
        msg = f"Invalid step: {self.step}"
×
2540
        raise ValueError(msg)
×
2541

2542
    def _get_forward_slice_from_forward_seqview_(
6✔
2543
        self,
2544
        slice_start: int,
2545
        slice_stop: int,
2546
        step: int,
2547
        **kwargs: Any,
2548
    ) -> Self:
2549
        start = (
6✔
2550
            self.start + slice_start * self.step
2551
            if slice_start >= 0
2552
            else max(
2553
                self.start + len(self) * self.step + slice_start * self.step,
2554
                self.start,
2555
            )
2556
        )
2557
        if slice_stop > self.stop:
6✔
2558
            stop = self.stop
6✔
2559
        elif slice_stop >= 0:
6✔
2560
            stop = self.start + slice_stop * self.step
6✔
2561
        else:
2562
            # "true stop" adjust for if abs(stop-start) % step != 0
2563
            # "true stop" = self.start + len(self) * self.step
2564
            stop = self.start + len(self) * self.step + slice_stop * self.step
6✔
2565

2566
        # if -ve, it's an invalid slice
2567
        if start < 0 or stop < 0:
6✔
2568
            return self._zero_slice
6✔
2569

2570
        # checking for zero-length slice
2571
        if stop < start:
6✔
2572
            return self._zero_slice
6✔
2573
        if start > self.parent_len:
6✔
2574
            return self._zero_slice
6✔
2575

2576
        return self.__class__(
6✔
2577
            start=start,
2578
            stop=min(self.stop, stop),
2579
            step=self.step * step,
2580
            offset=self.offset,
2581
            parent_len=self.parent_len,
2582
            **kwargs,
2583
        )
2584

2585
    def _get_forward_slice_from_reverse_seqview_(
6✔
2586
        self,
2587
        slice_start: int,
2588
        slice_stop: int,
2589
        step: int,
2590
        **kwargs: Any,
2591
    ) -> Self:
2592
        if slice_start >= 0:
6✔
2593
            start = self.start + slice_start * self.step
6✔
2594
        elif abs(slice_start) > len(self):
6✔
2595
            start = self.start
6✔
2596
        else:
2597
            start = self.start + len(self) * self.step + slice_start * self.step
6✔
2598

2599
        if slice_stop >= 0:
6✔
2600
            stop = self.start + slice_stop * self.step
6✔
2601
        else:  # slice_stop < 0
2602
            stop = self.start + len(self) * self.step + slice_stop * self.step
6✔
2603

2604
        # if +ve, it's an invalid slice
2605
        if start >= 0 or stop >= 0:
6✔
2606
            return self._zero_slice
6✔
2607

2608
        return self.__class__(
6✔
2609
            start=start,
2610
            stop=max(self.stop, stop),
2611
            step=self.step * step,
2612
            offset=self.offset,
2613
            parent_len=self.parent_len,
2614
            **kwargs,
2615
        )
2616

2617
    def _get_reverse_slice(
6✔
2618
        self,
2619
        segment: slice,
2620
        step: int,
2621
        **kwargs: Any,
2622
    ) -> Self:
2623
        slice_start = segment.start if segment.start is not None else -1
6✔
2624
        slice_stop = segment.stop if segment.stop is not None else -len(self) - 1
6✔
2625

2626
        if self.step < 0:
6✔
2627
            return self._get_reverse_slice_from_reverse_seqview_(
6✔
2628
                slice_start,
2629
                slice_stop,
2630
                step,
2631
                **kwargs,
2632
            )
2633
        if self.step > 0:
6✔
2634
            return self._get_reverse_slice_from_forward_seqview_(
6✔
2635
                slice_start,
2636
                slice_stop,
2637
                step,
2638
                **kwargs,
2639
            )
2640

2641
        msg = f"Invalid step: {self.step}"
×
2642
        raise ValueError(msg)
×
2643

2644
    def _get_reverse_slice_from_forward_seqview_(
6✔
2645
        self,
2646
        slice_start: int,
2647
        slice_stop: int,
2648
        step: int,
2649
        **kwargs: Any,
2650
    ) -> Self:
2651
        # "true stop" adjust for if abs(stop-start) % step != 0
2652
        # max possible start is "true stop" - step, because stop is not inclusive
2653
        # "true stop" - step is converted to -ve index via subtracting len(self)
2654
        if slice_start >= len(self):
6✔
2655
            start = (self.start + len(self) * self.step - self.step) - self.parent_len
6✔
2656
        elif slice_start >= 0:
6✔
2657
            start = (self.start + slice_start * self.step) - self.parent_len
6✔
2658
        else:
2659
            start = (
6✔
2660
                self.start
2661
                + len(self) * self.step
2662
                + slice_start * self.step
2663
                - self.parent_len
2664
            )
2665

2666
        if slice_stop >= self.parent_len:
6✔
2667
            return self._zero_slice
6✔
2668

2669
        if slice_stop >= 0:
6✔
2670
            stop = self.start + (slice_stop * self.step) - self.parent_len
6✔
2671
        else:
2672
            stop = (
6✔
2673
                self.start
2674
                + (len(self) * self.step)
2675
                + (slice_stop * self.step)
2676
                - self.parent_len
2677
            )
2678

2679
        if start >= 0 or stop >= 0:
6✔
2680
            return self._zero_slice
6✔
2681

2682
        return self.__class__(
6✔
2683
            start=start,
2684
            stop=max(stop, self.start - self.parent_len - 1),
2685
            step=self.step * step,
2686
            offset=self.offset,
2687
            parent_len=self.parent_len,
2688
            **kwargs,
2689
        )
2690

2691
    def _get_reverse_slice_from_reverse_seqview_(
6✔
2692
        self,
2693
        slice_start: int,
2694
        slice_stop: int,
2695
        step: int,
2696
        **kwargs: Any,
2697
    ) -> Self:
2698
        # refactor: simplify
2699
        # are there other places in slice_record where we can use plus_start/stop/step
2700
        if slice_start >= len(self):
6✔
2701
            start = self.plus_start
6✔
2702
        elif slice_start >= 0:
6✔
2703
            start = self.parent_len + (self.start + slice_start * self.step)
6✔
2704
        else:
2705
            start = self.parent_len + (
6✔
2706
                self.start + len(self) * self.step + slice_start * self.step
2707
            )
2708

2709
        if slice_stop >= 0:
6✔
2710
            stop = self.parent_len + (self.start + slice_stop * self.step)
6✔
2711
            if stop <= self.parent_len + self.stop:
6✔
2712
                return self._zero_slice
6✔
2713
        else:
2714
            stop = self.parent_len + (
6✔
2715
                self.start + len(self) * self.step + slice_stop * self.step
2716
            )
2717
            if stop > self.parent_len + self.start:
6✔
2718
                stop = self.parent_len + self.start + 1
6✔
2719

2720
        # if -ve, it's an invalid slice becomes zero
2721
        # checking for zero-length slice
2722
        if stop < start or start > self.parent_len or min(start, stop) < 0:
6✔
2723
            return self._zero_slice
6✔
2724

2725
        return self.__class__(
6✔
2726
            start=start,
2727
            stop=stop,
2728
            step=self.step * step,
2729
            offset=self.offset,
2730
            parent_len=self.parent_len,
2731
            **kwargs,
2732
        )
2733

2734

2735
class SliceRecord(SliceRecordABC):
6✔
2736
    """records cumulative slice operations on an object without modifying it
2737

2738
    Notes
2739
    -----
2740
    Basis for lazy evaluation of slicing operations on sequences and alignments.
2741
    A reference to an instance of this class is used by different view objects.
2742
    """
2743

2744
    __slots__ = ("_parent_len",)
6✔
2745

2746
    def __init__(
6✔
2747
        self,
2748
        *,
2749
        parent_len: int,
2750
        start: int | None = None,
2751
        stop: int | None = None,
2752
        step: int | None = None,
2753
        offset: int = 0,
2754
    ) -> None:
2755
        """
2756
        Parameters
2757
        ----------
2758
        start
2759
            start of the slice (inclusive indexing)
2760
        stop
2761
            stop of the slice (exclusive indexing)
2762
        step
2763
            step of the slice
2764
        offset
2765
            can be set with any additional offset that exists before the start of
2766
            the underlying data
2767
        parent_len
2768
            length of the underlying data (not including offset)
2769
        """
2770
        if step == 0:
6✔
2771
            msg = "step cannot be 0"
6✔
2772
            raise ValueError(msg)
6✔
2773
        step = step if step is not None else 1
6✔
2774
        func = _input_vals_pos_step if step > 0 else _input_vals_neg_step
6✔
2775
        start, stop, step = func(parent_len, start, stop, step)
6✔
2776
        self.start = start
6✔
2777
        self.stop = stop
6✔
2778
        self.step = step
6✔
2779
        self._parent_len = parent_len
6✔
2780
        self._offset = offset or 0
6✔
2781

2782
    def __eq__(self, other: object) -> bool:
6✔
2783
        if not isinstance(other, SliceRecordABC):
6✔
2784
            return False
×
2785
        return (
6✔
2786
            self.start == other.start
2787
            and self.stop == other.stop
2788
            and self.step == other.step
2789
            and self._parent_len == other.parent_len
2790
            and self._offset == other.offset
2791
        )
2792

2793
    def __ne__(self, other: object) -> bool:
6✔
2794
        return not self == other
×
2795

2796
    @property
6✔
2797
    def parent_len(self) -> int:
6✔
2798
        return self._parent_len
6✔
2799

2800
    def _get_init_kwargs(self) -> dict[str, Any]:
6✔
2801
        return {}
6✔
2802

2803
    def copy(self, sliced: bool = False) -> Self:
6✔
2804
        """returns self"""
2805
        return self
6✔
2806

2807
    def __repr__(self) -> str:
6✔
2808
        return (
6✔
2809
            f"{self.__class__.__name__}(start={self.start}, stop={self.stop}, step={self.step}, "
2810
            f"parent_len={self.parent_len}, offset={self.offset})"
2811
        )
2812

2813
    @property
6✔
2814
    def _zero_slice(self) -> Self:
6✔
2815
        return self.__class__(start=0, stop=0, step=1, parent_len=self.parent_len)
6✔
2816

2817
    def to_rich_dict(self) -> dict[str, str | dict[str, int]]:
6✔
2818
        """returns dict suitable for serialisation"""
2819
        data: dict[str, str | dict[str, int]] = {
6✔
2820
            "type": get_object_provenance(self),
2821
            "version": __version__,
2822
        }
2823
        data["init_args"] = {
6✔
2824
            "parent_len": self.parent_len,
2825
            "start": self.start,
2826
            "stop": self.stop,
2827
            "step": self.step,
2828
            "offset": self.offset,
2829
        }
2830
        return data
6✔
2831

2832

2833
class SeqViewABC(ABC):
6✔
2834
    """
2835
    An abstract base class for providing a view of a sequence.
2836

2837
    This class defines an interface for sequence views, by which operations
2838
    performed on the sequence do not modify the original sequence data. Instead,
2839
    modifications and operations are stored on the returned view of the sequence
2840
    and can be realised by accessing the values of the view.
2841
    """
2842

2843
    __slots__ = ("_parent_len", "_slice_record", "alphabet", "parent")
6✔
2844

2845
    def __init__(self) -> None:
6✔
2846
        self.alphabet: c3_alphabet.CharAlphabet[Any]
×
2847
        self.parent: str | bytes | NumpyIntArrayType
×
2848
        self._parent_len: int
×
2849
        self._slice_record: SliceRecordABC
×
2850

2851
    @property
2852
    @abstractmethod
2853
    def seqid(self) -> str | None: ...
2854

2855
    @property
2856
    @abstractmethod
2857
    def parent_len(self) -> int: ...
2858

2859
    @property
2860
    @abstractmethod
2861
    def slice_record(self) -> SliceRecordABC: ...
2862

2863
    @property
6✔
2864
    def parent_offset(self) -> int:
6✔
2865
        """returns the offset from the true parent
2866

2867
        Notes
2868
        -----
2869
        If from storage with an offset attribute, returns
2870
        that value or 0
2871
        """
2872
        return (
6✔
2873
            self.parent.offset.get(self.seqid, 0)
2874
            if hasattr(self.parent, "offset")
2875
            else 0
2876
        )
2877

2878
    @property
2879
    @abstractmethod
2880
    def offset(self) -> int: ...
2881

2882
    @property
6✔
2883
    def is_reversed(self) -> bool:
6✔
2884
        """whether the sequence is reversed"""
2885
        return self.slice_record.is_reversed
6✔
2886

2887
    @property
2888
    @abstractmethod
2889
    def str_value(self) -> str: ...
2890

2891
    @property
2892
    @abstractmethod
2893
    def array_value(self) -> NumpyIntArrayType: ...
2894

2895
    @property
2896
    @abstractmethod
2897
    def bytes_value(self) -> bytes: ...
2898

2899
    @abstractmethod
2900
    def copy(self, sliced: bool = False) -> Self: ...
2901

2902
    @abstractmethod
2903
    def __str__(self) -> str: ...
2904

2905
    @abstractmethod
2906
    def __array__(
2907
        self,
2908
        dtype: type[numpy.integer] | None = None,
2909
        copy: bool | None = None,
2910
    ) -> NumpyIntArrayType: ...
2911

2912
    @abstractmethod
2913
    def __bytes__(self) -> bytes: ...
2914

2915
    @abstractmethod
2916
    def __getitem__(self, segment: SupportsIndex | slice) -> SeqViewABC: ...
2917

2918
    def __len__(self) -> int:
6✔
2919
        return len(self.slice_record)
6✔
2920

2921
    def _get_init_kwargs(self) -> dict[str, Any]:
6✔
2922
        return {
×
2923
            "parent": self.parent,
2924
            "parent_len": self._parent_len,
2925
            "seqid": self.seqid,
2926
            "alphabet": self.alphabet,
2927
            "slice_record": self.slice_record,
2928
        }
2929

2930
    def with_offset(self, offset: int) -> Self:
6✔
2931
        """returns new instance with annotation offset set"""
2932
        if self._slice_record.offset:
6✔
2933
            msg = f"cannot set {offset=} on a SeqView with an offset {self._slice_record.offset=}"
6✔
2934
            raise ValueError(
6✔
2935
                msg,
2936
            )
2937

2938
        init_kwargs = self._get_init_kwargs()
6✔
2939
        init_kwargs["offset"] = offset
6✔
2940
        return self.__class__(**init_kwargs)
6✔
2941

2942
    @abstractmethod
2943
    def parent_coords(
2944
        self, *, apply_offset: bool = False, **kwargs: Any
2945
    ) -> tuple[str, int, int, int]: ...
2946

2947

2948
class SeqView(SeqViewABC):
6✔
2949
    """
2950
    Provides a view of a sequence with support for slicing operations.
2951

2952
    This class represents a view of a sequence. It uses ``SliceRecord`` to
2953
    enable efficient slicing without altering the original sequence data.
2954

2955
    Parameters
2956
    ----------
2957
    parent
2958
        the original sequence data
2959
    alphabet
2960
        the alphabet object defining valid characters for the sequence
2961
    seqid
2962
        the name or identifier of the sequence
2963
    parent_len
2964
        the length of the sequence. Defaults to the length of the input sequence
2965

2966
    Notes
2967
    -----
2968
    It utilises the alphabet object to allow providing different types
2969
    such as strings or numpy arrays, corresponding to its underlying data.
2970
    """
2971

2972
    __slots__ = (
6✔
2973
        "_parent_len",
2974
        "_seqid",
2975
        "_slice_record",
2976
        "alphabet",
2977
        "parent",
2978
    )
2979

2980
    def __init__(
6✔
2981
        self,
2982
        *,
2983
        parent: str | bytes | NumpyIntArrayType,
2984
        alphabet: c3_alphabet.CharAlphabet[Any],
2985
        parent_len: int,
2986
        seqid: str | None = None,
2987
        slice_record: SliceRecordABC | None = None,
2988
        offset: int = 0,
2989
    ) -> None:
2990
        self.alphabet = alphabet
6✔
2991
        self.parent = parent
6✔
2992
        self._seqid = seqid
6✔
2993
        assert parent_len is not None
6✔
2994
        self._parent_len = parent_len
6✔
2995
        self._slice_record = (
6✔
2996
            slice_record
2997
            if slice_record is not None
2998
            else SliceRecord(parent_len=self._parent_len)
2999
        )
3000
        if offset and self._slice_record.offset:
6✔
3001
            msg = f"cannot set {offset=} on a SeqView with an offset {self._slice_record.offset=}"
×
3002
            raise ValueError(
×
3003
                msg,
3004
            )
3005
        if offset:
6✔
3006
            self._slice_record.offset = offset
6✔
3007

3008
    @property
6✔
3009
    def seqid(self) -> str | None:
6✔
3010
        """name of the sequence"""
3011
        return self._seqid
6✔
3012

3013
    @property
6✔
3014
    def slice_record(self) -> SliceRecordABC:
6✔
3015
        """the slice state of this view"""
3016
        return self._slice_record
6✔
3017

3018
    @property
6✔
3019
    def parent_len(self) -> int:
6✔
3020
        """length of the parent sequence"""
3021
        return self._parent_len
6✔
3022

3023
    @property
6✔
3024
    def offset(self) -> int:
6✔
3025
        """the annotation offset of this view"""
3026
        return self.slice_record.offset
6✔
3027

3028
    def _get_init_kwargs(self) -> dict[str, Any]:
6✔
3029
        return {
6✔
3030
            "parent": self.parent,
3031
            "parent_len": self._parent_len,
3032
            "seqid": self.seqid,
3033
            "alphabet": self.alphabet,
3034
            "slice_record": self.slice_record,
3035
        }
3036

3037
    @property
6✔
3038
    def str_value(self) -> str:
6✔
3039
        """returns the sequence as a string"""
3040
        return self.alphabet.from_indices(
6✔
3041
            self.parent[
3042
                self.slice_record.start : self.slice_record.stop : self.slice_record.step
3043
            ],
3044
        )
3045

3046
    @property
6✔
3047
    def array_value(self) -> NumpyIntArrayType:
6✔
3048
        """returns the sequence as a array of indices"""
3049
        return self.alphabet.to_indices(
6✔
3050
            self.parent[
3051
                self.slice_record.start : self.slice_record.stop : self.slice_record.step
3052
            ],
3053
            validate=False,
3054
        )
3055

3056
    @property
6✔
3057
    def bytes_value(self) -> bytes:
6✔
3058
        """returns the sequence as a bytes string"""
3059
        return self.str_value.encode("utf-8")
×
3060

3061
    def __str__(self) -> str:
6✔
3062
        return self.str_value
6✔
3063

3064
    def __array__(
6✔
3065
        self,
3066
        dtype: type[numpy.integer] | None = None,
3067
        copy: bool | None = None,
3068
    ) -> NumpyIntArrayType:
3069
        arr = self.array_value
6✔
3070
        if dtype is not None:
6✔
3071
            arr = arr.astype(dtype)
×
3072
        return arr
6✔
3073

3074
    def __bytes__(self) -> bytes:
6✔
3075
        return self.bytes_value
6✔
3076

3077
    def __getitem__(self, segment: SupportsIndex | slice) -> Self:
6✔
3078
        return self.__class__(
6✔
3079
            parent=self.parent,
3080
            seqid=self.seqid,
3081
            alphabet=self.alphabet,
3082
            parent_len=self.parent_len,
3083
            slice_record=self.slice_record[segment],
3084
        )
3085

3086
    def __repr__(self) -> str:
6✔
3087
        seq_preview = (
6✔
3088
            f"{self.parent[:10]}...{self.parent[-5:]}"  # type: ignore[str-bytes-safe]
3089
            if self.parent_len > 15
3090
            else self.parent
3091
        )
3092
        return (
6✔
3093
            f"{self.__class__.__name__}(seqid={self.seqid!r}, parent={seq_preview!r}, "
3094
            f"slice_record={self.slice_record.__repr__()})"
3095
        )
3096

3097
    def copy(self, sliced: bool = False) -> Self:
6✔
3098
        """returns copy
3099

3100
        Parameters
3101
        ----------
3102
        sliced
3103
            if True, the underlying sequence is truncated and the start/stop
3104
            adjusted
3105
        """
3106
        if not sliced:
6✔
3107
            return self.__class__(
6✔
3108
                parent=self.parent,
3109
                seqid=self.seqid,
3110
                alphabet=self.alphabet,
3111
                slice_record=self.slice_record.copy(),
3112
                parent_len=self.parent_len,
3113
            )
3114
        # we slice parent with the plus attributes only, so, if reversed, we need to
3115
        # retain that information in the slice_record
3116
        sr = SliceRecord(parent_len=len(self), step=-1) if self.is_reversed else None
6✔
3117
        return self.__class__(
6✔
3118
            parent=self.parent[
3119
                self.slice_record.plus_start : self.slice_record.plus_stop : self.slice_record.plus_step
3120
            ],
3121
            parent_len=len(self),
3122
            seqid=self.seqid,
3123
            alphabet=self.alphabet,
3124
            slice_record=sr,
3125
        )
3126

3127
    def parent_coords(
6✔
3128
        self, *, apply_offset: bool = False, **kwargs: Any
3129
    ) -> tuple[str, int, int, int]:
3130
        """returns coordinates on parent
3131

3132
        Parameters
3133
        ----------
3134
        apply_offset
3135
            if True adds annotation offset from parent
3136

3137
        Returns
3138
        -------
3139
        parent seqid, start, stop, strand
3140
        """
3141
        offset = self.parent_offset if apply_offset else 0
6✔
3142
        return (
6✔
3143
            cast("str", self.seqid),
3144
            self.slice_record.parent_start + offset,
3145
            self.slice_record.parent_stop + offset,
3146
            self.slice_record.step,
3147
        )
3148

3149

3150
def _coerce_to_seqview(
6✔
3151
    data: Aligned
3152
    | SeqViewABC
3153
    | Sequence
3154
    | str
3155
    | bytes
3156
    | NumpyIntArrayType
3157
    | tuple[str, ...]
3158
    | list[str],
3159
    seqid: str | None,
3160
    alphabet: c3_alphabet.CharAlphabet[Any],
3161
    offset: int,
3162
) -> SeqViewABC:
3163
    from cogent3.core.alignment import Aligned
6✔
3164

3165
    if isinstance(data, Sequence):
6✔
3166
        data = data._seq
6✔
3167

3168
    if isinstance(data, SeqViewABC):
6✔
3169
        # we require the indexes of shared states in alphabets to be the same
3170
        # SeqView has an alphabet but SeqViewABC does NOT because that is
3171
        # more general and covers the case where the SeqsData collection has the
3172
        # alphabet
3173
        if hasattr(data, "alphabet"):
6✔
3174
            n = min(len(data.alphabet), len(alphabet))
6✔
3175
            if data.alphabet[:n] != alphabet[:n]:
6✔
3176
                msg = f"element order {data.alphabet=} != to that in {alphabet=} for {data=!r}"
6✔
3177
                raise c3_alphabet.AlphabetError(
6✔
3178
                    msg,
3179
                )
3180

3181
        if offset and data.offset:
6✔
3182
            msg = f"cannot set {offset=} on a SeqView with an offset {data.offset=}"
×
3183
            raise ValueError(
×
3184
                msg,
3185
            )
3186
        if offset:
6✔
3187
            return data.with_offset(offset)
6✔
3188
        return data
6✔
3189

3190
    if isinstance(data, (tuple, list)):
6✔
3191
        data = "".join(data)
×
3192

3193
    if isinstance(data, Aligned):
6✔
3194
        data = str(data)
×
3195

3196
    if isinstance(data, bytes):
6✔
3197
        data = data.decode("utf8")
6✔
3198

3199
    if isinstance(data, str):
6✔
3200
        return SeqView(
6✔
3201
            parent=data,
3202
            parent_len=len(data),
3203
            seqid=seqid,
3204
            alphabet=alphabet,
3205
            offset=offset,
3206
        )
3207

3208
    if isinstance(data, numpy.ndarray):
6✔
3209
        return SeqView(
6✔
3210
            parent=data.astype(alphabet.dtype),
3211
            parent_len=len(data),
3212
            seqid=seqid,
3213
            alphabet=alphabet,
3214
            offset=offset,
3215
        )
3216

3217
    msg = f"{type(data)}"
×
3218
    raise NotImplementedError(msg)
×
3219

3220

3221
cls_map = {
6✔
3222
    get_object_provenance(cls): cls
3223
    for cls in (
3224
        Sequence,
3225
        DnaSequence,
3226
        RnaSequence,
3227
        ProteinSequence,
3228
        ProteinWithStopSequence,
3229
        ByteSequence,
3230
    )
3231
} | {
3232
    "cogent3.core.sequence.Sequence": Sequence,
3233
    "cogent3.core.sequence.DnaSequence": DnaSequence,
3234
    "cogent3.core.sequence.RnaSequence": RnaSequence,
3235
    "cogent3.core.sequence.ProteinSequence": ProteinSequence,
3236
    "cogent3.core.sequence.ProteinWithStopSequence": ProteinWithStopSequence,
3237
    "cogent3.core.sequence.ByteSequence": ByteSequence,
3238
}
3239

3240

3241
@register_deserialiser(*cls_map.keys())
6✔
3242
def deserialise_sequence(data: dict[str, Any]) -> Sequence:
6✔
3243
    cls = cls_map[data["type"]]
6✔
3244
    return cls.from_rich_dict(data)
6✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc