• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

rmcar17 / cogent3 / 17852834425

19 Sep 2025 08:22AM UTC coverage: 90.681% (+0.009%) from 90.672%
17852834425

push

github

rmcar17
TST: Convert dict views to lists

28257 of 31161 relevant lines covered (90.68%)

5.44 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.9
/src/cogent3/core/alignment.py
1
from __future__ import annotations
6✔
2

3
import collections
6✔
4
import contextlib
6✔
5
import copy
6✔
6
import dataclasses
6✔
7
import hashlib
6✔
8
import json
6✔
9
import pathlib
6✔
10
import re
6✔
11
import types
6✔
12
import typing
6✔
13
import warnings
6✔
14
from abc import ABC, abstractmethod
6✔
15
from collections import Counter, defaultdict
6✔
16
from collections.abc import Callable, Iterable, Iterator, Mapping
6✔
17
from collections.abc import Sequence as PySeq
6✔
18
from functools import singledispatch, singledispatchmethod
6✔
19
from pathlib import Path
6✔
20
from typing import Optional
6✔
21

22
import numba
6✔
23
import numpy
6✔
24
from typing_extensions import Self
6✔
25

26
import cogent3
6✔
27
from cogent3._version import __version__
6✔
28
from cogent3.core import (
6✔
29
    alphabet as c3_alphabet,
30
)
31
from cogent3.core import (
6✔
32
    genetic_code as c3_genetic_code,
33
)
34
from cogent3.core import (
6✔
35
    moltype as c3_moltype,
36
)
37
from cogent3.core import (
6✔
38
    sequence as c3_sequence,
39
)
40
from cogent3.core.annotation import Feature
6✔
41
from cogent3.core.annotation_db import (
6✔
42
    AnnotatableMixin,
43
    FeatureDataType,
44
    SupportsFeatures,
45
)
46
from cogent3.core.info import Info as InfoClass
6✔
47
from cogent3.core.location import (
6✔
48
    FeatureMap,
49
    IndelMap,
50
    Strand,
51
)
52
from cogent3.core.profile import PSSM, MotifCountsArray, MotifFreqsArray, load_pssm
6✔
53
from cogent3.maths.stats.number import CategoryCounter
6✔
54
from cogent3.util import progress_display as UI
6✔
55
from cogent3.util.deserialise import register_deserialiser
6✔
56
from cogent3.util.dict_array import DictArray, DictArrayTemplate
6✔
57
from cogent3.util.io import atomic_write, get_format_suffixes
6✔
58
from cogent3.util.misc import (
6✔
59
    extend_docstring_from,
60
    get_object_provenance,
61
    get_setting_from_environ,
62
    negate_condition,
63
)
64
from cogent3.util.union_dict import UnionDict
6✔
65

66
# DESIGN NOTES
67
# the sequence data collections (SeqsDataABC and AlignedSeqsDataABC)
68
# have no concept of strand. All transformations with respect to strand
69
# are applied by the sequence record objects that have a .moltype
70
# attribute, i.e. Sequence and Aligned.
71
# both collections can indicate sequences that are reverse complemented
72
# by providing their names to the reversed_seqs argument.
73

74
if typing.TYPE_CHECKING:  # pragma: no cover
75
    from cogent3.core.tree import PhyloNode
76
    from cogent3.evolve.fast_distance import DistanceMatrix
77
    from cogent3.maths.stats.contingency import TestResult
78

79

80
# --- optional builtin scalars and containers --------------------------------
OptInt = int | None
OptFloat = float | None
OptStr = str | None
OptBool = bool | None
OptList = list | None
OptDict = dict | None

# --- iterables / sequences of strings ---------------------------------------
OptIterStr = Iterable[str] | None
PySeqStr = PySeq[str]
OptPySeqStr = PySeqStr | None

# --- concrete dict shapes ---------------------------------------------------
DictStrStr = dict[str, str]
DictStrInt = dict[str, int]

# --- callables --------------------------------------------------------------
OptCallable = Callable | None
OptRenamerCallable = Callable[[str], str] | None

# --- paths and slice records ------------------------------------------------
OptPathType = str | Path | None
OptSliceRecord = c3_sequence.SliceRecord | None

# --- accepted representations of raw sequence data --------------------------
StrORArray = str | numpy.ndarray[int]
StrORBytesORArray = str | bytes | numpy.ndarray[int]
# same members, in the same order, as spelling the union out in full
StrORBytesORArrayOrSeq = StrORBytesORArray | c3_sequence.Sequence

# --- molecular type given either as a literal or as an instance -------------
MolTypes = c3_moltype.MolTypeLiteral | c3_moltype.MolType

# small number: 1-EPS is almost 1, and is used for things like the
# default number of gaps to allow in a column.
EPS = 1e-6
6✔
103

104

105
def array_hash64(data: numpy.ndarray) -> str:
    """returns a hex digest computed from the raw bytes of a numpy array.

    Parameters
    ----------
    data
        array whose ``tobytes()`` representation is hashed

    Returns
    -------
    The MD5 hexdigest of the array bytes, as a string.

    Notes
    -----
    MD5 is requested with ``usedforsecurity=False``; the value is an
    identity / change-detection key, not a security feature. Despite the
    "64" in the function name, the full MD5 hexdigest is returned.

    This function does not introduce randomisation and so
    is reproducible between processes.
    """
    raw = data.tobytes()
    return hashlib.md5(raw, usedforsecurity=False).hexdigest()
6✔
115

116

117
class _SeqNamer:
6✔
118
    def __init__(
6✔
119
        self,
120
        name_func: OptRenamerCallable = None,
121
        base_name: str = "seq",
122
        start_at: int = 0,
123
    ) -> None:
124
        self._base_name = base_name
6✔
125
        self._num = start_at
6✔
126
        self._name_func = name_func
6✔
127

128
    def __call__(self, seq: StrORBytesORArray, name: OptStr = None) -> str:
6✔
129
        name = name or getattr(seq, "name", name)
6✔
130

131
        if not name:
6✔
132
            name = f"{self._base_name}_{self._num}"
6✔
133
            self._num += 1
6✔
134
        elif self._name_func:
6✔
135
            name = self._name_func(name)
6✔
136

137
        return name
6✔
138

139

140
@numba.njit(cache=True)
def _gap_ok_vector_single(
    data: numpy.ndarray[numpy.uint8],
    gap_index: int,
    missing_index: int,
    num_allowed: int,
) -> bool:  # pragma: no cover
    """returns True if the number of gap & missing data elements in data is less than or equal to num_allowed

    Parameters
    ----------
    data
        vector of alphabet indices
    gap_index
        index value that denotes a gap
    missing_index
        index value that denotes missing data, ignored when None
    num_allowed
        maximum count of gap / missing elements for which True is returned
    """
    count = 0
    for value in data:
        if value == gap_index or (
            missing_index is not None and value == missing_index
        ):
            count += 1

        # the count never decreases, so the answer is settled once exceeded
        if count > num_allowed:
            return False

    return count <= num_allowed
159

160

161
@numba.njit(cache=True)
def _gap_ok_vector_multi(
    motifs: numpy.ndarray,
    gap_index: int,
    missing_index: int,
    motif_length: int,
    num_allowed: int,
) -> bool:  # pragma: no cover
    """returns True if the number of motifs containing a gap or missing data is less than or equal to num_allowed

    Parameters
    ----------
    motifs
        iterable of motifs, each indexable over ``range(motif_length)``
    gap_index
        index value that denotes a gap
    missing_index
        index value that denotes missing data, ignored when None
    motif_length
        number of leading positions examined in each motif
    num_allowed
        maximum count of affected motifs for which True is returned

    Notes
    -----
    A motif is counted once, no matter how many of its positions are a
    gap or missing.
    """
    num_affected = 0
    for motif in motifs:
        for pos in range(motif_length):
            if motif[pos] == gap_index or (
                missing_index is not None and motif[pos] == missing_index
            ):
                num_affected += 1
                # one hit is enough to count this motif
                break

        # the count never decreases, so stop once the limit is exceeded
        if num_affected > num_allowed:
            break

    return num_affected <= num_allowed
183

184

185
class SeqDataView(c3_sequence.SeqView):
    """
    A view onto a single sequence stored in a ``SeqsData`` instance. It
    realises the viewed segment as a string, bytes or numpy array.

    Notes
    -----
    ``str_value`` / ``array_value`` are not complemented, but can be reversed.
    Complementing is not performed by this class.
    """

    __slots__ = ("_parent_len", "_seqid", "_slice_record", "alphabet", "parent")

    @property
    def offset(self) -> int:
        """the annotation offset recorded on the slice record of this view"""
        return self.slice_record.offset

    @property
    def str_value(self) -> str:
        """the viewed segment decoded to a string using the alphabet"""
        indices = self.array_value
        return self.alphabet.from_indices(indices)

    @property
    def array_value(self) -> numpy.ndarray:
        """the viewed segment as a numpy array of alphabet indices"""
        record = self.slice_record
        # data is always requested from the parent in plus strand coordinates
        segment = self.parent.get_seq_array(
            seqid=self.seqid,
            start=record.plus_start,
            stop=record.plus_stop,
            step=record.plus_step,
        )
        # a reversed slice is obtained by flipping the plus strand result
        return segment[::-1] if record.is_reversed else segment

    @property
    def bytes_value(self) -> bytes:
        """the viewed segment as utf8 encoded bytes"""
        text = self.str_value
        return text.encode("utf8")

    def __repr__(self) -> str:
        if len(self) > 15:
            # abbreviate long sequences to their first 10 and last 5 elements
            seq = f"{self[:10]!s}...{self[-5:]}"
        else:
            seq = str(self)
        cls_name = self.__class__.__name__
        return (
            f"{cls_name}(seqid={self.seqid!r}, parent={seq}, "
            f"slice_record={self.slice_record!r})"
        )

    # refactor: design, do we support copy? do we support copy with sliced?
    def copy(self, sliced: bool = False) -> Self:
        """returns self; no new object is created and ``sliced`` is not used"""
        return self

    def to_rich_dict(self) -> dict[str, str | dict[str, str]]:
        """returns a json serialisable dict.

        Notes
        -----
        The "parent" entry of ``init_args`` holds only the segment of the
        sequence between the start and stop of this view, and the
        "slice_record" entry describes a new slice record sized to
        that segment.

        Warnings
        --------
        This method is not intended to provide serialisation of this object,
        instead, it is intended for usage by an enclosing class.
        """
        record = self.slice_record
        init_args = self._get_init_kwargs()

        if record.is_reversed:
            # shift the coordinates of a reversed slice record before slicing
            shift = self.parent_len + 1
            start = record.stop + shift
            stop = record.start + shift
        else:
            start = record.start
            stop = record.stop

        init_args["parent"] = self.str_value[start:stop]
        trimmed_record = c3_sequence.SliceRecord(
            parent_len=(stop - start),
            step=record.step,
            offset=record.parent_start,
        )
        init_args["slice_record"] = trimmed_record.to_rich_dict()
        init_args["alphabet"] = self.alphabet.to_rich_dict()

        return {
            "type": get_object_provenance(self),
            "version": __version__,
            "init_args": init_args,
        }

    @property
    def is_reversed(self) -> bool:
        """whether the view is reversed, allowing for the parent's reversed_seqs"""
        in_reversed_seqs = self.seqid in self.parent.reversed_seqs
        record_reversed = self.slice_record.is_reversed
        # a seqid in reversed_seqs is reversed relative to everything else,
        # hence the result is the opposite of the slice record
        return (not record_reversed) if in_reversed_seqs else record_reversed

    def parent_coords(
        self, *, apply_offset: bool = False, **kwargs
    ) -> tuple[str, int, int, int]:
        """returns coordinates on parent

        Parameters
        ----------
        apply_offset
            if True adds annotation offset from parent

        Returns
        -------
        parent seqid, start, stop, step

        Notes
        -----
        The final element is the step of the slice record. Additional
        keyword arguments are accepted but not used.
        """
        shift = self.parent_offset if apply_offset else 0
        record = self.slice_record
        return (
            self.seqid,
            record.parent_start + shift,
            record.parent_stop + shift,
            record.step,
        )
6✔
299

300

301
class SeqsDataABC(ABC):
    """Abstract base class for representing the storage object for sequences
    underlying a SequenceCollection.

    Notes
    -----
    The ``get_seq_*`` methods take keyword-only ``start``, ``stop`` and
    ``step`` arguments. ``step`` is part of the interface because views
    onto the storage request their data with a step.
    """

    __slots__ = ()

    @abstractmethod
    def __init__(
        self,
        *,
        data: dict[str, StrORBytesORArray],
        alphabet: c3_alphabet.AlphabetABC,
        offset: dict[str, int] | None = None,
        check: bool = True,
        reversed_seqs: set[str] | None = None,
    ) -> None: ...

    @classmethod
    @abstractmethod
    def from_seqs(
        cls,
        *,
        data: dict[str, StrORBytesORArray],
        alphabet: c3_alphabet.AlphabetABC,
        **kwargs,
    ): ...

    @abstractmethod
    def __eq__(self, value: object) -> bool: ...

    @abstractmethod
    def __ne__(self, value: object) -> bool: ...

    @abstractmethod
    def get_seq_length(self, seqid: str) -> int: ...

    @property
    @abstractmethod
    def reversed_seqs(self) -> frozenset[str]: ...

    @property
    @abstractmethod
    def names(self) -> tuple[str, ...]: ...

    @property
    @abstractmethod
    def alphabet(self) -> c3_alphabet.CharAlphabet: ...

    @property
    @abstractmethod
    def offset(self) -> dict[str, int]: ...

    @abstractmethod
    def get_seq_array(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray: ...

    @abstractmethod
    def get_seq_str(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> str: ...

    @abstractmethod
    def get_seq_bytes(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> bytes: ...

    @abstractmethod
    def get_view(self, seqid: str) -> c3_sequence.SeqViewABC: ...

    @abstractmethod
    def to_alphabet(self, alphabet: c3_alphabet.AlphabetABC) -> SeqsDataABC: ...

    @abstractmethod
    def add_seqs(self, seqs, **kwargs) -> SeqsDataABC: ...

    @abstractmethod
    def __len__(self) -> int: ...

    @abstractmethod
    def __getitem__(
        self,
        index: str | int,
    ) -> c3_sequence.Sequence | c3_sequence.SeqViewABC: ...

    @abstractmethod
    def copy(self, **kwargs) -> SeqsDataABC: ...

    @abstractmethod
    def get_hash(self, seqid: str) -> str | None: ...
404

405

406
class SeqsData(SeqsDataABC):
    """The builtin ``cogent3`` implementation of sequence storage underlying
    a ``SequenceCollection``. The sequence data is stored as numpy arrays. Indexing
    this object (using an int or seq name) returns a ``SeqDataView``, which can realise
    the corresponding slice as a string, bytes, or numpy array via the alphabet.

    Notes
    -----
    Methods on this object only accept plus strand start, stop and step
    indices for selecting segments of data.
    """

    __slots__ = ("_alphabet", "_data", "_hashes", "_offset", "_reversed")

    def __init__(
        self,
        *,
        data: dict[str, StrORBytesORArray],
        alphabet: c3_alphabet.CharAlphabet,
        offset: dict[str, int] | None = None,
        check: bool = True,
        reversed_seqs: set[str] | None = None,
    ) -> None:
        """
        Parameters
        ----------
        data
            raw data as {seq name: sequence, ...} where the sequence can be converted
            to a numpy array using the provided alphabet.
        alphabet
            a cogent3 CharAlphabet instance, typically defined as
            <moltype>.most_degen_alphabet()
        offset
            dict indicating annotation offsets for each sequence
        check
            use the alphabet to check the sequences are valid
        reversed_seqs
            names of seqs that are reverse complemented

        Raises
        ------
        AlphabetError if the check fails
        """
        self._alphabet = alphabet
        self._offset = offset or {}
        self._reversed = frozenset(reversed_seqs or set())
        if check:
            assert self._offset.keys() <= data.keys(), (
                "sequence name provided in offset not found in data"
            )
            if any(not alphabet.is_valid(seq) for seq in data.values()):
                msg = f"One or more sequences are invalid for alphabet {alphabet}"
                raise c3_alphabet.AlphabetError(
                    msg,
                )
        self._data: dict[str, numpy.ndarray] = {}
        self._hashes: dict[str, str] = {}
        for name, seq in data.items():
            # the same key is used for both mappings so that get_hash()
            # always finds the entry for a name present in the data
            key = str(name)
            arr = self._alphabet.to_indices(seq)
            self._hashes[key] = array_hash64(arr)
            # stored arrays are made read-only
            arr.flags.writeable = False
            self._data[key] = arr

    def __eq__(self, other: SeqsDataABC) -> bool:
        if not isinstance(other, self.__class__):
            return False
        for attr_name in ("_alphabet", "_offset"):
            self_attr = getattr(self, attr_name)
            other_attr = getattr(other, attr_name)
            if self_attr != other_attr:
                return False

        # compare individual sequences
        if self._data.keys() != other._data.keys():
            return False
        return all(
            numpy.array_equal(self._data[name], other._data[name])
            for name in self._data
        )

    def __ne__(self, other: object) -> bool:
        return not self == other

    @classmethod
    def from_seqs(
        cls,
        *,
        data: dict[str, StrORBytesORArray],
        alphabet: c3_alphabet.AlphabetABC,
        **kwargs,
    ):
        """alternate constructor, kwargs are passed to the class constructor"""
        return cls(data=data, alphabet=alphabet, **kwargs)

    @property
    def names(self) -> tuple[str, ...]:
        """returns the names of the sequences in the storage"""
        return tuple(self._data.keys())

    @property
    def reversed_seqs(self) -> frozenset[str]:
        """names of sequences that are reverse complemented"""
        return self._reversed

    @property
    def alphabet(self) -> c3_alphabet.CharAlphabet:
        """the character alphabet for validating, encoding, decoding sequences"""
        return self._alphabet

    @property
    def offset(self) -> dict[str, int]:
        """annotation offsets for each sequence, 0 for names without one"""
        return {name: self._offset.get(name, 0) for name in self.names}

    def get_seq_length(self, seqid: str) -> int:
        """return length for seqid"""
        return self._data[seqid].shape[0]

    def get_seq_array(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """returns a new, writeable array of the selected segment of seqid

        Parameters
        ----------
        seqid
            name of the sequence
        start, stop, step
            plus strand slice values, defaulting to 0, the sequence length
            and 1 respectively

        Raises
        ------
        ValueError if start or stop are negative, or step is less than 1

        Notes
        -----
        Follows python slice semantics: a stop beyond the end of the
        sequence is truncated and a stop that precedes start gives an
        empty array.
        """
        start = start or 0
        stop = stop if stop is not None else self.get_seq_length(seqid)
        step = step or 1

        if start < 0 or stop < 0 or step < 1:
            msg = f"{start=}, {stop=}, {step=} not >= 1"
            raise ValueError(msg)

        # astype() always returns a fresh array sized to the actual slice, so
        # the read-only stored data is not exposed and no length is pre-computed
        return self._data[seqid][start:stop:step].astype(self.alphabet.dtype)

    def get_seq_str(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> str:
        """returns the selected segment of seqid as a string"""
        return self._alphabet.from_indices(
            self.get_seq_array(seqid=seqid, start=start, stop=stop, step=step),
        )

    def get_seq_bytes(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> bytes:
        """returns the selected segment of seqid as utf8 encoded bytes"""
        return self.get_seq_str(seqid=seqid, start=start, stop=stop, step=step).encode(
            "utf8",
        )

    def get_view(self, seqid: str) -> SeqDataView:
        """returns view of sequence data for seqid"""
        seq_len = len(self._data[seqid])
        return SeqDataView(
            parent=self,
            seqid=seqid,
            parent_len=seq_len,
            alphabet=self.alphabet,
        )

    def add_seqs(
        self,
        seqs: dict[str, StrORBytesORArray],
        force_unique_keys: bool = True,
        offset: dict[str, int] | None = None,
    ) -> SeqsData:
        """Returns a new SeqsData object with added sequences. If force_unique_keys
        is True, raises ValueError if any names already exist in the collection."""
        # membership is tested against the dict rather than rebuilding the
        # names tuple for every candidate name
        if force_unique_keys and any(name in self._data for name in seqs):
            msg = "One or more sequence names already exist in collection"
            raise ValueError(msg)
        new_data = {
            **self._data,
            **{name: self.alphabet.to_indices(seq) for name, seq in seqs.items()},
        }
        return self.copy(
            data=new_data,
            alphabet=self.alphabet,
            offset={**self._offset, **(offset or {})},
        )

    def to_alphabet(
        self,
        alphabet: c3_alphabet.AlphabetABC,
        check_valid: bool = True,
    ) -> SeqsData:
        """returns a new instance with the data expressed in alphabet

        Parameters
        ----------
        alphabet
            the alphabet to convert to
        check_valid
            passed to the alphabet's ``convert_seq_array_to()`` method
        """
        if (
            len(self.alphabet) == len(alphabet)
            and len(
                {
                    (a, b)
                    for a, b in zip(self.alphabet, alphabet, strict=False)
                    if a != b
                },
            )
            == 1
        ):
            # rna <-> dna swap just replace alphabet
            return self.copy(alphabet=alphabet)

        new_data = {}
        for seqid in self.names:
            seq_data = self.get_seq_array(seqid=seqid)
            as_new_alpha = self.alphabet.convert_seq_array_to(
                seq=seq_data,
                alphabet=alphabet,
                check_valid=check_valid,
            )
            new_data[seqid] = as_new_alpha

        return self.copy(
            data=new_data,
            alphabet=alphabet,
            check=False,
        )

    def __len__(self) -> int:
        return len(self.names)

    @singledispatchmethod
    def __getitem__(self, index: str | int) -> c3_sequence.SeqViewABC:
        # fallback for index types that have no registered handler
        msg = f"__getitem__ not implemented for {type(index)}"
        raise NotImplementedError(msg)

    @__getitem__.register
    def _(self, index: str) -> c3_sequence.SeqViewABC:
        return self.get_view(seqid=index)

    @__getitem__.register
    def _(self, index: int) -> c3_sequence.SeqViewABC:
        return self[self.names[index]]

    def copy(self, **kwargs) -> Self:
        """shallow copy of self

        Notes
        -----
        kwargs are passed to constructor and will over-ride existing values
        """
        init_args = {
            "data": self._data,
            "alphabet": self._alphabet,
            "offset": self._offset,
            "reversed_seqs": self._reversed,
            **kwargs,
        }
        return self.__class__(**init_args)

    def get_hash(self, seqid: str) -> str | None:
        """returns hash of seqid, or None if seqid is not present"""
        return self._hashes.get(seqid)
6✔
671

672

673
class SequenceCollection(AnnotatableMixin):
6✔
674
    """A container of unaligned sequences.
675

676
    Notes
677
    -----
678
    Should be constructed using ``make_unaligned_seqs()``.
679
    """
680

681
    def __init__(
        self,
        *,
        seqs_data: SeqsDataABC,
        moltype: c3_moltype.MolType,
        info: dict | InfoClass | None = None,
        source: OptPathType = None,
        annotation_db: SupportsFeatures | None = None,
        name_map: OptDict = None,
        is_reversed: bool = False,
    ) -> None:
        """
        Parameters
        ----------
        seqs_data
            a SeqsDataABC instance containing the sequence data
        moltype
            the molecular type of the sequences
        info
            additional information about the collection
        source
            the source of the sequence data
        annotation_db
            a database of annotations for the sequences
        name_map
            map between the names specified in the collection and names used in
            the underlying seqs_data. Used for when the names have been changed,
            but we want to query for annotations using the original names.
            When not provided (or empty), every name in seqs_data maps to itself.
        is_reversed
            entire collection is reverse complemented
        """
        self._seqs_data = seqs_data
        self.moltype = moltype

        if not name_map:
            # no renaming requested, so each storage name maps to itself
            name_map = {name: name for name in seqs_data.names}
        # exposed through a read-only proxy
        self._name_map = types.MappingProxyType(name_map)

        self._is_reversed = is_reversed

        if isinstance(info, InfoClass):
            self.info = info
        elif info:
            self.info = InfoClass(info)
        else:
            self.info = InfoClass()

        self.source = source
        self._repr_policy = {
            "num_seqs": 10,
            "num_pos": 60,
            "ref_name": "longest",
            "wrap": 60,
        }
        self._annotation_db: list[SupportsFeatures] = self._init_annot_db_value(
            annotation_db
        )
        # placeholder that _post_init() replaces
        self._seqs = None
        self._post_init()
6✔
732

733
    def _post_init(self) -> None:
        """assigns the indexable container of sequences to ``_seqs``"""
        # override in subclasses
        indexable = _IndexableSeqs(self, make_seq=self._make_seq)
        self._seqs = indexable
6✔
736

737
    def __getstate__(self) -> dict:
        """state is the dict of arguments needed to construct a new instance"""
        state = self._get_init_kwargs()
        return state
6✔
739

740
    def __setstate__(self, state: dict) -> None:
        """restores self by constructing a new instance from state

        Notes
        -----
        ``state`` is modified in place: its "name_map" entry is replaced
        by a read-only proxy of the original value.
        """
        proxied = types.MappingProxyType(state.pop("name_map"))
        state["name_map"] = proxied

        # construct a fresh instance and adopt all of its attributes
        rebuilt = self.__class__(**state)
        vars(self).update(vars(rebuilt))
6✔
745

746
    def _make_seq(self, name: str) -> c3_sequence.Sequence:
        """returns the sequence object for name, built from a storage view"""
        # the view is looked up using the name in the underlying storage
        # (which can differ from the current name), while the sequence
        # itself is given the current name
        view = self._seqs_data.get_view(self._name_map.get(name, name))
        view = view[::-1] if self._is_reversed else view
        return self.moltype.make_seq(
            seq=view,
            name=name,
            annotation_db=self._annotation_db,
        )
758

759
    def _get_init_kwargs(self) -> dict:
        """dict of all the arguments needed to initialise a new instance"""
        # both SequenceCollection and Alignment implement _get_init_kwargs,
        # ensuring methods in SequenceCollection that are inherited by Alignment
        # capture initialisation arguments unique to the subclass.
        # name_map and info are copies; the other values are passed as is
        name_map_copy = dict(self._name_map)
        info_copy = self.info.copy()
        return dict(
            seqs_data=self._seqs_data,
            moltype=self.moltype,
            name_map=name_map_copy,
            info=info_copy,
            annotation_db=self._annotation_db,
            source=self.source,
            is_reversed=self._is_reversed,
        )
774

775
    @property
    def storage(self) -> SeqsDataABC:
        """the sequence storage instance underlying the collection"""
        return self._seqs_data

    @storage.setter
    def storage(self, value: object) -> None:
        """always raises TypeError, storage is fixed at initialisation"""
        raise TypeError("storage cannot be set after initialisation")
6✔
785

786
    @property
    def modified(self) -> bool:
        """collection is a modification of underlying storage

        Notes
        -----
        True if any of the following hold: the collection is reversed, the
        set of storage names referenced by the name map differs from the
        names in storage, or the collection names differ from the storage
        names they map to.
        """
        mapped_names = set(self.name_map.values())
        differs_from_storage = mapped_names != set(self.storage.names)
        renamed = self.name_map.keys() != mapped_names
        return any((self._is_reversed, differs_from_storage, renamed))
796

797
    @property
    def seqs(self) -> _IndexableSeqs:
        """the sequences in the collection, as an iterable container

        Notes
        -----
        Can be indexed by a sequence name or integer index.
        Cannot be sliced.

        Returns
        -------
        Instance of ``MolType`` sequence or ``Aligned`` sequence if
        self is an ``Alignment``.
        """
        indexable = self._seqs
        return indexable
6✔
812

813
    @property
    def names(self) -> tuple[str, ...]:
        """the names of the sequences in the collection, in name map order"""
        # iterating a mapping yields its keys
        return tuple(self._name_map)
6✔
817

818
    @property
    def name_map(self) -> types.MappingProxyType:
        """read-only mapping of seq names to parent seq names

        Notes
        -----
        The underlying SeqsData may have different names for the same
        sequences. This object maps the names of sequences in self to
        the names of sequences in SeqsData.
        MappingProxyType is an immutable mapping, so it cannot be
        changed. Use self.rename_seqs() to do that.
        """
        return self._name_map

    @name_map.setter
    def name_map(self, value: OptDict) -> None:  # noqa: ARG002
        """always raises TypeError, name_map can only be set at initialisation"""
        raise TypeError("name_map cannot be set after initialisation")
6✔
837

838
    @property
    def num_seqs(self) -> int:
        """how many sequences the collection holds"""
        seq_names = self.names
        return len(seq_names)
6✔
842

843
    def iter_seqs(
        self,
        seq_order: OptList = None,
    ) -> Iterator[c3_sequence.Sequence | c3_sequence.SeqViewABC]:
        """Iterates over sequences in the collection, in order.

        Parameters
        ----------
        seq_order:
            list of seqids giving the order in which seqs will be returned.
            When None, the sequences are yielded in the order given by
            iterating self.seqs.
        """
        if seq_order is not None:
            # caller dictates the order, each name is looked up individually
            for seqid in seq_order:
                yield self.seqs[seqid]
            return

        yield from self.seqs
6✔
860

861
    def take_seqs(
        self,
        names: str | typing.Sequence[str],
        negate: bool = False,
        copy_annotations: bool = False,
        **kwargs,
    ) -> Self:
        """Returns new collection containing only specified seqs.

        Parameters
        ----------
        names
            sequences to select (or exclude if negate=True). A single name
            can be given as a string. Any other iterable of names is
            accepted, including single-pass iterables such as generators.
        negate
            select all sequences EXCEPT names
        kwargs
            accepted, but not currently forwarded to the constructor of the
            new collection
        copy_annotations
            if True, only annotations from selected seqs are copied to the annotation_db
            of the new collection

        Raises
        ------
        ValueError if the selection is empty, or if a provided name is not
        in the collection.

        Notes
        -----
        The underlying storage is not modified. The new collection shares
        it and records the selected subset of names in its name map.
        """

        # refactor: design, reimplement on Alignment. on which, if self.array_seqs
        # defined, assign result of self._array_seqs.take(subset_name_indices) to
        # resulting alignments _array_seqs attribute

        if isinstance(names, str):
            names = [names]
        elif not isinstance(names, (list, tuple)):
            # names is traversed more than once below, so a single-pass
            # iterable is materialised first. Without this an exhausted
            # generator gives an empty name map, which the constructor
            # replaces with every sequence in storage.
            names = list(names)

        if negate:
            excluded = set(names)
            names = [name for name in self.names if name not in excluded]

        if not names:
            msg = f"{names=} and {negate=} resulted in no names"
            raise ValueError(msg)

        if diff := set(names) - set(self.names):
            msg = f"The following provided names not found in collection: {diff}"
            raise ValueError(msg)

        selected_name_map = {name: self._name_map[name] for name in names}

        init_kwargs = self._get_init_kwargs()
        init_kwargs["name_map"] = selected_name_map
        if self._annotation_db:
            if copy_annotations:
                # a new db of the same type holding only the selected seqids
                ann_db = type(self.annotation_db)()
                ann_db.update(
                    annot_db=self.annotation_db,
                    seqids=list(selected_name_map),
                )
            else:
                ann_db = self.annotation_db
            init_kwargs["annotation_db"] = ann_db

        return self.__class__(**init_kwargs)
6✔
921

922
    def get_seq_names_if(
        self,
        f: Callable[[c3_sequence.Sequence], bool],
        negate: bool = False,
    ) -> list[str]:
        """Names of the sequences that satisfy a condition.

        Parameters
        ----------
        f
            callable applied to each sequence object, its result is
            interpreted as True or False
        negate
            if True, the names of sequences for which f(seq) is NOT True
            are returned instead

        Notes
        -----
        Sequence objects can be converted into strings or numpy arrays using
        str() and numpy.array() respectively.
        """
        seqs = self.seqs
        predicate = negate_condition(f) if negate else f

        selected = []
        for name in self.names:
            if predicate(seqs[name]):
                selected.append(name)
        return selected
946

947
    def take_seqs_if(
        self,
        f: Callable[[c3_sequence.Sequence], bool],
        negate: bool = False,
    ) -> Self:
        """New collection made of the sequences that satisfy a condition.

        Parameters
        ----------
        f
            callable applied to each sequence object, its result is
            interpreted as True or False
        negate
            if True, keep the sequences for which f(seq) is NOT True

        Notes
        -----
        Sequence objects can be converted into strings or numpy arrays using
        str() and numpy.array() respectively.
        """
        # name selection and subsetting are delegated to the sibling methods
        selected_names = self.get_seq_names_if(f, negate)
        return self.take_seqs(selected_names)
967

968
    def get_seq(
        self,
        seqname: str,
        copy_annotations: bool = False,
    ) -> c3_sequence.Sequence:
        """Return the Sequence object named seqname.

        Parameters
        ----------
        seqname
            name of the sequence to return
        copy_annotations
            if True and this collection has an annotation db, a copy of the
            sequence is returned whose own, new, annotation db holds only
            the annotations for seqname. Otherwise the returned sequence is
            bound to this collection's annotation db.
        """
        seq = self.seqs[seqname]
        if not (copy_annotations and self._annotation_db):
            seq.annotation_db = self._annotation_db
            return seq

        # we need to copy the sequence too to break the link to self.annotation_db
        detached = seq.copy(exclude_annotations=True)
        detached.annotation_db = type(self.annotation_db)()
        detached.annotation_db.update(annot_db=self.annotation_db, seqids=seqname)
        return detached
996

997
    def add_seqs(
        self,
        seqs: dict[str, StrORBytesORArray] | SeqsData | list,
        **kwargs,
    ) -> Self:
        """Returns new collection with additional sequences.

        Parameters
        ----------
        seqs
            sequences to add
        kwargs
            passed to the add_seqs method of the underlying sequence
            storage, any "offset" entry is replaced by the offsets
            determined for the new sequences
        """
        namer = _SeqNamer()
        seqs = _make_name_seq_mapping(seqs, namer)
        added_name_map = make_name_map(seqs)
        data, offsets, _ = prep_for_seqs_data(seqs, self.moltype, namer)

        if not added_name_map:
            # no name map was produced, so map each new name onto itself
            added_name_map = {name: name for name in data}

        kwargs["offset"] = offsets
        new_seqs_data = self._seqs_data.add_seqs(data, **kwargs)
        combined_name_map = {**self._name_map, **added_name_map}
        return self.__class__(
            seqs_data=new_seqs_data,
            moltype=self.moltype,
            name_map=combined_name_map,
            info=self.info,
            source=self.source,
            annotation_db=self._annotation_db,
        )
1031

1032
    def rename_seqs(self, renamer: Callable[[str], str]) -> Self:
        """Returns new collection with renamed sequences.

        Parameters
        ----------
        renamer
            callable that takes a name string and returns a string

        Raises
        ------
        ValueError if renamer produces duplicate names.

        Notes
        -----
        The resulting object stores the mapping of new to old names in
        self.name_map.
        """
        # each new name keeps pointing at the parent name already recorded
        # for the old name, so the link survives repeated renaming
        renamed = {
            renamer(name): parent_name for name, parent_name in self._name_map.items()
        }

        if len(renamed) != len(self._name_map):
            msg = f"non-unique names produced by {renamer=}"
            raise ValueError(msg)

        init_kwargs = self._get_init_kwargs()
        init_kwargs["name_map"] = renamed
        return self.__class__(**init_kwargs)
1065

1066
    def to_dict(self, as_array: bool = False) -> dict[str, str | numpy.ndarray]:
        """Return the sequences as a dict keyed by sequence name.

        Parameters
        ----------
        as_array
            if True, each value is numpy.array(seq), otherwise str(seq)
        """
        convert = numpy.array if as_array else str
        return {seq.name: convert(seq) for seq in self.seqs}
1075

1076
    def to_rich_dict(self) -> dict[str, str | dict[str, str]]:
        """returns a json serialisable dict

        Notes
        -----
        Deserialising the object produced by this method will not include the
        annotation_db if present.
        """
        init_args = self._get_init_kwargs()
        init_args["moltype"] = self.moltype.label
        # dropped entries:
        # - is_reversed: reversal is realised
        # - annotation_db: not serialised
        # - offset: no need for offset if we dont have an annotation_db
        # - seqs_data: we serialise the seqs_data directly
        for key in ("is_reversed", "annotation_db", "offset", "seqs_data"):
            init_args.pop(key, None)

        return {
            "init_args": init_args,
            "type": get_object_provenance(self),
            "version": __version__,
            "seqs": {self._name_map[s.name]: str(s) for s in self.seqs},
        }
1102

1103
    @classmethod
    def from_rich_dict(
        cls,
        data: dict[str, str | dict[str, str]],
    ) -> SequenceCollection:
        """returns a new instance built from the "seqs" and "init_args"
        entries of a rich dict"""
        seqs = data["seqs"]
        init_args = data["init_args"]
        return make_unaligned_seqs(seqs, **init_args)
1110

1111
    def to_json(self) -> str:
        """returns the rich dict of self as a json formatted string"""
        rich_dict = self.to_rich_dict()
        return json.dumps(rich_dict)
1114

1115
    def degap(self, storage_backend: str | None = None, **kwargs) -> SequenceCollection:
        """returns collection sequences without gaps or missing characters.

        Parameters
        ----------
        storage_backend
            name of the storage backend to use for the SeqsData object. If not
            provided, the from_seqs constructor of the current storage is used.
        kwargs
            keyword arguments for the storage driver

        Notes
        -----
        The returned collection will not retain an annotation_db if present.
        """
        if not storage_backend:
            make_storage = self._seqs_data.from_seqs
        else:
            driver = cogent3._plugin.get_unaligned_storage_driver(  # noqa: SLF001
                storage_backend,
            )
            make_storage = driver.from_seqs

        degapped = {}
        for name in self.names:
            # because we are in a SequenceCollection, which cannot be sliced, so
            # we can just interrogate the bound _seqs_data directly.
            raw = self._seqs_data.get_seq_array(seqid=self._name_map.get(name, name))
            degapped[self.name_map[name]] = self.moltype.degap(raw)

        init_kwargs = self._get_init_kwargs()
        init_kwargs["seqs_data"] = make_storage(
            data=degapped,
            alphabet=self._seqs_data.alphabet,
            check=False,
            **kwargs,
        )
        return self.__class__(**init_kwargs)
1152

1153
    def to_moltype(self, moltype: MolTypes) -> Self:
        """returns copy of self with changed moltype

        Parameters
        ----------
        moltype
            name of the new moltype, e.g, 'dna', 'rna'.

        Raises
        ------
        MolTypeError if the sequence data cannot be converted to the most
        degenerate alphabet of the new moltype.

        Notes
        -----
        Cannot convert from nucleic acids to proteins. Use get_translation() for that.
        Returns self, not a copy, when the moltype is unchanged.
        """
        new_moltype = c3_moltype.get_moltype(moltype)
        if new_moltype is self.moltype:
            return self  # nothing to be done

        target_alphabet = new_moltype.most_degen_alphabet()
        try:
            converted = self._seqs_data.to_alphabet(target_alphabet)
        except c3_moltype.MolTypeError as e:
            msg = f"Failed to convert moltype from {self.moltype.label} to {moltype}"
            raise c3_moltype.MolTypeError(msg) from e

        init_kwargs = self._get_init_kwargs()
        init_kwargs.update(seqs_data=converted, moltype=new_moltype)
        return self.__class__(**init_kwargs)
1184

1185
    def to_dna(self) -> Self:
        """returns the result of converting self to the DNA moltype"""
        target = "dna"
        return self.to_moltype(target)
1188

1189
    def to_rna(self) -> Self:
        """returns the result of converting self to the RNA moltype"""
        target = "rna"
        return self.to_moltype(target)
1192

1193
    def get_translation(
        self,
        gc: c3_genetic_code.GeneticCodeChoiceType = 1,
        incomplete_ok: bool = False,
        include_stop: bool = False,
        trim_stop: bool = True,
        **kwargs,
    ) -> Self:
        """translate sequences from nucleic acid to protein

        Parameters
        ----------
        gc
            genetic code, either the number or name
            (use cogent3.core.genetic_code.available_codes)
        incomplete_ok
            codons that are mixes of nucleotide and gaps converted to '?'.
            raises a ValueError if False
        include_stop
            whether to allow a stops in the translated sequence. Also selects
            the moltype of the result, "protein_with_stop" if True, otherwise
            "protein"
        trim_stop
            exclude terminal stop codons if they exist
        kwargs
            related to construction of the resulting object

        Returns
        -------
        A new instance of self translated into protein

        Raises
        ------
        MolTypeError if the moltype of self is not nucleic

        Notes
        -----
        Translating will break the relationship to an annotation_db if present.
        """
        if not self.moltype.is_nucleic:
            msg = f"moltype must be a DNA/RNA, not {self.moltype.name!r}"
            raise c3_moltype.MolTypeError(msg)

        options = {
            "incomplete_ok": incomplete_ok,
            "include_stop": include_stop,
            "trim_stop": trim_stop,
        }
        peptides = {}
        for seq in self.seqs:
            pep = seq.get_translation(gc, **options)
            peptides[self.name_map[seq.name]] = numpy.array(pep)

        if include_stop:
            moltype_name = "protein_with_stop"
        else:
            moltype_name = "protein"
        pep_moltype = c3_moltype.get_moltype(moltype_name)

        new_seqs_data = self._seqs_data.from_seqs(
            data=peptides,
            alphabet=pep_moltype.most_degen_alphabet(),
        )
        return self.__class__(
            seqs_data=new_seqs_data,
            moltype=pep_moltype,
            name_map=self._name_map,
            info=self.info,
            source=self.source,
            **kwargs,
        )
1257

1258
    def rc(self) -> Self:
        """Returns the reverse complement of all sequences in the collection.
        A synonym for reverse_complement.
        """
        # the new instance differs from self only in its is_reversed flag
        init_kwargs = self._get_init_kwargs()
        init_kwargs.update(is_reversed=not self._is_reversed)
        return self.__class__(**init_kwargs)
1265

1266
    def reverse_complement(self) -> Self:
        """Returns the reverse complement of all sequences in the collection.
        A synonym for rc.
        """
        flipped = self.rc()
        return flipped
1271

1272
    def distance_matrix(self, calc: str = "pdist") -> DistanceMatrix:
        """Estimated pairwise distance between sequences

        Parameters
        ----------
        calc
            The distance calculation method to use, either "pdist" or "jc69".
            - "pdist" is an approximation of the proportion sites different.
            - "jc69" is an approximation of the Jukes Cantor distance.

        Returns
        -------
        DistanceMatrix
            Estimated pairwise distances between sequences in the collection

        Raises
        ------
        NotImplementedError if the moltype alphabet does not have 4 states.
        ValueError if the collection holds a single sequence.

        Notes
        -----
        pdist approximates the proportion sites different from the Jaccard
        distance. Coefficients for the approximation were derived from a
        polynomial fit between Jaccard distance of kmers with k=10 and the
        proportion of sites different using mammalian 106 protein coding
        gene DNA sequence alignments.

        jc69 approximates the Jukes Cantor distance using the approximated
        proportion sites different, i.e., a transformation of the above.
        """
        from cogent3.app.dist import get_approx_dist_calc

        # check moltype
        num_states = len(self.moltype.alphabet)
        if num_states != 4:
            msg = "only defined for DNA/RNA molecular types"
            raise NotImplementedError(msg)

        # assert we have more than one sequence in the SequenceCollection
        if self.num_seqs == 1:
            msg = (
                "Pairwise distance cannot be computed for a single sequence. "
                "Please provide at least two sequences."
            )
            raise ValueError(msg)

        calculator = get_approx_dist_calc(dist=calc, num_states=num_states)
        return calculator(self)
1321

1322
    def copy_annotations(self, seq_db: SupportsFeatures) -> None:
        """copy annotations into attached annotation db

        Parameters
        ----------
        seq_db
            compatible annotation db

        Raises
        ------
        TypeError if seq_db does not match the SupportsFeatures interface.

        Notes
        -----
        Nothing is done unless seq_db has a record for at least one seqid in
        self.names. An annotation db of the same type as seq_db is attached
        first if self has none.
        """
        if not isinstance(seq_db, SupportsFeatures):
            msg = f"type {type(seq_db)} does not match SupportsFeatures interface"
            raise TypeError(msg)

        # stops at the first of our names that has a record in seq_db
        has_match = any(seq_db.num_matches(seqid=seqid) > 0 for seqid in self.names)
        if not has_match:
            # no matching ID's, nothing to do
            return

        if not self._annotation_db:
            self.replace_annotation_db(type(seq_db)())

        if not self.annotation_db.compatible(seq_db, symmetric=False):
            # we use the union method to define a new one
            # the setter handles propagation of the new instance to bound
            # sequences
            self.annotation_db = self.annotation_db.union(seq_db)
            return

        # our db contains the tables in other, so we update in place
        self.annotation_db.update(annot_db=seq_db, seqids=self.names)
1360

1361
    def make_feature(
        self,
        *,
        feature: FeatureDataType,
    ) -> Feature[Alignment]:
        """
        create a feature on the sequence named by feature["seqid"]

        Parameters
        ----------
        feature
            a dict with all the necessary data to construct a feature, its
            "seqid" entry selects the sequence that makes the feature

        Returns
        -------
        Feature

        Notes
        -----
        To get a feature AND add it to annotation_db, use add_feature().
        """
        target_seq = self.seqs[feature["seqid"]]
        return target_seq.make_feature(feature)
1383

1384
    def add_feature(
        self,
        *,
        seqid: str,
        biotype: str,
        name: str,
        spans: list[tuple[int, int]],
        parent_id: OptStr = None,
        strand: str | int = "+",
    ) -> Feature[Alignment]:
        """
        add feature on named sequence

        Parameters
        ----------
        seqid
            seq name to associate with
        parent_id
            name of the parent feature
        biotype
            biological type
        name
            feature name
        spans
            plus strand coordinates
        strand
            either '+' or '-'

        Returns
        -------
        Feature

        Raises
        ------
        ValueError if seqid is not in self.names
        """
        if seqid and seqid not in self.names:
            msg = f"unknown {seqid=}"
            raise ValueError(msg)

        # the record is spelt out explicitly rather than harvested from
        # locals(), so a local variable introduced in this method can never
        # leak into the data handed to the annotation db
        feature = {
            "seqid": seqid,
            "biotype": biotype,
            "name": name,
            "spans": spans,
            "parent_id": parent_id,
            "strand": Strand.from_value(strand).value,
        }
        # property ensures db is created
        self.annotation_db.add_feature(**feature)
        # parent_id is stored in the db only, it is not given to make_feature
        feature.pop("parent_id", None)
        return self.make_feature(feature=feature)
1426

1427
    def get_features(
        self,
        *,
        seqid: str | Iterator[str] | None = None,
        biotype: OptStr = None,
        name: OptStr = None,
        start: OptInt = None,
        stop: OptInt = None,
        allow_partial: bool = False,
        **kwargs,
    ) -> Iterator[Feature[Alignment]]:
        """yields Feature instances

        Parameters
        ----------
        seqid
            limit search to features on this named sequence, or on each of the
            sequences named by an iterable of names. Defaults to search all
        biotype
            biotype of the feature, e.g. CDS, gene
        name
            name of the feature
        start
            start position of the feature (not inclusive)
        stop
            stop position of the feature (inclusive)
        allow_partial
            allow features partially overlaping self
        kwargs
            additional keyword arguments to query the annotation db

        Raises
        ------
        ValueError if a requested seqid is not in self.names

        Notes
        -----
        - When dealing with a nucleic acid moltype, the returned features will
        yield a sequence segment that is consistently oriented irrespective
        of strand of the current instance.
        - start is non-inclusive, so if allow_partial is False, only features
        strictly starting after start will be returned.
        - Nothing is yielded if the collection has no annotation db.
        """

        if not self._annotation_db:
            return

        if not seqid:
            selected = self.names
        else:
            # a single name, or any iterable of names, becomes a list so it
            # can be validated in full before any feature is yielded
            selected = [seqid] if isinstance(seqid, str) else list(seqid)
            known = set(self.names)
            for candidate in selected:
                if candidate not in known:
                    msg = f"unknown seqid={candidate!r}"
                    raise ValueError(msg)

        for seq_name in selected:
            yield from self.seqs[seq_name].get_features(
                biotype=biotype,
                name=name,
                start=start,
                stop=stop,
                allow_partial=allow_partial,
                **kwargs,
            )
1484

1485
    def to_fasta(self, block_size: int = 60) -> str:
        """Return collection in Fasta format.

        Parameters
        ----------
        block_size
            the sequence length to write to each line,
            by default 60

        Returns
        -------
        The collection in Fasta format.
        """
        # formatting is delegated to the writer plugin registered for "fasta"
        writer = cogent3._plugin.get_seq_format_writer_plugin(format_name="fasta")  # noqa: SLF001
        return writer.formatted(self, block_size=block_size)
1500

1501
    def write(self, filename: str, format_name: OptStr = None, **kwargs) -> None:
        """Write the sequences to a file, preserving order of sequences.

        Parameters
        ----------
        filename
            name of the sequence file
        format_name
            format of the sequence file
        kwargs
            passed to the write method of the format writer, "order" is set
            to self.names if not provided

        Notes
        -----

        If format_name is None, will attempt to infer format from the filename
        suffix.
        """

        suffix, _compression = get_format_suffixes(filename)
        format_name = suffix if (format_name is None and suffix) else format_name

        if format_name == "json":
            # json is produced by self, no format writer plugin is involved
            with atomic_write(filename, mode="wt") as f:
                f.write(self.to_json())
            return

        if "order" not in kwargs:
            kwargs["order"] = self.names

        # exact type comparison, so instances of subclasses are not flagged
        # as unaligned
        is_unaligned = type(self) == SequenceCollection
        writer = cogent3._plugin.get_seq_format_writer_plugin(  # noqa: SLF001
            format_name=format_name,
            file_suffix=suffix,
            unaligned_seqs=is_unaligned,
        )
        writer.write(seqcoll=self, path=filename, **kwargs)
1536

1537
    def dotplot(
        self,
        name1: OptStr = None,
        name2: OptStr = None,
        window: int = 20,
        threshold: OptInt = None,
        k: OptInt = None,
        min_gap: int = 0,
        width: int = 500,
        title: OptStr = None,
        rc: bool = False,
        biotype: str | tuple[str] | None = None,
        show_progress: bool = False,
    ):
        """make a dotplot between specified sequences. Random sequences
        chosen if names not provided.

        Parameters
        ----------
        name1, name2
            names of sequences -- if not provided, a random choice is made.
            Ignored when the collection has a single sequence, which is then
            compared with itself.
        window
            segment size for comparison between sequences
        threshold
            windows where the sequences are identical >= threshold are a match
        k
            size of k-mer to break sequences into. Larger values increase
            speed but reduce resolution. If not specified, and
            window == threshold, then k is set to window. Otherwise, it is
            computed as the maximum of {threshold // (window - threshold), 5}.
        min_gap
            permitted gap for joining adjacent line segments, default is no gap
            joining
        width
            figure width. Figure height is computed based on the ratio of
            len(seq1) / len(seq2)
        title
            title for the plot
        rc
            include dotplot of reverse complement also. Only applies to Nucleic
            acids moltypes
        biotype
            if selected sequences are annotated, display only these biotypes
        show_progress
            passed to the Dotplot constructor

        Returns
        -------
        a Drawable or AnnotatedDrawable

        Raises
        ------
        ValueError if k is not within (0, window), or if a sequence name is
        not in the collection.
        """
        from cogent3.draw.dotplot import Dotplot
        from cogent3.draw.drawable import AnnotatedDrawable

        rng = numpy.random.default_rng()
        if k is not None and not (0 < k < window):
            msg = f"{k=} must be positive, < {window=} and < {threshold=}"
            raise ValueError(msg)

        all_names = self.names
        if len(all_names) == 1:
            name1 = name2 = all_names[0]
        elif name1 is None and name2 is None:
            name1, name2 = rng.choice(all_names, size=2, replace=False).tolist()
        elif not (name1 and name2):
            # a name is missing, it is drawn from the collection names other
            # than the ones supplied
            candidates = list({*all_names, None} ^ {name1, name2})
            picked = next(iter(rng.choice(candidates, size=1))).item()
            name1 = name1 or picked
            name2 = name2 or picked

        if not {name1, name2} <= set(all_names):
            msg = f"{name1}, {name2} missing"
            raise ValueError(msg)

        seq1, seq2 = self.seqs[name1], self.seqs[name2]

        # annotation tracks are drawn only if the db has matching records
        # for at least one of the two sequences
        annotated = bool(self._annotation_db) and any(
            self.annotation_db.num_matches(seqid=self._name_map[n], biotype=biotype)
            for n in [name1, name2]
        )

        plot = Dotplot(
            seq1,
            seq2,
            isinstance(self, Alignment),
            window=window,
            threshold=threshold,
            k=k,
            min_gap=min_gap,
            # when annotated, axis titles are set on the AnnotatedDrawable
            xtitle=None if annotated else seq1.name,
            ytitle=None if annotated else seq2.name,
            title=title,
            moltype=self.moltype,
            rc=rc,
            show_progress=show_progress,
            width=width,
        )
        if not annotated:
            return plot

        # use the .seq attribute of each object when it has one
        bottom = getattr(seq1, "seq", seq1).get_drawable(biotype=biotype)
        left = getattr(seq2, "seq", seq2).get_drawable(biotype=biotype, vertical=True)
        return AnnotatedDrawable(
            plot,
            left_track=left,
            bottom_track=bottom,
            xtitle=seq1.name,
            ytitle=seq2.name,
            title=title,
            xrange=[0, len(seq1)],
            yrange=[0, len(seq2)],
        )
1650

1651
    @UI.display_wrap
    def apply_pssm(
        self,
        pssm: PSSM = None,
        path: OptStr = None,
        background: numpy.ndarray = None,
        pseudocount: int = 0,
        names: OptList = None,
        ui=None,
    ) -> numpy.array:  # refactor: design: move to rich for progress bars?
        """scores sequences using the specified pssm

        Parameters
        ----------
        pssm :
            A profile.PSSM instance, if not provided, will be loaded from path
        path
            path to either a jaspar or cisbp matrix (path must have a suffix
            matching the format).
        background
            background frequencies distribution, used when loading from path
        pseudocount
            adjustment for zero in matrix, used when loading from path
        names
            returns only scores for these sequences and in the name order,
            a single name can be given as a string

        Returns
        -------
        numpy array of log2 based scores at every position

        Notes
        -----
        Exactly one of pssm and path must be given, and all sequences must
        have the same length. These conditions are checked with assert
        statements.
        """
        assert not self.is_ragged(), "all sequences must have same length"
        assert pssm or path, "Must specify a PSSM or a path"
        assert not (pssm and path), "Can only specify one of pssm, path"

        if isinstance(names, str):
            names = [names]

        if path:
            pssm = load_pssm(path, background=background, pseudocount=pseudocount)

        # the motifs of the pssm must be exactly the states of our moltype
        assert set(pssm.motifs) == set(self.moltype)

        if names:
            selected = [self.seqs[n] for n in names]
        else:
            selected = self.seqs

        return numpy.array([pssm.score_seq(seq) for seq in ui.series(selected)])
1697

1698
    def get_ambiguous_positions(self):
        """Returns dict of seq:{position:char} for ambiguous chars.

        A position is reported when its index in the most degenerate
        alphabet of the moltype is greater than the gap index.

        Used in likelihood calculations.
        """
        alpha = self.moltype.most_degen_alphabet()
        positions = {}
        for name in self.names:
            indices = numpy.array(self.seqs[name])
            positions[name] = {
                i: alpha[indices[i]]
                for i in numpy.where(indices > alpha.gap_index)[0]
            }
        return positions
1711

1712
    def trim_stop_codons(
        self, gc: c3_genetic_code.GeneticCodeChoiceType = 1, strict: bool = False
    ) -> Self:
        """Removes any terminal stop codons from the sequences

        Parameters
        ----------
        gc
            valid input to cogent3.get_code(), a genetic code object, number
            or name, defaults to standard code
        strict
            If True, raises an exception if a seq length not divisible by 3

        Notes
        -----
        Returns self if no sequence has a terminal stop codon.
        """
        if not self.has_terminal_stop(gc=gc, strict=strict):
            return self

        trimmed = {}
        for seq in self.seqs:
            key = self.name_map[seq.name]
            trimmed[key] = seq.trim_stop_codon(gc=gc, strict=strict).to_array(
                apply_transforms=False,
            )

        new_seqs_data = self._seqs_data.from_seqs(
            data=trimmed,
            alphabet=self._seqs_data.alphabet,
            offset=self._seqs_data.offset,
            check=False,
        )
        init_kwargs = self._get_init_kwargs()
        init_kwargs.update(seqs_data=new_seqs_data, annotation_db=self._annotation_db)
        return self.__class__(**init_kwargs)
1744

1745
    def counts_per_seq(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        exclude_unobserved: bool = False,
        warn: bool = False,
    ) -> MotifCountsArray:  # refactor: using array
        """counts of motifs per sequence

        Parameters
        ----------
        motif_length
            number of characters per tuple.
        include_ambiguity
            if True, motifs containing ambiguous characters from the seq moltype
            are included. No expansion of those is attempted.
        allow_gap
            if True, motifs containing a gap character are included.
        exclude_unobserved
            accepted for signature compatibility, not used by this method.
            The columns are always the motifs observed in the sequences.
        warn
            warns if motif_length > 1 and collection trimmed to produce motif
            columns.

        Notes
        -----

        only non-overlapping motifs are counted
        """
        per_seq = [
            self.get_seq(seqid).counts(
                motif_length=motif_length,
                include_ambiguity=include_ambiguity,
                allow_gap=allow_gap,
                warn=warn,
            )
            for seqid in self.names
        ]
        observed = set()
        for seq_counts in per_seq:
            observed.update(seq_counts.keys())

        # use motifs from moltype if empty sequences
        motifs = sorted(observed) or sorted(self.moltype)
        rows = [seq_counts.tolist(motifs) for seq_counts in per_seq]
        return MotifCountsArray(rows, motifs, row_indices=self.names)
6✔
1790

1791
    def counts(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        exclude_unobserved: bool = False,
    ) -> MotifCountsArray:
        """counts of motifs

        Parameters
        ----------
        motif_length
            number of characters per motif.
        include_ambiguity
            if True, motifs containing ambiguous characters from the seq moltype
            are included. No expansion of those is attempted.
        allow_gap
            if True, motifs containing a gap character are included.
        exclude_unobserved
            if True, unobserved motif combinations are excluded.

        Notes
        -----

        only non-overlapping motifs are counted. The result is the motif
        totals of the per sequence counts.
        """
        options = {
            "motif_length": motif_length,
            "include_ambiguity": include_ambiguity,
            "allow_gap": allow_gap,
            "exclude_unobserved": exclude_unobserved,
        }
        return self.counts_per_seq(**options).motif_totals()
6✔
1824

1825
    def count_ambiguous_per_seq(self) -> DictArray:
        """Counts of ambiguous characters per sequence.

        Notes
        -----
        The counts are stored as unsigned 32-bit integers and keyed by the
        sequence names.
        """
        template = DictArrayTemplate(self.names)
        per_seq = [self.seqs[seqid].count_ambiguous() for seqid in self.names]
        return template.wrap(numpy.array(per_seq, dtype=numpy.uint32))
6✔
1835

1836
    def get_motif_probs(
        self,
        alphabet: c3_alphabet.AlphabetABC = None,
        include_ambiguity: bool = False,
        exclude_unobserved: bool = False,
        allow_gap: bool = False,
        pseudocount: int = 0,
    ) -> dict:  # refactor: using array
        """Return a dictionary of motif probs, calculated as the averaged
        frequency across sequences.

        Parameters
        ----------
        alphabet
            alphabet to use for motifs. If None, the moltype alphabet is
            used, or its gapped alphabet when allow_gap is True.
        include_ambiguity
            if True resolved ambiguous codes are included in estimation of
            frequencies, their count being shared equally between the
            motifs they resolve to.
        exclude_unobserved
            if True, motifs that are not present in the alignment are excluded
            from the returned dictionary.
        allow_gap
            allow gap motif
        pseudocount
            value to add to each count

        Notes
        -----

        only non-overlapping motifs are counted

        A ZeroDivisionError results if motifs are reported but their counts
        (including pseudocount) sum to zero.
        """
        moltype = self.moltype
        if alphabet is None:
            alphabet = moltype.gapped_alphabet if allow_gap else moltype.alphabet

        motif_len = alphabet.motif_len
        observed = Counter()
        for seqid in self.names:
            motifs = self.seqs[seqid]
            if motif_len > 1:
                # split into consecutive, non-overlapping, complete motifs
                stop = len(motifs) + 1 - motif_len
                motifs = [
                    str(motifs[start : start + motif_len])
                    for start in range(0, stop, motif_len)
                ]
            for motif in motifs:
                if allow_gap or self.moltype.gap not in motif:
                    observed[motif] += 1

        # every alphabet motif starts at pseudocount unless excluded
        probs = {} if exclude_unobserved else dict.fromkeys(alphabet, pseudocount)

        for observed_motif, count in observed.items():
            resolved = moltype.resolve_ambiguity(observed_motif, alphabet=alphabet)
            if len(resolved) > 1:
                if not include_ambiguity:
                    continue
                # share the count equally among the possible motifs
                count = float(count) / len(resolved)
            for motif in resolved:
                probs[motif] = probs.get(motif, pseudocount) + count

        total = float(sum(probs.values()))
        return {motif: value / total for motif, value in probs.items()}
6✔
1908

1909
    def probs_per_seq(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        exclude_unobserved: bool = False,
        warn: bool = False,
    ) -> MotifFreqsArray:
        """return frequency array of motifs per sequence

        Parameters
        ----------
        motif_length
            number of characters per motif
        include_ambiguity
            if True, include motifs containing ambiguous characters
        allow_gap
            if True, include motifs containing a gap character
        exclude_unobserved
            if True, exclude motifs not present in the sequences in
            the resulting array
        warn
            warns if motif_length > 1 and collection trimmed to produce motif
            columns.

        Notes
        -----
        All arguments are forwarded to counts_per_seq(). Returns None if
        that produces None.
        """
        per_seq_counts = self.counts_per_seq(
            motif_length=motif_length,
            include_ambiguity=include_ambiguity,
            allow_gap=allow_gap,
            exclude_unobserved=exclude_unobserved,
            warn=warn,
        )
        if per_seq_counts is None:
            return None
        return per_seq_counts.to_freq_array()
6✔
1943

1944
    def entropy_per_seq(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        exclude_unobserved: bool = True,
        warn: bool = False,
    ) -> numpy.ndarray:
        """Returns the Shannon entropy per sequence.

        Parameters
        ----------
        motif_length: int
            number of characters per tuple.
        include_ambiguity: bool
            if True, motifs containing ambiguous characters
            from the seq moltype are included. No expansion of those is attempted.
        allow_gap: bool
            if True, motifs containing a gap character are included.
        exclude_unobserved: bool
            if True, unobserved motif combinations are excluded.
        warn
            warns if motif_length > 1 and alignment trimmed to produce
            motif columns

        Notes
        -----
        For motif_length > 1, it's advisable to specify exclude_unobserved=True,
        this avoids unnecessary calculations.

        The entropy is computed from the result of probs_per_seq(). Returns
        None if that produces None.
        """
        freqs = self.probs_per_seq(
            motif_length=motif_length,
            include_ambiguity=include_ambiguity,
            allow_gap=allow_gap,
            exclude_unobserved=exclude_unobserved,
            warn=warn,
        )
        if freqs is None:
            return None
        return freqs.entropy()
6✔
1983

1984
    def get_lengths(
        self,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
    ) -> dict[str, int]:
        """returns sequence lengths as a dict of {seqid: length}

        Parameters
        ----------
        include_ambiguity
            if True, motifs containing ambiguous characters
            from the seq moltype are included. No expansion of those is attempted.
        allow_gap
            if True, motifs containing a gap character are included.

        Notes
        -----
        Lengths are the row sums of the single character counts per
        sequence, so characters excluded by the arguments do not contribute.
        """
        single_char_counts = self.counts_per_seq(
            motif_length=1,
            include_ambiguity=include_ambiguity,
            allow_gap=allow_gap,
        )
        return single_char_counts.row_sum()
6✔
2006

2007
    def pad_seqs(self, pad_length: OptInt = None):
        """Returns copy in which sequences are padded with the gap character to same length.

        Parameters
        ----------
        pad_length
            Length all sequences are to be padded to. Will pad to max sequence
            length if pad_length is None or less than max length.
        """
        storage = self._seqs_data
        longest = max(storage.get_seq_length(seqid) for seqid in self._name_map.values())
        target_length = pad_length
        if target_length is None or target_length < longest:
            target_length = longest

        padded = {}
        for seq in self.seqs:
            # start from an all-gap array, then overwrite the leading
            # positions with the sequence itself
            gap_filled = numpy.full(
                shape=target_length,
                fill_value=self.moltype.gapped_alphabet.gap_index,
                dtype=self.moltype.most_degen_alphabet().dtype,
            )
            gap_filled[: len(seq)] = numpy.array(seq)
            # keys must be the seqids of the original seqs_data, the
            # name_map records where these differ from the seq names
            padded[self.name_map[seq.name]] = gap_filled

        kwargs = self._get_init_kwargs()
        kwargs["seqs_data"] = storage.from_seqs(
            data=padded,
            alphabet=storage.alphabet,
            offset=storage.offset,
        )
        # sequences obtained from .seqs already have any reversal of the
        # collection applied, so the returned collection is not reversed
        kwargs["is_reversed"] = False
        return self.__class__(**kwargs)
6✔
2048

2049
    def strand_symmetry(self, motif_length: int = 1):
        """returns dict of strand symmetry test results per seq

        Parameters
        ----------
        motif_length
            passed to the strand_symmetry() method of each sequence
        """
        results = {}
        for seq in self.seqs:
            results[seq.name] = seq.strand_symmetry(motif_length=motif_length)
        return results
6✔
2052

2053
    def is_ragged(self) -> bool:
        """returns True if sequences are of different lengths"""
        distinct_lengths = {
            self._seqs_data.get_seq_length(seqid) for seqid in self._name_map.values()
        }
        return len(distinct_lengths) > 1
2059

2060
    def has_terminal_stop(
        self, gc: c3_genetic_code.GeneticCodeChoiceType = 1, strict: bool = False
    ) -> bool:
        """Returns True if any sequence has a terminal stop codon.

        Parameters
        ----------
        gc
            valid input to cogent3.get_code(), a genetic code object, number
            or name
        strict
            If True, raises an exception if a seq length not divisible by 3

        Notes
        -----
        Sequences are checked in name order, stopping at the first one
        with a terminal stop codon.
        """
        return any(
            self.seqs[seqid].has_terminal_stop(gc=gc, strict=strict)
            for seqid in self.names
        )
6✔
2079

2080
    def get_identical_sets(
        self,
        mask_degen: bool = False,
    ) -> list[set]:  # refactor: array/simplify
        """returns sets of names for sequences that are identical

        Parameters
        ----------
        mask_degen
            if True, degenerate characters are ignored

        Raises
        ------
        ValueError if the sequences are not all the same length

        Notes
        -----
        If mask_degen is True but the moltype has no degenerate characters,
        a UserWarning is issued and exact matching is used.

        With mask_degen, two sequences are compared after dropping every
        position that is degenerate (or a gap) in either of them.
        """
        if self.is_ragged():
            msg = "not all seqs same length, cannot get identical sets"
            raise ValueError(msg)

        if mask_degen and not self.moltype.degen_alphabet:
            warnings.warn(
                "in get_identical_sets, mask_degen has no effect as moltype "
                f"{self.moltype.label!r} has no degenerate characters",
                UserWarning,
                stacklevel=2,
            )
            mask_degen = False

        seqs = self.to_dict()
        if not mask_degen:
            # exact matching: once sorted by sequence, identical sequences
            # are adjacent, so one pass over neighbouring pairs finds them
            ordered = sorted((seq, name) for name, seq in seqs.items())
            dupes = defaultdict(set)
            anchor = None  # name of the first member of the current run
            for (seq, name), (next_seq, next_name) in zip(ordered, ordered[1:]):
                if seq != next_seq:
                    anchor = None
                    continue
                if anchor is None:
                    anchor = name
                dupes[anchor].update((anchor, next_name))
            return list(dupes.values())

        # sets give constant time membership tests when filtering positions
        masked = {
            name: frozenset(
                self.moltype.get_degenerate_positions(seq, include_gap=True)
            )
            for name, seq in seqs.items()
        }

        def unmasked(seq, excluded):
            return "".join(c for i, c in enumerate(seq) if i not in excluded)

        names = self.names
        identical_sets = []
        seen = set()
        for i, n1 in enumerate(names[:-1]):
            if n1 in seen:
                continue

            group = set()
            for n2 in names[i + 1 :]:
                if n2 in seen:
                    continue

                # always reduce the ORIGINAL sequences: the positions index
                # the full-length strings, so a previously reduced copy
                # must not be carried over to the next comparison
                excluded = masked[n1] | masked[n2]
                if unmasked(seqs[n1], excluded) == unmasked(seqs[n2], excluded):
                    seen.add(n2)
                    group.update((n1, n2))

            if group:
                identical_sets.append(group)

        return identical_sets
6✔
2159

2160
    def get_similar(
        self,
        target: c3_sequence.Sequence,
        min_similarity: float = 0.0,
        max_similarity: float = 1.0,
        metric: Callable[
            [c3_sequence.Sequence, c3_sequence.Sequence],
            float,
        ] = c3_sequence.frac_same,
        transform: Callable[[c3_sequence.Sequence], typing.Any] | None = None,
    ) -> SequenceCollection:
        """Returns new SequenceCollection containing sequences similar to target.

        Parameters
        ----------
        target
            sequence object to compare to. Can be in the collection.
        min_similarity
            minimum similarity that will be kept. Default 0.0.
        max_similarity
            maximum similarity that will be kept. Default 1.0.
        metric
            a similarity function to use. Must be f(first_seq, second_seq).
            The default metric is fraction similarity, ranging from 0.0 (0%
            identical) to 1.0 (100% identical). The Sequence class have lots
            of methods that can be passed in as unbound methods to act as the
            metric, e.g. frac_same_gaps.
        transform
            transformation function to use on the sequences before the metric
            is calculated. If None, uses the whole sequences in each case. A
            frequent transformation is a function that returns a specified range
            of a sequence, e.g. eliminating the ends. Note that the transform
            applies to both the real sequence and the target sequence.

        Notes
        -----
        both min_similarity and max_similarity are inclusive.

        Warnings
        --------
        if the transformation changes the type of the sequence (e.g. extracting
        a string from an RnaSequence object), distance metrics that depend on
        instance data of the original class may fail.
        """
        if transform:
            # transform the target once, up front
            target = transform(target)

        def within_bounds(seq) -> bool:
            candidate = transform(seq) if transform else seq
            return min_similarity <= metric(target, candidate) <= max_similarity

        return self.take_seqs_if(within_bounds)
6✔
2223

2224
    def __str__(self) -> str:
        """Returns self in FASTA-format, respecting name order."""
        # imported here rather than at module level
        from cogent3.format.sequence import FORMATTERS

        to_fasta = FORMATTERS["fasta"]
        return to_fasta(self.to_dict())
6✔
2229

2230
    def __eq__(self, other: object) -> bool:
        """True if other is an instance of this class with equal init kwargs

        Notes
        -----
        The "annotation_db" and "slice_record" init kwargs are not compared.
        """
        if not isinstance(other, self.__class__):
            return False

        ignored = ("annotation_db", "slice_record")
        mine = self._get_init_kwargs()
        theirs = other._get_init_kwargs()
        return not any(
            value != theirs.get(key)
            for key, value in mine.items()
            if key not in ignored
        )
6✔
2242

2243
    def __ne__(self, other: object) -> bool:
        """the negation of the == comparison"""
        return not (self == other)
6✔
2245

2246
    def __repr__(self) -> str:
        """summary showing the start of the shortest and longest sequences"""
        limit = 10
        names = self.names

        def seq_length(name):
            return len(self.seqs[name])

        shown_names = [min(names, key=seq_length)]
        if len(names) > 1:
            # In case of a tie, min and max return first.
            # reversed ensures if all seqs are of same length, different seqs are returned
            shown_names.append(max(reversed(names), key=seq_length))

        pieces = []
        for name in shown_names:
            text = str(self.seqs[name])
            # sequences longer than limit are cut and marked with an ellipsis
            preview = (text[:limit] + "...") if len(text) > limit else text
            pieces.append(f"{name}[{preview}]")

        if len(names) > 2:
            # indicate that sequences between the two shown are omitted
            pieces.insert(1, "...")

        joined = ", ".join(pieces)
        return f"{len(names)}x {self.moltype.label} seqcollection: ({joined})"
6✔
2271

2272
    def _repr_html_(self) -> str:
        """html display of self according to the repr policy

        Notes
        -----
        Values from the COGENT3_ALIGNMENT_REPR_POLICY environment variable
        take precedence over the instance repr policy.
        """
        policy = {
            **self._repr_policy,
            **get_setting_from_environ(
                "COGENT3_ALIGNMENT_REPR_POLICY",
                {"num_seqs": int, "num_pos": int, "wrap": int},
            ),
        }
        shown_names = self.names[: policy["num_seqs"]]
        return self.to_html(
            name_order=shown_names,
            limit=policy["num_pos"],
            wrap=policy["wrap"],
        )
2284

2285
    def to_html(
        self,
        name_order: typing.Sequence[str] | None = None,
        wrap: int = 60,
        limit: OptInt = None,
        colors: Mapping[str, str] | None = None,
        font_size: int = 12,
        font_family: str = "Lucida Console",
    ) -> str:
        """returns html with embedded styles for sequence colouring

        Parameters
        ----------
        name_order
            order of names for display.
        wrap
            number of columns per row
        limit
            truncate view of collection to this length
        colors
            {character: colour} mapping, passed to the get_css_style()
            method of the moltype, which provides the styles.
        font_size
            in points. Affects labels and sequence and line spacing
            (proportional to value)
        font_family
            string denoting font family

        Notes
        -----
        The summary line reports the view as truncated when fewer sequences
        than the collection holds are displayed, or when limit shortened at
        least one displayed sequence.

        Examples
        --------

        In a jupyter notebook, this code is used to provide the representation.

        .. code-block:: python

            seq_col  # is rendered by jupyter

        You can directly use the result for display in a notebook as

        .. code-block:: python

            from IPython.core.display import HTML

            HTML(seq_col.to_html())
        """
        css, styles = self.moltype.get_css_style(
            colors=colors,
            font_size=font_size,
            font_family=font_family,
        )

        # length statistics are for the whole collection, not just the view
        seq_lengths = numpy.array(
            [self._seqs_data.get_seq_length(n) for n in self._name_map.values()],
        )
        min_val = seq_lengths.min()
        max_val = seq_lengths.max()
        med_val = numpy.median(seq_lengths)

        if name_order:
            selected = self.take_seqs(name_order)
        else:
            name_order = self.names
            selected = self

        # Stylise each character in each sequence
        gaps = "".join(frozenset([selected.moltype.gap, selected.moltype.missing]))
        terminal_style = f"terminal_ambig_{self.moltype.label}"
        template = '<span class="%s">%s</span>'
        styled_seqs = {}
        max_truncated_len = 0
        positions_truncated = False
        for name in name_order:
            full_seq = str(self.seqs[name])
            sequence = full_seq[:limit]
            seq_len = len(sequence)
            if seq_len < len(full_seq):
                positions_truncated = True
            max_truncated_len = max(seq_len, max_truncated_len)
            # leading / trailing runs of gap or missing characters are
            # styled as terminal; [start, end) is the remaining interior
            start = seq_len - len(sequence.lstrip(gaps))
            end = len(sequence.rstrip(gaps))
            styled_seqs[name] = [
                template
                % (terminal_style if i < start or i >= end else styles[char], char)
                for i, char in enumerate(sequence)
            ]

        # Make html table
        table = ["<table>"]
        seq_ = "<td>%s</td>"
        label_ = '<td class="label">%s</td>'
        num_row_ = '<tr class="num_row"><td></td><td><b>{:,d}</b></td></tr>'
        for i in range(0, max_truncated_len, wrap):
            table.append(num_row_.format(i))
            for n in name_order:
                s = "".join(styled_seqs[n][i : i + wrap])
                # Filter out rows that are empty (due to combination of shorter sequences + wrapping)
                if s != "":
                    row = "".join([label_ % n, seq_ % s])
                    table.append(f"<tr>{row}</tr>")
        table.append("</table>")

        num_shown = len(name_order)
        lengths = f"{{min={min_val}, median={med_val}, max={max_val}}}"
        label = selected.moltype.label
        if positions_truncated or num_shown < self.num_seqs:
            summary = (
                f"{self.num_seqs} x {lengths} (truncated to "
                f"{num_shown} x {max_truncated_len}) {label} sequence collection"
            )
        else:
            summary = f"{self.num_seqs} x {lengths} {label} sequence collection"

        label_css = (
            ".c3align .label { font-size: %dpt ; text-align: right !important; "
            "color: black !important; padding: 0 4px; display: table-cell !important; "
            "font-weight: normal !important; }" % font_size
        )
        text = [
            "<style>",
            ".c3align table {margin: 10px 0;}",
            ".c3align td { border: none !important; text-align: left !important; }",
            ".c3align tr:not(.num_row) td span {margin: 0 2px;}",
            ".c3align tr:nth-child(even) {background: #f7f7f7;}",
            ".c3align .num_row {background-color:rgba(161, 195, 209, 0.5) !important; border-top: solid 1px black; }",
            label_css,
            "\n".join([".c3align " + style for style in css]),
            "</style>",
            '<div class="c3align">',
            "\n".join(table),
            f"<p><i>{summary}</i></p>",
            "</div>",
        ]
        return "\n".join(text)
6✔
2423

2424
    def set_repr_policy(
        self,
        num_seqs: OptInt = None,
        num_pos: OptInt = None,
        ref_name: OptStr = None,
        wrap: OptInt = None,
    ) -> None:
        """specify policy for repr(self)

        Parameters
        ----------
        num_seqs
            number of sequences to include in represented display.
        num_pos
            length of sequences to include in represented display.
        ref_name
            name of sequence to be placed first, or "longest" (default).
            If latter, indicates longest sequence will be chosen.
        wrap
            number of printed bases per row

        Raises
        ------
        TypeError if num_seqs, num_pos or wrap is not an int, or ref_name
        is not a str. ValueError if ref_name is neither "longest" nor the
        name of a sequence.

        Notes
        -----
        Arguments with a false value (None, 0, "") leave the corresponding
        setting unchanged. Settings are applied in argument order, so those
        preceding an invalid argument are retained.
        """

        def set_int(key: str, value) -> None:
            # shared validation for the integer valued settings
            if not value:
                return
            if not isinstance(value, int):
                msg = f"{key} is not an integer"
                raise TypeError(msg)
            self._repr_policy[key] = value

        set_int("num_seqs", num_seqs)
        set_int("num_pos", num_pos)

        if ref_name:
            if not isinstance(ref_name, str):
                msg = "ref_name is not a string"
                raise TypeError(msg)

            if ref_name != "longest" and ref_name not in self.names:
                msg = f"no sequence name matching {ref_name}"
                raise ValueError(msg)

            self._repr_policy["ref_name"] = ref_name

        set_int("wrap", wrap)
6✔
2473

2474
    def duplicated_seqs(self) -> list[list[str]]:
        """returns the names of duplicated sequences

        Notes
        -----
        Sequences are grouped by the hash the storage reports for them.
        Only groups with more than one name are returned.
        """
        names_by_hash = collections.defaultdict(list)
        for name, seqid in self.name_map.items():
            names_by_hash[self.storage.get_hash(seqid)].append(name)
        return [group for group in names_by_hash.values() if len(group) > 1]
6✔
2481

2482
    def drop_duplicated_seqs(self) -> Self:
        """returns self without duplicated sequences

        Notes
        -----
        Retains the first sequence of each duplicate group. If there are
        no duplicates, self is returned.
        """
        groups = self.duplicated_seqs()
        if not groups:
            return self

        # everything after the first member of each group is redundant
        redundant = [name for group in groups for name in group[1:]]
        return self.take_seqs(redundant, negate=True)
6✔
2497

2498

2499
@register_deserialiser(
    get_object_provenance(SequenceCollection),
    "cogent3.core.alignment.SequenceCollection",
)
def deserialise_sequence_collection(
    data: dict[str, str | dict[str, str]],
) -> SequenceCollection:
    """deserialise SequenceCollection

    Notes
    -----
    Data without an "init_args" entry is handled as the old type
    serialisation and converted to a new type collection.
    """
    if "init_args" in data:
        return SequenceCollection.from_rich_dict(data)

    return deserialise_old_to_new_type_seqcoll(data)
6✔
2511

2512

2513
def deserialise_old_to_new_type_seqcoll(
    data: dict[str, str | dict[str, str]],
) -> SequenceCollection:
    """deserialise old type SequenceCollection as a new type collection

    Parameters
    ----------
    data
        old type serialisation. The "moltype", "info" and "seqs" entries
        are read, data itself is not modified.

    Notes
    -----
    The "bytes" moltype name is mapped to "text". The "source" entry of
    info, if present, is passed on separately from the rest of info.
    """
    moltype_name = data["moltype"]
    if moltype_name == "bytes":
        moltype_name = "text"

    # work on a copy so removing "source" does not alter the caller's data
    info_data = dict(data["info"])
    source = info_data.pop("source", None)
    seq_data = {
        seqid: record["seq"]["init_args"]["seq"]
        for seqid, record in data["seqs"].items()
    }
    return make_unaligned_seqs(
        seq_data, moltype=moltype_name, info=info_data, source=source
    )
2528

2529

2530
@singledispatch
def merged_db_collection(seqs) -> SupportsFeatures:
    """return one AnnotationDb from a collection of sequences

    Parameters
    ----------
    seqs
        iterable of data, members that are not Sequence instances are
        ignored

    Returns
    -------
    The first non-empty annotation db found, after it has been updated
    with every other distinct annotation db. None if no sequence has a
    non-empty annotation db.

    Notes
    -----
    The first annotation db is updated in place, it is not copied.
    """
    merged = None
    for seq in seqs:
        if not isinstance(seq, c3_sequence.Sequence):
            continue

        db = seq.annotation_db

        if merged is None:
            # still looking for the first non-empty db
            # TODO gah should this be a copy so immutable?
            if db:
                merged = db
            continue

        if db is None or db is merged:
            continue

        merged.update(db)

    return merged
2566

2567

2568
@merged_db_collection.register
def _(seqs: dict) -> SupportsFeatures:
    """merges the annotation db's of the dict values, keys are ignored"""
    members = seqs.values()
    return merged_db_collection(members)
2571

2572

2573
@dataclasses.dataclass
class raw_seq_data:
    """container for sequence data and its relationship to a parent

    Instances are produced by coerce_to_raw_seq_data().
    """

    # the sequence as bytes or a numpy array
    seq: bytes | numpy.ndarray[int]
    # name of the sequence
    name: OptStr = None
    # name of the parent sequence
    parent_name: OptStr = None
    # start position on the parent sequence
    offset: int = 0
    # True when the sequence is reversed with respect to the parent
    is_reversed: bool = False
2580

2581

2582
@singledispatch
def coerce_to_raw_seq_data(
    seq: StrORBytesORArrayOrSeq,
    moltype: MolTypes,
    name: OptStr = None,
) -> raw_seq_data:
    """aggregates sequence data into a single object

    Parameters
    ----------
    seq
        sequence data, can be a string, bytes, numpy array or Sequence
        instance. The latter is converted to a numpy array.
    moltype
        name of a cogent3 molecular type, or a cogent3 MolType instance
    name
        name of the sequence

    Returns
    -------
        raw_seq_data

    Raises
    ------
    TypeError if seq is not an Aligned instance and no implementation is
    registered for its type.
    """
    if isinstance(seq, Aligned):
        # convert the Aligned instance
        # into a Sequence instance that includes the gaps
        return coerce_to_raw_seq_data(seq.gapped_seq, moltype, name)
    # the message names this function so the failing call can be located
    msg = f"coerce_to_raw_seq_data not implemented for {type(seq)}"
    raise TypeError(msg)
2610

2611

2612
@coerce_to_raw_seq_data.register
def _(
    seq: c3_sequence.Sequence,
    moltype: MolTypes,
    name: str,
) -> raw_seq_data:
    """Sequence instances are converted to moltype and then to an array

    The parent name, start and orientation are taken from the parent
    coordinates of the converted sequence.
    """
    converted = seq.to_moltype(moltype)
    parent_name, start, _stop, step = converted.parent_coordinates()
    # a negative step means the sequence is reversed relative to its parent
    return raw_seq_data(
        seq=numpy.array(converted),
        name=name or converted.name,
        parent_name=parent_name,
        offset=start,
        is_reversed=step < 0,
    )
2629

2630

2631
@coerce_to_raw_seq_data.register
def _(
    seq: str,
    moltype: MolTypes,
    name: OptStr = None,
) -> raw_seq_data:
    """strings are utf8 encoded and handled as bytes"""
    as_bytes = seq.encode("utf8")
    return coerce_to_raw_seq_data(as_bytes, moltype, name)
2638

2639

2640
@coerce_to_raw_seq_data.register
def _(
    seq: numpy.ndarray,
    moltype: MolTypes,
    name: OptStr = None,
) -> raw_seq_data:
    """arrays are stored as is, moltype is not used"""
    return raw_seq_data(name=name, seq=seq)
2647

2648

2649
@coerce_to_raw_seq_data.register
def _(
    seq: bytes,
    moltype: MolTypes,
    name: OptStr = None,
) -> raw_seq_data:
    """bytes are upper cased and coerced by the moltype, if it defines
    a coerce_to function (e.g. RNA to DNA replaces U with T)
    """
    upper = seq.upper()
    if moltype.coerce_to:
        upper = moltype.coerce_to(upper)
    return raw_seq_data(seq=upper, name=name)
2660

2661

2662
# return type of prep_for_seqs_data: (sequence data keyed by name,
# offsets keyed by name, names of reversed sequences)
CT = tuple[dict[str, StrORBytesORArray], dict[str, int], set[str]]
6✔
2663

2664

2665
def prep_for_seqs_data(
    data: dict[str, StrORBytesORArrayOrSeq],
    moltype: MolTypes,
    seq_namer: _SeqNamer,
) -> CT:
    """normalises input data for constructing a SeqsData object

    Parameters
    ----------
    data
        a dict[str, StrORBytesORArrayOrSeq] where the key is the sequence
        name and the value the sequence
    moltype
        name of a cogent3 molecular type, or a cogent3 MolType instance
    seq_namer
        callback that takes the sequence and its name and returns a new name

    Returns
    -------
    seq data as dict[str, bytes | numpy.ndarray], offsets as dict[str, int],
    reversed sequences as set[str]

    Notes
    -----
    Sequence data and offsets are keyed by the parent name when the
    sequence has one. Reversed sequences are recorded by their new name.
    """
    seqs = {}  # for the (Aligned)SeqsDataABC
    offsets = {}  # for the (Aligned)SeqsDataABC
    rvd = set()
    for orig_name, seq in data.items():
        new_name = seq_namer(seq=seq, name=orig_name)
        raw = coerce_to_raw_seq_data(seq, moltype, name=new_name)
        # zero offsets are not recorded
        if raw.offset:
            offsets[raw.parent_name or new_name] = raw.offset
        seqs[raw.parent_name or raw.name] = raw.seq
        if raw.is_reversed:
            rvd.add(new_name)

    return seqs, offsets, rvd
2700

2701

2702
@singledispatch
6✔
2703
def _make_name_seq_mapping(
6✔
2704
    data: typing.Sequence[StrORBytesORArrayOrSeq],
2705
    seq_namer: _SeqNamer,
2706
) -> dict[str, StrORBytesORArrayOrSeq]:
2707
    """returns a dict mapping names to sequences
2708

2709
    Parameters
2710
    ----------
2711
    data
2712
        a dict of {name: seq, ...}, or python sequence of StrORBytesORArrayOrSeq
2713
    seq_namer
2714
        callback that takes the sequence and optionally a name and
2715
        returns a new name
2716
    """
2717
    if not hasattr(data, "seqs"):
6✔
2718
        msg = f"_make_name_seq_mapping not implemented for {type(data)}"
6✔
2719
        raise NotImplementedError(msg)
6✔
2720

2721
    return {seq_namer(seq=record): record for record in data.seqs}
6✔
2722

2723

2724
@_make_name_seq_mapping.register
def _(
    data: dict,
    seq_namer: _SeqNamer,
) -> dict[str, StrORBytesORArray]:
    """the name for each sequence is produced from its key by seq_namer"""
    renamed = {}
    for name, seq in data.items():
        renamed[seq_namer(seq=seq, name=name)] = seq
    return renamed
2730

2731

2732
@_make_name_seq_mapping.register
def _(
    data: list,
    seq_namer: _SeqNamer,
) -> dict[str, StrORBytesORArray]:
    """handles a list of sequences, or a list of [name, seq] pairs

    An empty list produces an empty dict.
    """
    # guard against an empty list before inspecting the first member
    if data and isinstance(data[0], (list, tuple)):
        # handle case where we've been given data like
        # for example [[name, seq], ...]
        # if the members cannot be converted to a dict (ValueError), they
        # are treated as sequences below
        with contextlib.suppress(ValueError):
            return _make_name_seq_mapping(dict(data), seq_namer)
    return {seq_namer(seq=record): record for record in data}
2743

2744

2745
@_make_name_seq_mapping.register
def _(
    data: tuple,
    seq_namer: _SeqNamer,
) -> dict[str, StrORBytesORArray]:
    """tuples are handled as lists"""
    as_list = list(data)
    return _make_name_seq_mapping(as_list, seq_namer)
2751

2752

2753
@_make_name_seq_mapping.register
def _(
    data: set,
    seq_namer: _SeqNamer,
) -> dict[str, StrORBytesORArray]:
    """sets are handled as lists"""
    as_list = list(data)
    return _make_name_seq_mapping(as_list, seq_namer)
2759

2760

2761
def _seqname_parent_name(
6✔
2762
    record: StrORBytesORArray,
2763
    name: str | None = None,
2764
) -> tuple[str, str]:
2765
    if hasattr(record, "parent_coordinates"):
6✔
2766
        parent_name, *_ = record.parent_coordinates()
6✔
2767
        return name or record.name, parent_name or name
6✔
2768
    if hasattr(record, "name"):
6✔
2769
        return name or record.name, name or record.name
×
2770

2771
    return name, name
6✔
2772

2773

2774
def make_name_map(data: dict[str, StrORBytesORArray]) -> dict[str, str]:
    """returns a dict mapping names to parent names

    Parameters
    ----------
    data
        a dict of {name: seq, ...}

    Returns
    -------
        only names that differ from their parent name are included, so
        the result is an empty dict if names and parent names are always
        equal
    """
    name_pairs = (
        _seqname_parent_name(record, name=name) for name, record in data.items()
    )
    return {
        new_name: parent_name
        for new_name, parent_name in name_pairs
        if new_name != parent_name
    }
2794

2795

2796
def make_unaligned_storage(
    data: dict[str, bytes | numpy.ndarray[int]],
    *,
    moltype: MolTypes,
    label_to_name: OptRenamerCallable = None,
    offset: DictStrInt | None = None,
    reversed_seqs: set[str] | None = None,
    storage_backend: str | None = None,
    **kwargs,
) -> SeqsData:
    """makes the unaligned storage instance for a SequenceCollection

    Parameters
    ----------
    data
        {name: seq}
    moltype
        label or instance of a cogent3 MolType
    label_to_name
        callable to convert the original name to a new name
    offset
        {name: offset} where the offset is the start position of the
        sequence in the parent sequence. These take precedence over
        offsets obtained from the sequences.
    reversed_seqs
        set of names that are on the reverse strand of the parent sequence.
        If not provided, the names are obtained from the sequences.
    storage_backend
        name of a third-party storage driver to provide storage functionality
    kwargs
        additional keyword arguments for the storage driver

    Notes
    -----
    This function is intended for use primarily by make_unaligned_seqs function.
    """
    moltype = c3_moltype.get_moltype(moltype)
    alphabet = moltype.most_degen_alphabet()
    namer = _SeqNamer(name_func=label_to_name)
    # sequences are normalised, plus any offsets and orientation they carry
    seqs_data, seq_offsets, seq_reversed = prep_for_seqs_data(data, moltype, namer)
    merged_offsets = {**seq_offsets, **(offset or {})}
    # kwargs come last so the storage driver arguments are as the caller
    # specified
    sd_kwargs = {
        "data": seqs_data,
        "alphabet": alphabet,
        "offset": merged_offsets,
        "reversed_seqs": reversed_seqs or seq_reversed,
        **kwargs,
    }
    driver = cogent3._plugin.get_unaligned_storage_driver(storage_backend)  # noqa: SLF001
    return driver.from_seqs(**sd_kwargs)
2851

2852

2853
@singledispatch
def make_unaligned_seqs(
    data: dict[str, StrORBytesORArray] | list | SeqsDataABC,
    *,
    moltype: MolTypes,
    label_to_name: OptRenamerCallable = None,
    info: OptDict = None,
    source: OptPathType = None,
    annotation_db: SupportsFeatures | None = None,
    offset: DictStrInt | None = None,
    name_map: DictStrStr | None = None,
    is_reversed: bool = False,
    reversed_seqs: set[str] | None = None,
    storage_backend: str | None = None,
    **kwargs,
) -> SequenceCollection:
    """Initialise an unaligned collection of sequences.

    Parameters
    ----------
    data
        sequence data, a SeqsData, a dict {name: seq, ...}, an iterable of sequences
    moltype
        string representation of the moltype, e.g., 'dna', 'protein'.
    label_to_name
        function for converting original names into other names.
    info
        a dict from which to make an info object
    source
        origins of this data, defaults to 'unknown'. Converted to a string
        and added to info["source"].
    annotation_db
        annotation database to attach to the collection
    offset
        a dict mapping names to annotation offsets
    name_map
        a dict mapping sequence names to "parent" sequence names. The parent
        name will be used for querying a annotation_db.
    is_reversed
        entire collection has been reverse complemented
    reversed_seqs
        set of names that are on the reverse strand of the parent sequence
    storage_backend
        name of the storage backend to use for the SeqsData object, defaults to
        cogent3 builtin.
    kwargs
        keyword arguments for the storage driver

    Raises
    ------
    ValueError if data is empty.

    Notes
    -----
    If no annotation_db is provided, but the sequences are annotated, an
    annotation_db is created by merging any annotation db's found in the sequences.
    If the sequences are annotated AND an annotation_db is provided, only the
    annotation_db is used.
    """
    # refactor: design
    # rename offset to offsets as it could track potentially multiple offsets

    if len(data) == 0:
        msg = "data must be at least one sequence."
        raise ValueError(msg)

    annotation_db = annotation_db or merged_db_collection(data)
    namer = _SeqNamer(name_func=label_to_name)
    named_seqs = _make_name_seq_mapping(data, namer)
    if name_map is None:
        # an empty mapping is represented by None
        name_map = make_name_map(named_seqs) or None

    # NOTE(review): label_to_name is used for the name mapping above and is
    # also given to make_unaligned_storage, confirm a renamer is not applied
    # to an already renamed sequence
    storage = make_unaligned_storage(
        named_seqs,
        label_to_name=label_to_name,
        moltype=moltype,
        offset=offset,
        reversed_seqs=reversed_seqs,
        storage_backend=storage_backend,
        **kwargs,
    )
    # as they were handled in this function, we do not pass on:
    # offset
    # label_to_name
    # reversed_seqs
    # storage_backend

    return make_unaligned_seqs(
        storage,
        moltype=moltype,
        info=info,
        source=source,
        annotation_db=annotation_db,
        name_map=name_map,
        is_reversed=is_reversed,
    )
2945

2946

2947
@make_unaligned_seqs.register
def _(
    data: SeqsDataABC,
    *,
    moltype: MolTypes,
    label_to_name: OptRenamerCallable = None,
    info: dict | None = None,
    source: OptPathType = None,
    annotation_db: SupportsFeatures = None,
    offset: dict[str, int] | None = None,
    name_map: dict[str, str] | None = None,
    is_reversed: bool = False,
) -> SequenceCollection:
    """makes a SequenceCollection from a storage instance

    Raises
    ------
    ValueError if the moltype is not compatible with the alphabet of data,
    or if offset is provided.

    Notes
    -----
    If source is not provided, it is taken (and removed) from info,
    defaulting to "unknown". If label_to_name is provided the sequences
    are renamed after the collection is constructed.
    """
    moltype = c3_moltype.get_moltype(moltype)
    if not moltype.is_compatible_alphabet(data.alphabet):
        msg = f"Provided moltype: {moltype} is not compatible with SeqsData alphabet {data.alphabet}"
        raise ValueError(msg)

    # we cannot set offset when creating from an SeqsData
    if offset:
        msg = f"Setting offset is not supported for {data=}"
        raise ValueError(msg)

    if not isinstance(info, dict):
        info = {}

    if source:
        source = str(source)
    else:
        source = str(info.pop("source", "unknown"))

    collection = SequenceCollection(
        seqs_data=data,
        moltype=moltype,
        info=info,
        annotation_db=annotation_db,
        source=source,
        name_map=name_map,
        is_reversed=is_reversed,
    )
    return collection.rename_seqs(label_to_name) if label_to_name else collection
2986

2987

2988
@singledispatch
def decompose_gapped_seq(
    seq: StrORBytesORArray | c3_sequence.Sequence,
    *,
    alphabet: c3_alphabet.AlphabetABC,
    missing_as_gap: bool = True,
) -> tuple[numpy.ndarray, numpy.ndarray]:
    """
    Takes a sequence with (or without) gaps and returns an ungapped sequence
    and a map of the position and length of gaps in the original parent sequence

    Parameters
    ----------
    seq
        the sequence to decompose
    alphabet
        alphabet that defines the gap and missing characters
    missing_as_gap
        if True, the missing character is also treated as a gap

    Raises
    ------
    NotImplementedError if there is no implementation registered for the
    type of seq.
    """
    msg = f"decompose_gapped_seq not implemented for type {type(seq)}"
    raise NotImplementedError(msg)
3003

3004

3005
@decompose_gapped_seq.register
def _(
    seq: numpy.ndarray,
    *,
    alphabet: c3_alphabet.AlphabetABC,
    missing_as_gap: bool = True,
) -> tuple[numpy.ndarray, numpy.ndarray]:
    """decomposes an array of alphabet indices

    The array is cast to the alphabet dtype. If missing_as_gap and the
    alphabet defines a missing index, positions with that index are
    treated as gaps.
    """
    # compare with None, as 0 is a valid index into an alphabet
    if missing_as_gap and alphabet.missing_index is not None:
        missing_index = numpy.uint8(alphabet.missing_index)
    else:
        # -1 is the decompose_gapped_seq_array default for missing_index
        missing_index = -1
    return decompose_gapped_seq_array(
        seq.astype(alphabet.dtype),
        alphabet.gap_index,
        missing_index=missing_index,
    )
3021

3022

3023
@decompose_gapped_seq.register
def _(
    seq: str,
    *,
    alphabet: c3_alphabet.AlphabetABC,
    missing_as_gap: bool = True,
) -> tuple[numpy.ndarray, numpy.ndarray]:
    """strings are validated and converted to alphabet indices

    Raises
    ------
    AlphabetError if seq is not valid for alphabet.
    """
    if alphabet.is_valid(seq):
        indices = alphabet.to_indices(seq)
        return decompose_gapped_seq(
            indices,
            alphabet=alphabet,
            missing_as_gap=missing_as_gap,
        )

    msg = f"Sequence is invalid for alphabet {alphabet}"
    raise c3_alphabet.AlphabetError(msg)
3039

3040

3041
@decompose_gapped_seq.register
def _(
    seq: bytes,
    *,
    alphabet: c3_alphabet.AlphabetABC,
    missing_as_gap: bool = True,
) -> tuple[numpy.ndarray, numpy.ndarray]:
    """bytes are decoded as utf-8 and handled as a string"""
    text = seq.decode("utf-8")
    return decompose_gapped_seq(
        text,
        alphabet=alphabet,
        missing_as_gap=missing_as_gap,
    )
3053

3054

3055
@decompose_gapped_seq.register
def _(
    seq: c3_sequence.Sequence,
    *,
    alphabet: c3_alphabet.AlphabetABC,
    missing_as_gap: bool = True,
) -> tuple[numpy.ndarray, numpy.ndarray]:
    """Sequence instances are converted to a numpy array"""
    as_array = numpy.array(seq)
    return decompose_gapped_seq(
        as_array,
        alphabet=alphabet,
        missing_as_gap=missing_as_gap,
    )
3067

3068

3069
@numba.jit(cache=True)
def decompose_gapped_seq_array(
    seq: numpy.ndarray,
    gap_index: int,
    missing_index: int = -1,
) -> tuple[numpy.ndarray, numpy.ndarray]:  # pragma: no cover
    """
    extracts the ungapped sequence and gap data from a gapped sequence

    Parameters
    ----------
    seq
        numpy array representing a gapped sequence
    gap_index
        from an alphabet
    missing_index
        from an alphabet, represents index for missing character

    Returns
    -------
    ungapped, [[gap_pos in sequence coords, cumulative gap length]]

    Notes
    -----
    being called by decompose_gapped_seq
    A missing_index is an ambiguity code that includes the gap character.
    Be careful in providing this value when dealing with sequences that
    may have had a feature masking applied.
    If there are no gaps, seq itself is returned as the ungapped sequence.
    """
    total = len(seq)
    # one row per position, trimmed to the number of gap runs found
    runs = numpy.empty((total, numpy.int64(2)), dtype=numpy.int64)

    inside_run = False
    num_runs = 0
    run_start = 0
    for pos, state in enumerate(seq):
        is_gap = state == gap_index or state == missing_index  # noqa
        if is_gap and not inside_run:
            run_start = pos
            inside_run = True
        elif not is_gap and inside_run:
            # the run ended at the previous position
            runs[num_runs][:] = run_start, pos - run_start
            num_runs += 1
            inside_run = False

        if is_gap and pos == total - 1:
            # end of sequence
            runs[num_runs][:] = run_start, pos - run_start + 1
            num_runs += 1

    if num_runs == 0:
        return seq, numpy.empty((0, 2), dtype=numpy.int64)

    gap_coords = runs[:num_runs]
    # convert the individual gap lengths to cumulative gap lengths
    gap_coords.T[1] = gap_coords.T[1].cumsum()
    # get gap start positions in sequence coords
    for row, preceding in enumerate(numpy.append(0, gap_coords.T[1][:-1])):
        gap_coords[row][0] -= preceding

    ungapped = numpy.empty(total - gap_coords.T[1][-1], dtype=seq.dtype)
    next_pos = 0
    for state in seq:
        is_gap = state == gap_index or state == missing_index  # noqa
        if not is_gap:
            ungapped[next_pos] = state
            next_pos += 1

    return ungapped, gap_coords
3137

3138

3139
@numba.jit(cache=True)
def compose_gapped_seq(
    ungapped_seq: numpy.ndarray,
    gaps: numpy.ndarray,
    gap_index: int,
) -> numpy.ndarray:  # pragma: no cover
    """reconstruct a gapped sequence from an ungapped sequence and gap data

    Parameters
    ----------
    ungapped_seq
        numpy array representing the sequence without gaps
    gaps
        rows of [gap position in sequence coords, cumulative gap length]
    gap_index
        the value written at gap positions

    Notes
    -----
    If gaps is empty, ungapped_seq itself is returned.
    """
    if not len(gaps):
        return ungapped_seq

    # the last cumulative gap length is the total number of gaps
    total = len(ungapped_seq) + gaps[-1, 1]

    result = numpy.empty(total, dtype=ungapped_seq.dtype)

    out_pos = 0
    seq_pos = 0
    prev_cum = 0
    for gap_pos, cum_len in gaps:
        run_len = cum_len - prev_cum
        prev_cum = cum_len

        # copy the sequence segment that precedes this gap
        result[out_pos : out_pos + gap_pos - seq_pos] = ungapped_seq[seq_pos:gap_pos]
        out_pos += gap_pos - seq_pos
        seq_pos = gap_pos

        result[out_pos : out_pos + run_len] = gap_index
        out_pos += run_len

    # whatever sequence remains after the last gap
    result[out_pos:] = ungapped_seq[seq_pos:]

    return result
3172

3173

3174
class Aligned(AnnotatableMixin):
    """A single sequence in an alignment.

    Notes
    -----
    This is a wrapper around a ``AlignedDataView``. This class performs any
    complementing needed. It can be cast directly to a string or numpy array,
    e.g. ``numpy.array(<aligned instance>)`` returns a numpy unsigned 8-bit
    integer array.
    """

    __slots__ = ("_annotation_db", "_data", "_moltype", "_name")

    def __init__(
        self,
        data: AlignedDataView,
        moltype: c3_moltype.MolType,
        name: OptStr = None,
        annotation_db: SupportsFeatures | list[SupportsFeatures] | None = None,
    ) -> None:
        """
        Parameters
        ----------
        data
            view of the aligned sequence data
        moltype
            the molecular type of the sequence
        name
            name of the sequence, defaults to ``data.seqid``
        annotation_db
            annotation database(s) for the sequence
        """
        self._data = data
        self._moltype = moltype
        self._name = name or data.seqid
        self._annotation_db: list[SupportsFeatures] = self._init_annot_db_value(
            annotation_db
        )

    @classmethod
    def from_map_and_seq(
        cls, indel_map: IndelMap, seq: c3_sequence.Sequence
    ) -> Aligned:
        """Creates an Aligned instance from an indel map and a Sequence."""
        moltype = seq.moltype
        # refactor: design
        # this is a temporary approach during migration to new_types
        # to support the sequence alignment algorithms
        # a better solution is to create a AlignedDataView instance the
        # map and seq directly without requiring a parent AlignedSeqsData
        asd = AlignedSeqsData.from_seqs_and_gaps(
            seqs={seq.name: numpy.array(seq)},
            gaps={seq.name: indel_map.array},
            alphabet=moltype.most_degen_alphabet(),
        )

        return cls(asd.get_view(seq.name), moltype)

    @classmethod
    def from_map_and_aligned_data_view(
        cls,
        indel_map: IndelMap,
        seq: AlignedDataViewABC,
    ) -> Aligned:
        """Creates an Aligned instance from an indel map and AlignedDataView."""
        moltype = seq.alphabet.moltype
        seqid = seq.seqid
        seq = seq.array_value
        # refactor: design
        # see above comment in from_map_and_seq
        asd = AlignedSeqsData.from_seqs_and_gaps(
            seqs={seqid: seq},
            gaps={seqid: indel_map.array},
            alphabet=moltype.most_degen_alphabet(),
        )

        return cls(asd.get_view(seqid), moltype)

    def __len__(self) -> int:
        """the length of the indel map, i.e. including gaps"""
        return len(self.map)

    @property
    def data(self) -> AlignedDataView:
        """the underlying view of the aligned sequence data"""
        return self._data

    @property
    def map(self) -> IndelMap:
        """the indel map of the underlying data view"""
        return self.data.map

    @property
    def seq(self) -> c3_sequence.Sequence:
        """the ungapped sequence."""
        # if the slice record has abs(step) > 1, we cannot retain a connection
        # to the underlying aligned seq data container because the gaps are
        # not going to be modulo the step.
        if self.data.slice_record.plus_step == 1:
            # to complement or not handled by the view
            seq = self.data.get_seq_view()
        elif self.data.slice_record.step > 1:
            # we have a step, but no complementing will be required
            seq = self.moltype.degap(self.data.gapped_array_value)
        else:
            # gapped_array_value gives the reverse of the plus strand
            # so we need to complement it. We do that here because with a
            # step != 1, we cannot retain a connection to the underlying
            # annotations
            seq = self.moltype.degap(self.data.gapped_array_value)
            seq = self.moltype.complement(seq)

        mt_seq = self.moltype.make_seq(seq=seq, name=self.data.seqid)
        # annotations are only retained when the plus strand step is 1
        ann_db = self._annotation_db if self.data.slice_record.plus_step == 1 else None
        mt_seq.replace_annotation_db(ann_db)
        return mt_seq

    @property
    def gapped_seq(self) -> c3_sequence.Sequence:
        """Returns Sequence object, including gaps."""
        seq = self.data.gapped_array_value
        if self.data.slice_record.step < 0:
            seq = self.moltype.complement(seq)
        return self.moltype.make_seq(seq=seq, name=self.data.seqid)

    @property
    def moltype(self) -> c3_moltype.MolType:
        """the molecular type of the sequence"""
        return self._moltype

    @property
    def name(self) -> str:
        """the name of the sequence"""
        return self._name

    def gap_vector(self) -> list[bool]:
        """Returns gap_vector of positions."""
        return self.gapped_seq.gap_vector()

    def make_feature(
        self, feature: FeatureDataType, alignment: Alignment
    ) -> Feature[Alignment]:
        """returns a feature, not written into annotation_db"""
        annot = self.seq.make_feature(feature)
        # the feature is made on the ungapped sequence and then remapped
        # onto the alignment
        inverted = self.map.to_feature_map().inverse()
        return annot.remapped_to(alignment, inverted)

    def __repr__(self) -> str:
        # sequences longer than 10 are truncated to their first 7 characters
        seq = f"{str(self)[:7]}... {len(self):,}" if len(self) > 10 else str(self)
        return (
            f"Aligned(name={self.name!r}, seq={seq!r}, moltype={self.moltype.name!r})"
        )

    def __str__(self) -> str:
        return str(self.gapped_seq)

    def __array__(
        self,
        dtype: numpy.dtype | None = None,
        copy: bool | None = None,
    ) -> numpy.ndarray[int]:
        # copy is accepted for the numpy protocol but is not used
        return numpy.array(self.gapped_seq, dtype=dtype)

    def __bytes__(self) -> bytes:
        return bytes(self.gapped_seq)

    def __iter__(self):
        """Iterates over sequence one motif (e.g. char) at a time, incl. gaps"""
        yield from self.gapped_seq

    @singledispatchmethod
    def __getitem__(self, span: int | slice):
        """returns a new Aligned instance for an int, slice or FeatureMap

        Raises
        ------
        NotImplementedError for any other type of span
        """
        msg = f"__getitem__ not implemented for {type(span)}"
        raise NotImplementedError(msg)

    @__getitem__.register
    def _(self, span: int):
        # span + 1 is 0 when span is -1, which would produce the empty
        # slice(-1, 0), so we use an open ended slice to get the last position
        return self.__getitem__(slice(span, span + 1 or None))

    @__getitem__.register
    def _(self, span: slice):
        return self.__class__(data=self.data[span], moltype=self.moltype)

    @__getitem__.register
    def _(self, span: FeatureMap):
        # we assume the feature map is in align coordinates
        data, gaps = self.slice_with_map(span)
        seqid = self.data.seqid
        seqs_data = self.data.parent.from_seqs_and_gaps(
            seqs={seqid: data},
            gaps={seqid: gaps},
            alphabet=self.moltype.most_degen_alphabet(),
        )
        view = seqs_data.get_view(seqid)

        return Aligned(view, self.moltype)

    def slice_with_map(self, span: FeatureMap) -> tuple[numpy.ndarray, numpy.ndarray]:
        """returns the sequence data and gaps selected by a feature map

        Parameters
        ----------
        span
            a feature map, assumed to be in alignment coordinates

        Returns
        -------
        sequence data as an array, gaps as an array with rows of
        [gap position, cumulative gap length]
        """
        start, end = span.start, span.end
        if span.useful and len(list(span.spans)) == 1:
            im = self.map[start:end]
            seq_start = self.map.get_seq_index(start)
            seq_end = self.map.get_seq_index(end)
            data = self.data.array_value[seq_start:seq_end]
            # .array_value will return the data in the correct orientation
            # which means we need to complement it if the data is reversed
            data = (
                self.moltype.complement(data)
                if self.data.slice_record.is_reversed
                else data
            )
        elif not span.useful:
            # there is no sequence data to select
            im = self.map[start:end]
            data = self.data.array_value[:0]
        else:
            # multiple spans
            align_coords = span.get_coordinates()
            im = self.map.joined_segments(align_coords)
            seq_map = self.map.make_seq_feature_map(span)
            # self.seq will return the data in the correct orientation
            # and will complement it if the data is reversed
            data = numpy.array(self.seq.gapped_by_map(seq_map))

        gaps = numpy.array([im.gap_pos, im.cum_gap_lengths]).T
        return data, gaps

    def parent_coordinates(
        self, seq_coords: bool = False, apply_offset: bool = False
    ) -> tuple[str, int, int, int]:
        """returns seqid, start, stop, strand on the parent sequence

        Parameters
        ----------
        seq_coords
            if True, the coordinates for the unaligned sequence
        apply_offset
            if True and seq_coords, adds annotation offset from parent
        """
        return self.data.parent_coords(seq_coords=seq_coords, apply_offset=apply_offset)

    @extend_docstring_from(c3_sequence.Sequence.annotate_matches_to)
    def annotate_matches_to(
        self,
        pattern: str,
        biotype: str,
        name: str,
        allow_multiple: bool = False,
    ):
        return self.seq.annotate_matches_to(
            pattern=pattern,
            biotype=biotype,
            name=name,
            allow_multiple=allow_multiple,
        )
3411

3412

3413
class AlignedSeqsDataABC(SeqsDataABC):
    """Abstract base class for representing the storage object for sequences
    underlying an Alignment.
    """

    # all methods that are from SeqsDataABC should work in sequence coordinates,
    # all methods unique to AlignedSeqsDataABC should work in aligned coordinates.
    # all indices provided to AlignedSeqsDataABC should be on the plus strand.
    __slots__ = ()

    @classmethod
    @abstractmethod
    def from_seqs_and_gaps(
        cls,
        *,
        seqs: dict[str, StrORBytesORArray],
        gaps: dict[str, numpy.ndarray],
        alphabet: c3_alphabet.AlphabetABC,
    ) -> Self:
        """construct an instance from ungapped sequences and their gap data

        Parameters
        ----------
        seqs
            ungapped sequences keyed by name
        gaps
            gap data keyed by name
        alphabet
            alphabet object for the sequences
        """
        ...

    @abstractmethod
    def __init__(
        self,
        *,
        gapped_seqs: numpy.ndarray,
        names: tuple[str],
        alphabet: c3_alphabet.AlphabetABC,
        ungapped_seqs: dict[str, numpy.ndarray] | None = None,
        gaps: dict[str, numpy.ndarray] | None = None,
        offset: DictStrInt | None = None,
        align_len: OptInt = None,
        check: bool = True,
        reversed_seqs: set[str] | None = None,
    ) -> None:
        """initialise the storage, all arguments are keyword only"""
        ...

    @abstractmethod
    def get_view(
        self,
        seqid: str,
        slice_record: c3_sequence.SliceRecord | None = None,
    ) -> AlignedDataViewABC:
        """returns a view of the aligned data for seqid"""
        # overriding the SeqsDataABC method as we support directly
        # providing the slice_record instance
        ...

    @classmethod
    @abstractmethod
    def from_names_and_array(
        cls,
        *,
        names: list[str],
        data: numpy.ndarray,
        alphabet: c3_alphabet.AlphabetABC,
    ) -> Self:
        """construct an instance from sequence names and an array of aligned data"""
        ...

    @property
    @abstractmethod
    def align_len(self) -> int:
        """length of the alignment"""
        ...

    @abstractmethod
    def get_seq_array(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """returns the sequence for seqid as an array, sequence coordinates"""
        ...

    @abstractmethod
    def get_seq_str(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> str:
        """returns the sequence for seqid as a string, sequence coordinates"""
        ...

    @abstractmethod
    def get_seq_bytes(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> bytes:
        """returns the sequence for seqid as bytes, sequence coordinates"""
        ...

    @abstractmethod
    def get_gapped_seq_array(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """returns the gapped sequence for seqid as an array, aligned coordinates"""
        ...

    @abstractmethod
    def get_gapped_seq_str(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> str:
        """returns the gapped sequence for seqid as a string, aligned coordinates"""
        ...

    @abstractmethod
    def get_gapped_seq_bytes(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> bytes:
        """returns the gapped sequence for seqid as bytes, aligned coordinates"""
        ...

    @abstractmethod
    def get_ungapped(
        self,
        name_map: dict[str, str],
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> tuple[dict, dict]:
        """
        Returns a dictionary of sequence data with no gaps or missing characters and
        a dictionary with information to construct a new SequenceCollection via
        make_unaligned_seqs.

        Parameters
        ----------
        name_map
            A dict of {aln_name: data_name, ...} indicating the mapping between
            names in the encompassing Alignment (aln_name) and the names in self
            (data_name).
        start
            The alignment starting position.
        stop
            The alignment stopping position.
        step
            The step size.

        Returns
        -------
        tuple
            A tuple containing the following:
            - seqs (dict): A dictionary of {name: seq, ...} where the sequences have no gaps
              or missing characters.
            - kwargs (dict): A dictionary of keyword arguments for make_unaligned_seqs, e.g.,
              {"offset": self.offset, "name_map": name_map}.
        """
        ...

    @abstractmethod
    def get_pos_range(
        self,
        names: PySeqStr,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """returns an array of the selected range of alignment positions for names"""
        ...

    @abstractmethod
    def get_positions(
        self,
        names: PySeqStr,
        positions: typing.Sequence[int] | numpy.ndarray[numpy.integer],
    ) -> numpy.ndarray:
        """returns an array of the nominated alignment positions for names"""
        ...

    @abstractmethod
    def copy(self, **kwargs) -> Self:
        """returns a copy of self"""
        ...

    @abstractmethod
    def variable_positions(
        self,
        names: PySeqStr,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """returns indices of alignment positions that vary among names"""
        ...
3596

3597

3598
def _gapped_seq_len(seq: numpy.ndarray, gap_map: numpy.ndarray) -> int:
    """length of a sequence once the gaps described by gap_map are inserted

    Parameters
    ----------
    seq
        numpy array of sequence indices (ungapped)
    gap_map
        numpy array of [gap index, cumulative gap length] pairs. An
        ``IndelMap`` is also accepted, in which case its ``array``
        attribute is used.
    """
    gap_data = gap_map.array if isinstance(gap_map, IndelMap) else gap_map
    try:
        # the final record carries the cumulative gap length
        total_gap = gap_data[-1][1]
    except IndexError:
        # no gap records, so nothing to add
        total_gap = 0

    return len(seq) + total_gap
6✔
3616

3617

3618
class AlignedSeqsData(AlignedSeqsDataABC):
6✔
3619
    """The builtin ``cogent3`` implementation of aligned sequences storage
3620
    underlying an ``Alignment``. Indexing this object returns an ``AlignedDataView``
3621
    which can realise the corresponding slice as a string, bytes, or numpy array,
3622
    gapped or ungapped.
3623

3624
    Notes
3625
    -----
3626
    Methods on this object only accept plus strand start, stop and step
3627
    indices for selecting segments of data. It can return the gap coordinates
3628
    for a sequence as used by ``IndelMap``.
3629
    """
3630

3631
    __slots__ = (
6✔
3632
        "_align_len",
3633
        "_alphabet",
3634
        "_gapped",
3635
        "_gaps",
3636
        "_hashes",
3637
        "_name_to_index",
3638
        "_names",
3639
        "_offset",
3640
        "_reversed",
3641
        "_ungapped",
3642
    )
3643

3644
    def __init__(
6✔
3645
        self,
3646
        *,
3647
        gapped_seqs: numpy.ndarray,
3648
        names: tuple[str],
3649
        alphabet: c3_alphabet.CharAlphabet,
3650
        ungapped_seqs: dict[str, numpy.ndarray] | None = None,
3651
        gaps: dict[str, numpy.ndarray] | None = None,
3652
        offset: DictStrInt | None = None,
3653
        align_len: OptInt = None,
3654
        check: bool = True,
3655
        reversed_seqs: set[str] | None = None,
3656
    ) -> None:
3657
        """
3658
        Parameters
3659
        ----------
3660
        gapped_seqs
3661
            2D numpy.uint8 array of aligned sequences. axis 0 are sequences,
3662
            axis 1 are alignment positions
3663
        names
3664
            sequence names in order matching the axis 0 of gapped_seqs
3665
        alphabet
3666
            caharacter alphabet for the sequences
3667
        ungapped_seqs
3668
            a dictionary mapping names to 1D numpy.uint8 arrays of individual
3669
            sequences without gaps. If not provided, computed on demand.
3670
        gaps
3671
            a dictionary mapping names to 1D numpy.int32 arrays of gap data,
3672
            axis 0 is a gap axis 1 is [gap position in sequence coordinates,
3673
            cumulative gap length].  If not provided, computed on demand.
3674
        offset
3675
            a dictionary of annotation offsets
3676
        align_len
3677
            length of the alignment, which must equal the gapped_seqs.shape[1]
3678
        check
3679
            validate any keys in offset, ungapped_seqs, gaps are a subset of names
3680
        reversed_seqs
3681
            names of seqs that are reverse complemented
3682
        """
3683
        self._alphabet = alphabet
6✔
3684
        self._names = tuple(names)
6✔
3685
        self._name_to_index = {name: i for i, name in enumerate(names)}
6✔
3686
        self._gapped = gapped_seqs
6✔
3687
        self._ungapped = ungapped_seqs or {}
6✔
3688
        self._gaps = gaps or {}
6✔
3689
        self._hashes: dict[str, str] = {}
6✔
3690
        align_len = align_len or gapped_seqs.shape[1]
6✔
3691
        self._reversed = frozenset(reversed_seqs or set())
6✔
3692
        if align_len:
6✔
3693
            assert align_len == gapped_seqs.shape[1], "mismatch in alignment length"
6✔
3694

3695
        self._align_len = align_len
6✔
3696
        self._offset = offset or {}
6✔
3697

3698
        if check:
6✔
3699
            if not set(names) >= set(self._gaps.keys()) or not set(names) >= set(
6✔
3700
                self._ungapped.keys(),
3701
            ):
3702
                msg = "Keys in ungapped seqs and gaps must be subsets of names."
6✔
3703
                raise ValueError(
6✔
3704
                    msg,
3705
                )
3706
            if not set(names) >= set(self._offset):
6✔
3707
                msg = "Keys in offset must be a subset of names."
6✔
3708
                raise ValueError(msg)
6✔
3709

3710
            if len(names) != gapped_seqs.shape[0]:
6✔
3711
                msg = f"{len(names)=} != {gapped_seqs.shape[0]=}"
×
3712
                raise ValueError(msg)
×
3713

3714
    def __eq__(self, other: AlignedSeqsDataABC) -> bool:
6✔
3715
        if not isinstance(other, self.__class__):
6✔
3716
            return False
×
3717
        attrs = (
6✔
3718
            "_names",
3719
            "_name_to_index",
3720
            "_alphabet",
3721
            "_align_len",
3722
            "_offset",
3723
        )
3724
        for attr_name in attrs:
6✔
3725
            self_attr = getattr(self, attr_name)
6✔
3726
            other_attr = getattr(other, attr_name)
6✔
3727
            if self_attr != other_attr:
6✔
3728
                return False
6✔
3729

3730
        return numpy.all(self._gapped == other._gapped)
6✔
3731

3732
    def __ne__(self, other: object) -> bool:
6✔
3733
        return not self == other
6✔
3734

3735
    @classmethod
6✔
3736
    def from_seqs(
6✔
3737
        cls,
3738
        *,
3739
        data: dict[str, StrORArray],
3740
        alphabet: c3_alphabet.AlphabetABC,
3741
        **kwargs,
3742
    ) -> Self:
3743
        """Construct an AlignedSeqsData object from a dict of aligned sequences
3744

3745
        Parameters
3746
        ----------
3747
        data
3748
            dict of gapped sequences {name: seq, ...}. sequences must all be
3749
            the same length
3750
        alphabet
3751
            alphabet object for the sequences
3752
        """
3753
        seq_lengths = {len(v) for v in data.values()}
6✔
3754
        if len(seq_lengths) != 1:
6✔
3755
            msg = "All sequence lengths must be the same."
6✔
3756
            raise ValueError(msg)
6✔
3757

3758
        align_len = seq_lengths.pop()
6✔
3759
        names = tuple(data.keys())
6✔
3760
        array_seqs = numpy.empty((len(names), align_len), dtype=alphabet.dtype)
6✔
3761
        for i, name in enumerate(names):
6✔
3762
            array_seqs[i] = alphabet.to_indices(data[name], validate=True)
6✔
3763

3764
        array_seqs.flags.writeable = False
6✔
3765
        return cls(
6✔
3766
            gapped_seqs=array_seqs,
3767
            alphabet=alphabet,
3768
            align_len=align_len,
3769
            check=False,
3770
            names=names,
3771
            **kwargs,
3772
        )
3773

3774
    @classmethod
6✔
3775
    def from_seqs_and_gaps(
6✔
3776
        cls,
3777
        *,
3778
        seqs: dict[str, StrORBytesORArray],
3779
        gaps: dict[str, numpy.ndarray],
3780
        alphabet: c3_alphabet.AlphabetABC,
3781
        **kwargs,
3782
    ) -> Self:
3783
        """Construct an AlignedSeqsData object from a dict of ungapped sequences
3784
        and a corresponding dict of gap data.
3785

3786
        Parameters
3787
        ----------
3788
        seqs
3789
            dict of ungapped sequences {name: seq, ...}
3790
        gaps
3791
            gap data {name: [[seq gap position, cumulative gap length], ...], ...}
3792
        alphabet
3793
            alphabet object for the sequences
3794
        """
3795
        names = tuple(kwargs.pop("names", seqs.keys()))
6✔
3796
        if not names:
6✔
3797
            msg = "seqs cannot be empty"
6✔
3798
            raise ValueError(msg)
6✔
3799

3800
        align_len = kwargs.pop("align_len", None)
6✔
3801
        if align_len is None:
6✔
3802
            align_len = _gapped_seq_len(seqs[names[0]], gaps[names[0]])
6✔
3803

3804
        gapped_seqs = numpy.empty((len(names), align_len), dtype=alphabet.dtype)
6✔
3805
        for i, name in enumerate(names):
6✔
3806
            seq = alphabet.to_indices(seqs[name])
6✔
3807
            seqs[name] = seq
6✔
3808
            if name not in gaps:
6✔
3809
                msg = f"Missing gap data for sequence {name!r}"
6✔
3810
                raise ValueError(msg)
6✔
3811
            gapped_seqs[i] = compose_gapped_seq(seq, gaps[name], alphabet.gap_index)
6✔
3812
            assert len(gapped_seqs[i]) == align_len, "aligned lengths do not match"
6✔
3813

3814
        gapped_seqs.flags.writeable = False
6✔
3815
        return cls(
6✔
3816
            ungapped_seqs=seqs,
3817
            gaps=gaps,
3818
            gapped_seqs=gapped_seqs,
3819
            alphabet=alphabet,
3820
            names=names,
3821
            align_len=align_len,
3822
            **kwargs,
3823
        )
3824

3825
    @classmethod
6✔
3826
    def from_names_and_array(
6✔
3827
        cls,
3828
        *,
3829
        names: PySeq[str],
3830
        data: numpy.ndarray,
3831
        alphabet: c3_alphabet.AlphabetABC,
3832
    ) -> Self:
3833
        """Construct an AlignedSeqsData object from a list of names and a numpy
3834
        array of aligned sequence data.
3835

3836
        Parameters
3837
        ----------
3838
        names
3839
            list of sequence names
3840
        data
3841
            numpy array of aligned sequence data
3842
        alphabet
3843
            alphabet object for the sequences
3844
        """
3845
        if len(names) != data.shape[0] or not len(names):
6✔
3846
            msg = "Number of names must match number of rows in data."
6✔
3847
            raise ValueError(msg)
6✔
3848

3849
        gapped_seqs = data.astype(alphabet.dtype)
6✔
3850
        gapped_seqs.flags.writeable = False
6✔
3851
        return cls(
6✔
3852
            gapped_seqs=gapped_seqs,
3853
            names=names,
3854
            alphabet=alphabet,
3855
        )
3856

3857
    @property
    def names(self) -> tuple[str, ...]:
        """the sequence names in the storage, as the tuple made by the constructor"""
        return self._names
6✔
3861

3862
    @property
    def reversed_seqs(self) -> frozenset[str]:
        """names of sequences that are reverse complemented, as a frozenset"""
        return self._reversed
6✔
3866

3867
    @property
    def alphabet(self) -> c3_alphabet.CharAlphabet:
        """the character alphabet for validating, encoding and decoding sequences"""
        return self._alphabet
6✔
3871

3872
    @property
    def align_len(self) -> int:
        """the length of the alignment, as set by the constructor"""
        return self._align_len
6✔
3876

3877
    @property
6✔
3878
    def offset(self) -> dict[str, int]:
6✔
3879
        """returns the offset of each sequence in the Alignment"""
3880
        return {name: self._offset.get(name, 0) for name in self.names}
6✔
3881

3882
    def __len__(self) -> int:
        """the alignment length, as given by ``align_len``"""
        return self.align_len
6✔
3884

3885
    @singledispatchmethod
    def __getitem__(self, index: str | int) -> AlignedDataViewABC:
        """returns the result of ``get_view`` for index"""
        return self.get_view(index)
6✔
3888

3889
    def get_seq_length(self, seqid: str) -> int:
        """number of elements in the ungapped sequence for seqid"""
        ungapped = self._get_ungapped(seqid)
        return len(ungapped)
6✔
3892

3893
    @singledispatchmethod
6✔
3894
    def get_view(
6✔
3895
        self,
3896
        seqid: str,
3897
        slice_record: c3_sequence.SliceRecord | None = None,
3898
    ) -> AlignedDataView:
3899
        """reurns view of aligned sequence data for seqid
3900

3901
        Parameters
3902
        ----------
3903
        seqid
3904
            sequence name
3905
        slice_record
3906
            slice record to use for slicing the data. If None, uses the
3907
            default slice record for the entire sequence.
3908
        """
3909
        return AlignedDataView(
6✔
3910
            parent=self,
3911
            seqid=seqid,
3912
            alphabet=self.alphabet,
3913
            slice_record=slice_record,
3914
        )
3915

3916
    @get_view.register
6✔
3917
    def _(self, seqid: int):
6✔
3918
        return self.get_view(self.names[seqid])
6✔
3919

3920
    def _get_gaps(self, seqid: str) -> numpy.ndarray:
6✔
3921
        if seqid not in self._gaps:
6✔
3922
            self._make_gaps_and_ungapped(seqid)
6✔
3923
        return self._gaps[seqid]
6✔
3924

3925
    def _get_ungapped(self, seqid: str) -> numpy.ndarray:
6✔
3926
        if seqid not in self._ungapped:
6✔
3927
            self._make_gaps_and_ungapped(seqid)
6✔
3928
        return self._ungapped[seqid]
6✔
3929

3930
    def get_gaps(self, seqid: str) -> numpy.ndarray:
        """returns the gap data for seqid"""
        gap_data = self._get_gaps(seqid)
        return gap_data
6✔
3933

3934
    def _make_gaps_and_ungapped(self, seqid: str) -> None:
6✔
3935
        if seqid in self._gaps and seqid in self._ungapped:
6✔
3936
            # job already done
3937
            return
×
3938

3939
        index = self._name_to_index[seqid]
6✔
3940
        ungapped, gaps = decompose_gapped_seq(
6✔
3941
            self._gapped[index],
3942
            alphabet=self.alphabet,
3943
        )
3944
        self._gaps[seqid] = gaps
6✔
3945
        self._ungapped[seqid] = ungapped
6✔
3946

3947
    def get_seq_array(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """Return ungapped sequence corresponding to seqid as an array of indices.

        Parameters
        ----------
        seqid
            sequence name
        start, stop, step
            define the segment to return. A stop beyond the end of the
            sequence is treated as the end, and stop <= start gives an
            empty array, as for standard slicing.

        Returns
        -------
        A new array with the dtype of the alphabet.

        Raises
        ------
        ValueError if start or stop is negative, or step is less than 1.

        Notes
        -----
        Assumes start/stop are in sequence coordinates. If seqid is in
        reversed_seqs, that sequence will be in plus strand orientation.
        It is client codes responsibility to ensure the coordinates are
        consistent with that.
        """
        start = start or 0
        stop = self.get_seq_length(seqid) if stop is None else stop
        step = step or 1

        if start < 0 or stop < 0 or step < 1:
            msg = f"{start=}, {stop=}, {step=} not >= 1"
            raise ValueError(msg)

        # copy the slice directly. Filling a buffer sized from start/stop/step
        # broadcast a single element whenever stop exceeded the sequence length.
        segment = self._get_ungapped(seqid)[start:stop:step]
        return numpy.array(segment, dtype=self.alphabet.dtype)
6✔
3978

3979
    def get_gapped_seq_array(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """Return sequence data corresponding to seqid as an array of indices.
        start/stop are in alignment coordinates. Includes gaps.

        Parameters
        ----------
        seqid
            sequence name
        start, stop, step
            define the segment to return. A stop beyond the alignment
            length is treated as the end, and stop <= start gives an
            empty array, as for standard slicing.

        Returns
        -------
        A new array with the dtype of the alphabet.

        Raises
        ------
        ValueError if start or stop is negative, or step is less than 1.
        """
        start = start or 0
        stop = self.align_len if stop is None else stop
        step = step or 1
        if start < 0 or stop < 0 or step < 1:
            msg = f"{start=}, {stop=}, {step=} not >= 1"
            raise ValueError(msg)

        row = self._gapped[self._name_to_index[seqid]]
        # copy the slice directly. Filling a buffer sized from start/stop/step
        # broadcast a single element whenever stop exceeded the alignment length.
        return numpy.array(row[start:stop:step], dtype=self.alphabet.dtype)
6✔
4003

4004
    def get_seq_str(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> str:
        """Return ungapped sequence corresponding to seqid as a string.
        start/stop are in sequence coordinates. Excludes gaps.

        Notes
        -----
        The array from ``get_seq_array`` is decoded by the alphabet.
        """
        indices = self.get_seq_array(seqid=seqid, start=start, stop=stop, step=step)
        return self.alphabet.from_indices(indices)
4017

4018
    def get_gapped_seq_str(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> str:
        """Return sequence corresponding to seqid as a string.
        start/stop are in alignment coordinates. Includes gaps.

        Notes
        -----
        The array from ``get_gapped_seq_array`` is decoded by the alphabet.
        """
        indices = self.get_gapped_seq_array(
            seqid=seqid,
            start=start,
            stop=stop,
            step=step,
        )
        return self.alphabet.from_indices(indices)
4031

4032
    def get_seq_bytes(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> bytes:
        """Return ungapped sequence corresponding to seqid as a bytes string.
        start/stop are in sequence coordinates. Excludes gaps.

        Notes
        -----
        This is the utf8 encoding of the ``get_seq_str`` result.
        """
        text = self.get_seq_str(seqid=seqid, start=start, stop=stop, step=step)
        return text.encode("utf8")
4045

4046
    def get_gapped_seq_bytes(
        self,
        *,
        seqid: str,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> bytes:
        """Return sequence corresponding to seqid as a bytes string.
        start/stop are in alignment coordinates. Includes gaps.

        Notes
        -----
        This is the utf8 encoding of the ``get_gapped_seq_str`` result.
        """
        text = self.get_gapped_seq_str(seqid=seqid, start=start, stop=stop, step=step)
        return text.encode("utf8")
4062

4063
    @extend_docstring_from(AlignedSeqsDataABC.get_ungapped)
    def get_ungapped(
        self,
        name_map: dict[str, str],
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> tuple[dict, dict]:
        # redesign
        # if gaps exist, don't go via gapped seq
        # convert alignment coords into sequence coords using the location.align_to_seq_index function
        # this means we will need to convert coordinates to a plus strand slice
        invalid = (start or 0) < 0 or (stop or 0) < 0 or (step or 1) <= 0
        if invalid:
            msg = f"{start=}, {stop=}, {step=} not >= 0"
            raise ValueError(msg)

        names = tuple(name_map.values())
        # copy the full rows for the selected names, then take the columns
        rows = numpy.empty(
            (len(name_map), self.align_len),
            dtype=self.alphabet.dtype,
        )
        for row_num, name in enumerate(names):
            rows[row_num] = self._gapped[self._name_to_index[name]]
        selected = rows[:, start:stop:step]

        # drop gap states, and missing states when the alphabet defines one
        seqs = {}
        for row_num, name in enumerate(names):
            seq = selected[row_num]
            keep = seq != self.alphabet.gap_index
            if self.alphabet.missing_index is not None:
                keep &= seq != self.alphabet.missing_index
            seqs[name] = seq[keep]

        # only offsets belonging to the selected names are reported
        offset = {
            name: value for name, value in self._offset.items() if name in names
        }
        make_kwargs = {
            "offset": offset,
            "name_map": name_map,
            "reversed_seqs": self._reversed,
        }
        return seqs, make_kwargs
4103

4104
    def add_seqs(
6✔
4105
        self,
4106
        seqs: dict[str, StrORArray],
4107
        force_unique_keys: bool = True,
4108
        offset: dict[str, int] | None = None,
4109
    ) -> AlignedSeqsData:
4110
        """Returns a new AlignedSeqsData object with added sequences.
4111

4112
        Parameters
4113
        ----------
4114
        seqs
4115
            dict of sequences to add {name: seq, ...}
4116
        force_unique_keys
4117
            if True, raises ValueError if any sequence names already exist in the collection
4118
        offset
4119
            dict of offsets relative to for the new sequences.
4120
        """
4121
        if force_unique_keys and any(name in self.names for name in seqs):
6✔
4122
            msg = "One or more sequence names already exist in collection"
6✔
4123
            raise ValueError(msg)
6✔
4124

4125
        new_seq_lens = {len(seq) for seq in seqs.values()}
6✔
4126
        if len(new_seq_lens) != 1 or new_seq_lens.pop() != self.align_len:
6✔
4127
            msg = "All sequences must be the same length as existing sequences"
6✔
4128
            raise ValueError(
6✔
4129
                msg,
4130
            )
4131

4132
        new_seqs = dict(zip(self.names, self._gapped, strict=False))
6✔
4133
        for name, seq in seqs.items():
6✔
4134
            seq = self.alphabet.to_indices(seq, validate=True)
6✔
4135
            seq.flags.writeable = False
6✔
4136
            new_seqs[name] = seq
6✔
4137

4138
        names = tuple(new_seqs.keys())
6✔
4139
        gapped = numpy.empty((len(names), self.align_len), dtype=self.alphabet.dtype)
6✔
4140
        for i, name in enumerate(names):
6✔
4141
            gapped[i] = new_seqs[name]
6✔
4142

4143
        return self.__class__(
6✔
4144
            gapped_seqs=gapped,
4145
            names=names,
4146
            alphabet=self.alphabet,
4147
            offset={**self._offset, **(offset or {})},
4148
            align_len=self.align_len,
4149
        )
4150

4151
    def to_alphabet(
6✔
4152
        self,
4153
        alphabet: c3_alphabet.AlphabetABC,
4154
        check_valid: bool = True,
4155
    ) -> Self:
4156
        """Returns a new AlignedSeqsData object with the same underlying data
4157
        with a new alphabet."""
4158
        if (
6✔
4159
            len(alphabet) == len(self.alphabet)
4160
            and len(
4161
                {
4162
                    (a, b)
4163
                    for a, b in zip(self.alphabet, alphabet, strict=False)
4164
                    if a != b
4165
                },
4166
            )
4167
            == 1
4168
        ):
4169
            # special case where mapping between dna and rna
4170
            return self.__class__(
6✔
4171
                gapped_seqs=self._gapped,
4172
                alphabet=alphabet,
4173
                offset=self._offset,
4174
                align_len=self.align_len,
4175
                names=self.names,
4176
            )
4177

4178
        gapped = numpy.empty(
6✔
4179
            (len(self.names), self.align_len),
4180
            dtype=self.alphabet.dtype,
4181
        )
4182

4183
        for i in range(len(self.names)):
6✔
4184
            seq_data = self._gapped[i]
6✔
4185
            as_new_alpha = self.alphabet.convert_seq_array_to(
6✔
4186
                seq=seq_data,
4187
                alphabet=alphabet,
4188
                check_valid=check_valid,
4189
            )
4190
            gapped[i] = as_new_alpha
6✔
4191

4192
        return self.__class__(
6✔
4193
            gapped_seqs=gapped,
4194
            alphabet=alphabet,
4195
            offset=self._offset,
4196
            names=self.names,
4197
        )
4198

4199
    def get_pos_range(
        self,
        names: PySeqStr,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """returns an array of the selected positions for names.

        Parameters
        ----------
        names
            sequence names, which define the row order of the result
        start, stop, step
            alignment coordinates of the columns to return. Only None means
            "not specified", so stop=0 selects no columns. A stop beyond the
            alignment length is treated as the end.

        Returns
        -------
        2D array with one row per name.

        Raises
        ------
        ValueError if start or stop is negative, or step is less than 1.
        """
        start = start or 0
        # "stop or align_len" would turn stop=0 into the whole alignment
        stop = self.align_len if stop is None else stop
        step = step or 1
        if start < 0 or stop < 0 or step < 1:
            msg = f"{start=}, {stop=}, {step=} not >= 1"
            raise ValueError(msg)

        indices = tuple(self._name_to_index[name] for name in names)
        # a single slice covers the full-range case too. The previous
        # length-based shortcut returned every column for ranges such as
        # start=1, stop=align_len + 1.
        return self._gapped[indices, start:stop:step]
6✔
4221

4222
    def get_positions(
6✔
4223
        self,
4224
        names: PySeqStr,
4225
        positions: typing.Sequence[int] | numpy.ndarray[numpy.integer],
4226
    ) -> numpy.ndarray[numpy.uint8]:
4227
        """returns alignment positions for names
4228

4229
        Parameters
4230
        ----------
4231
        names
4232
            series of sequence names
4233
        positions
4234
            indices lying within self
4235

4236
        Returns
4237
        -------
4238
            2D numpy.array, oriented by sequence
4239

4240
        Raises
4241
        ------
4242
        IndexError if a provided position is negative or
4243
        greater then alignment length.
4244
        """
4245
        if diff := set(names) - set(self.names):
6✔
4246
            msg = f"these names not present {diff}"
×
4247
            raise ValueError(msg)
×
4248

4249
        min_index, max_index = numpy.min(positions), numpy.max(positions)
6✔
4250
        if min_index < 0 or max_index > self.align_len:
6✔
4251
            msg = f"Out of range: {min_index=} and / or {max_index=}"
6✔
4252
            raise IndexError(msg)
6✔
4253

4254
        seq_indices = tuple(self._name_to_index[n] for n in names)
6✔
4255
        return self._gapped[numpy.ix_(seq_indices, positions)]
6✔
4256

4257
    def copy(self, **kwargs) -> Self:
        """shallow copy of self

        Notes
        -----
        kwargs are passed to constructor and will over-ride existing values.
        The arrays and dicts held by self are passed on as is, and the
        constructor is called with ``check=False`` unless over-ridden.
        """
        init_args = dict(
            gapped_seqs=self._gapped,
            names=self._names,
            alphabet=self._alphabet,
            ungapped_seqs=self._ungapped,
            gaps=self._gaps,
            offset=self._offset,
            align_len=self._align_len,
            check=False,
            reversed_seqs=self._reversed,
        )
        init_args.update(kwargs)
        return self.__class__(**init_args)
×
4278

4279
    def variable_positions(
        self,
        names: PySeqStr,
        start: OptInt = None,
        stop: OptInt = None,
        step: OptInt = None,
    ) -> numpy.ndarray:
        """returns absolute indices of positions that have more than one state

        Parameters
        ----------
        names
            selected seqids
        start
            absolute start
        stop
            absolute stop
        step
            step

        Returns
        -------
        Absolute indices (as distinct from an index relative to start) of
        variable positions. An empty array is returned if there are fewer
        than two names or no positions are selected.
        """
        start = start or 0
        if len(names) < 2:
            return numpy.array([])

        array_seqs = self.get_pos_range(names, start=start, stop=stop, step=step)
        if array_seqs.size == 0:
            return numpy.array([])

        # a column is variable if any sequence differs from the first
        is_variable = (array_seqs != array_seqs[0]).any(axis=0)
        # column i of the selection is alignment position start + i * step,
        # so the relative indices must be scaled as well as shifted
        return numpy.flatnonzero(is_variable) * (step or 1) + start
6✔
4314

4315
    def get_hash(self, seqid: str) -> str | None:
        """returns hash of seqid

        Notes
        -----
        The hash is computed from the full gapped array for seqid and
        cached, so later calls return the stored value.
        """
        cache = self._hashes
        if seqid in cache:
            return cache[seqid]

        gapped = self.get_gapped_seq_array(seqid=seqid)
        cache[seqid] = array_hash64(gapped)
        return cache[seqid]
6✔
4321

4322

4323
class AlignedDataViewABC(c3_sequence.SeqViewABC):
    """Abstract interface for a view of a single aligned sequence.

    Adds to ``SeqViewABC`` the abstract members declared below: access to a
    sequence view, the indel map, the slice record and the gapped
    representations (str, array, bytes) of the sequence.
    """

    # no per-instance attributes are declared on the abstract class
    __slots__ = ()

    @abstractmethod
    def get_seq_view(self) -> c3_sequence.SeqViewABC:
        """subclasses return a ``SeqViewABC`` for the sequence"""
        ...

    @property
    @abstractmethod
    def map(self) -> IndelMap:
        """subclasses return the ``IndelMap`` for the sequence"""
        ...

    @property
    @abstractmethod
    def slice_record(self) -> c3_sequence.SliceRecordABC:
        """subclasses return the slice record of the view"""
        ...

    @property
    @abstractmethod
    def gapped_str_value(self) -> str:
        """subclasses return the gapped sequence as a string"""
        ...

    @property
    @abstractmethod
    def gapped_array_value(self) -> numpy.ndarray:
        """subclasses return the gapped sequence as a numpy array"""
        ...

    @property
    @abstractmethod
    def gapped_bytes_value(self) -> bytes:
        """subclasses return the gapped sequence as bytes"""
        ...
4348

4349

4350
class AlignedDataView(c3_sequence.SeqViewABC):
    """
    A view class for ``AlignedSeqsData``, providing methods for different representations
    of a single sequence.

    Notes
    -----
    ``str_value`` / ``array_value`` are not complemented, but can be reversed. The latter
    is done by the ``Aligned`` object which has a moltype. The ``slice_record`` attribute
    is shared with the containing ``Alignment``.
    """

    __slots__ = (
        "_offset",
        "_parent_len",
        "_seqid",
        "_slice_record",
        "alphabet",
        "parent",
    )

    def __init__(
        self,
        *,
        parent: AlignedSeqsDataABC,
        seqid: str,
        alphabet: c3_alphabet.AlphabetABC,
        slice_record: OptSliceRecord = None,
    ) -> None:
        """
        Parameters
        ----------
        parent
            the aligned sequence storage that holds the data for seqid
        seqid
            name of the sequence in parent
        alphabet
            alphabet used to convert index arrays into strings
        slice_record
            slice record for the view. If None, a record spanning the full
            alignment length of parent is created.
        """
        self.parent = parent
        self._seqid = seqid
        self.alphabet = alphabet
        self._parent_len = parent.align_len
        self._slice_record = (
            slice_record
            if slice_record is not None
            else c3_sequence.SliceRecord(parent_len=self._parent_len)
        )

    @property
    def slice_record(self) -> c3_sequence.SliceRecordABC:
        """the slice record for this view"""
        return self._slice_record

    @slice_record.setter
    def slice_record(self, value: c3_sequence.SliceRecordABC) -> None:
        self._slice_record = value

    @property
    def offset(self) -> int:
        """the slice offset of this view"""
        return self.slice_record.offset

    @property
    def seqid(self) -> str:
        """the name of the sequence"""
        return self._seqid

    @property
    def parent_len(self) -> int:
        """the alignment length of the parent storage"""
        return self._parent_len

    @property
    def map(self) -> IndelMap:
        """indel map (gaps) for the sequence"""
        imap = self._parent_map()
        start, stop, step = (
            self.slice_record.start,
            self.slice_record.stop,
            self.slice_record.step,
        )
        return imap[start:stop:step]

    def _parent_map(self) -> IndelMap:
        """returns the indel map of the complete (unsliced) sequence"""
        gap_pos_gap_length = self.parent.get_gaps(self.seqid)
        if gap_pos_gap_length.size > 0:
            # column 0 is the gap position, column 1 the cumulative gap length
            gap_pos = numpy.array(gap_pos_gap_length[:, 0], dtype=int)
            cum_gap_lengths = numpy.array(gap_pos_gap_length[:, 1], dtype=int)
        else:
            gap_pos, cum_gap_lengths = (
                numpy.array([], dtype=int),
                numpy.array([], dtype=int),
            )
        return IndelMap(
            gap_pos=gap_pos,
            cum_gap_lengths=cum_gap_lengths,
            parent_length=self.parent.get_seq_length(self.seqid),
        )

    @property
    def str_value(self) -> str:
        """returns the string value of the ungapped sequence"""
        return self.alphabet.from_indices(self.array_value)

    @property
    def gapped_str_value(self) -> str:
        """returns the string value of the gapped sequence"""
        return self.alphabet.from_indices(self.gapped_array_value)

    @property
    def array_value(self) -> numpy.ndarray:
        """returns the numpy array of indices for the ungapped sequence"""
        # self.map constructs a new IndelMap on every access, so build it once
        imap = self.map
        sr = self.slice_record
        value = self.parent.get_seq_array(
            seqid=self.seqid,
            start=imap.get_seq_index(sr.plus_start),
            stop=imap.get_seq_index(sr.plus_stop),
            step=imap.get_seq_index(sr.plus_step),
        )
        return value[::-1] if sr.is_reversed else value

    @property
    def gapped_array_value(self) -> numpy.ndarray:
        """returns the numpy array of indices for the gapped sequence"""
        value = self.parent.get_gapped_seq_array(
            seqid=self.seqid,
            start=self.slice_record.plus_start,
            stop=self.slice_record.plus_stop,
            step=self.slice_record.plus_step,
        )
        return value[::-1] if self.slice_record.is_reversed else value

    @property
    def bytes_value(self) -> bytes:
        """returns the bytes value of the ungapped sequence"""
        return self.str_value.encode("utf8")

    @property
    def gapped_bytes_value(self) -> bytes:
        """returns the bytes value of the gapped sequence"""
        return self.gapped_str_value.encode("utf8")

    def __str__(self) -> str:
        return self.gapped_str_value

    def __array__(
        self,
        dtype: numpy.dtype | None = None,
        copy: bool | None = None,
    ) -> numpy.ndarray[int]:
        """returns the gapped array, cast to dtype if one is provided"""
        arr = self.gapped_array_value
        # unstructured numpy.dtype instances are falsy (their len() is 0), so
        # the dtype must be compared against None rather than tested for truth
        if dtype is not None:
            arr = arr.astype(dtype)
        return arr

    def __bytes__(self) -> bytes:
        return self.gapped_bytes_value

    def __getitem__(self, segment) -> Self:
        """returns a new view on the same parent with the sliced slice record"""
        return self.__class__(
            parent=self.parent,
            seqid=self.seqid,
            alphabet=self.alphabet,
            slice_record=self.slice_record[segment],
        )

    def __repr__(self) -> str:
        get_seq_array = self.parent.get_seq_array
        from_indices = self.alphabet.from_indices
        # the preview is of the ungapped sequence, so its own length (not the
        # alignment length) determines where the tail starts
        seq_len = self.parent.get_seq_length(self.seqid)
        if seq_len > 15:
            # convert each index array separately, from_indices operates on
            # index arrays and not on their string representation
            head = from_indices(get_seq_array(seqid=self.seqid, start=0, stop=10))
            tail = from_indices(get_seq_array(seqid=self.seqid, start=seq_len - 5))
            seq_preview = f"{head}...{tail}"
        else:
            seq_preview = from_indices(get_seq_array(seqid=self.seqid))
        return (
            f"{self.__class__.__name__}(seqid={self.seqid!r}, map={self.map!r}, parent={seq_preview!r}, "
            f"slice_record={self.slice_record.__repr__()})"
        )

    def parent_coords(
        self, *, seq_coords: bool = False, apply_offset: bool = False
    ) -> tuple[str, int, int, int]:
        """returns seqid, start, stop, strand on the parent

        Parameters
        ----------
        seq_coords
            if True, parent is the ungapped sequence
        apply_offset
            if True and seq_coords, adds annotation offset from parent
        """
        strand = -1 if self.is_reversed else 1
        if not seq_coords:
            return (
                self.seqid,
                self.slice_record.parent_start,
                self.slice_record.parent_stop,
                strand,
            )

        # AlignedDataView.parent_coords uses its indel map, etc..
        # to return the necessary coordinates

        # we want the coordinates on the parent sequence, which means we
        # need to use the parent's IndelMap for finding the correct indices.
        parent_map = self._parent_map()
        start = parent_map.get_seq_index(self.slice_record.parent_start)
        stop = parent_map.get_seq_index(self.slice_record.parent_stop)
        offset = self.parent_offset if apply_offset else 0

        return self.seqid, start + offset, stop + offset, strand

    def copy(self, sliced: bool = False) -> Self:
        """just returns self"""
        return self

    def _get_init_kwargs(self) -> dict:
        """returns the kwargs needed to re-instantiate the view"""
        return {
            "parent": self.parent,
            "seqid": self.seqid,
            "alphabet": self.alphabet,
            "slice_record": self.slice_record,
        }

    def get_seq_view(self) -> c3_sequence.SeqViewABC:
        """returns view of ungapped sequence data for seqid"""
        # we want the parent coordinates in sequence coordinates
        # parent_coords does not account for the stride
        seqid, start, stop, _ = self.parent_coords(seq_coords=True, apply_offset=False)
        parent_len = self.parent.get_seq_length(seqid)
        sr = c3_sequence.SliceRecord(
            start=start,
            stop=stop,
            parent_len=parent_len,
        )[:: self.slice_record.step]

        return SeqDataView(
            parent=self.parent,
            seqid=seqid,
            alphabet=self.alphabet,
            parent_len=parent_len,
            slice_record=sr,
        )

    @property
    def is_reversed(self) -> bool:
        """whether the sliced view is reversed relative to the parent"""
        if self.seqid in self.parent.reversed_seqs:
            # seqid is reversed relative to everything else
            # hence is_reversed is the opposite of the slice record
            return not self.slice_record.is_reversed
        return self.slice_record.is_reversed
6✔
4592

4593

4594
def make_gap_filter(template, gap_fraction, gap_run):
    """Returns f(seq) -> True if no gap runs and acceptable gap fraction.

    Calculations relative to template.

    Parameters
    ----------
    template
        object with a ``gap_vector()`` method, compared against each seq
    gap_fraction
        maximum allowed fraction of positions that either have a gap in the
        template but not in the seq or in the seq but not in the template
    gap_run
        a seq is rejected if it contains a run of this many consecutive
        positions that are gaps in the seq only, or in the template only

    Notes
    -----
    template and seq must both provide ``gap_vector()`` and seq must support
    ``len()``.
    """
    template_gaps = numpy.array(template.gap_vector())
    # these depend only on the template / gap_run, so compute them once
    template_not_gaps = numpy.logical_not(template_gaps)
    bad_run = b"\x01" * gap_run

    def has_bad_run(mask) -> bool:
        """True if mask contains bad_run consecutive True values"""
        return bad_run in mask.astype(numpy.uint8).tobytes()

    def result(seq) -> bool:
        """Returns True if seq adheres to the gap threshold and gap fraction."""
        seq_gaps = numpy.array(seq.gap_vector())
        # check if gap amount bad, counting the differences in numpy rather
        # than iterating over the array elements in python
        num_diff = numpy.count_nonzero(seq_gaps != template_gaps)
        if num_diff / float(len(seq)) > gap_fraction:
            return False
        # check if gap runs bad
        if has_bad_run(numpy.logical_and(seq_gaps, template_not_gaps)):
            return False
        return not has_bad_run(
            numpy.logical_and(template_gaps, numpy.logical_not(seq_gaps)),
        )

    return result
6✔
4624

4625

4626
class _IndexableSeqs:
    """container that is created by SequenceCollection and Alignment instances"""

    def __init__(
        self,
        parent: SequenceCollection | Alignment,
        make_seq: typing.Callable[[str], c3_sequence.Sequence | Aligned],
    ) -> None:
        """
        Parameters
        ----------
        parent
            the SequenceCollection or Alignment instance that owns this
            container
        make_seq
            callable (a method on parent) that returns the sequence object
            for a given seqid
        """
        self._make_seq = make_seq
        self.parent = parent

    @singledispatchmethod
    def __getitem__(
        self,
        key: str | int | slice,
    ) -> c3_sequence.Sequence | Aligned:
        # fallback for key types without a registered handler
        raise TypeError(f"indexing not supported for {type(key)}, try .take_seqs()")

    @__getitem__.register
    def _(self, key: int) -> c3_sequence.Sequence | Aligned:
        # translate the position into a name and dispatch again
        seqid = self.parent.names[key]
        return self[seqid]

    @__getitem__.register
    def _(self, key: str) -> c3_sequence.Sequence | Aligned:
        make_seq = self._make_seq
        return make_seq(key)

    def __repr__(self) -> str:
        first_name = self.parent.names[0]
        first_seq = self[first_name]
        num_others = self.parent.num_seqs - 1
        return f"({first_seq!r}, + {num_others} seqs)"

    def __len__(self) -> int:
        parent = self.parent
        return parent.num_seqs

    def __iter__(self):
        # one sequence object per name, in the order of parent.names
        yield from (self._make_seq(name) for name in self.parent.names)
6✔
4671

4672

4673
class Alignment(SequenceCollection):
6✔
4674
    """A collection of aligned sequences.
4675

4676
    Notes
4677
    -----
4678
    Should be constructed using ``make_aligned_seqs()``.
4679
    """
4680

4681
    def __init__(
        self,
        seqs_data: AlignedSeqsDataABC,  # seqs_data
        slice_record: OptSliceRecord = None,
        **kwargs,
    ) -> None:
        """
        Parameters
        ----------
        seqs_data
            the aligned sequence storage
        slice_record
            slice record for the alignment. If None, a record spanning the
            full alignment length of seqs_data is created.
        kwargs
            passed through to the superclass constructor
        """
        super().__init__(seqs_data=seqs_data, **kwargs)
        if slice_record is None:
            slice_record = c3_sequence.SliceRecord(
                parent_len=self._seqs_data.align_len,
            )
        self._slice_record = slice_record
        # populated on first access of array_seqs
        self._array_seqs = None
6✔
4694

4695
    def _post_init(self) -> None:
        """creates the indexable container that makes the aligned sequences"""
        make_seq = self._make_aligned
        self._seqs = _IndexableSeqs(self, make_seq=make_seq)
6✔
4697

4698
    def __eq__(self, other: object) -> bool:
        """equal if the superclass comparison holds and slice records match"""
        base_result = super().__eq__(other)
        if not base_result:
            return base_result
        return self._slice_record == other._slice_record
6✔
4700

4701
    def __ne__(self, other: object) -> bool:
        """the negation of ``self == other``"""
        is_equal = self == other
        return not is_equal
6✔
4703

4704
    @property
    def storage(self) -> AlignedSeqsDataABC:
        """the aligned sequence storage instance of the collection"""
        seqs_data = self._seqs_data
        return seqs_data

    @storage.setter
    def storage(self, value: object) -> None:
        # storage is read-only once the alignment has been created
        raise TypeError("storage cannot be set after initialisation")
6✔
4714

4715
    @property
    def modified(self) -> bool:
        """collection is a modification of underlying storage"""
        # include changed seq names?
        record = self._slice_record
        storage = self.storage
        name_map = self.name_map

        # the slice does not start at 0 or does not span the stored alignment
        changed_slice = record.start != 0 or len(record) != storage.align_len
        # the stored names referred to differ from those in the storage
        changed_members = set(name_map.values()) != set(storage.names)
        # the names in use differ from the stored names they map to
        changed_names = name_map.keys() != set(name_map.values())
        return changed_slice or changed_members or changed_names
4728

4729
    def _get_init_kwargs(self) -> dict:
        """returns the kwargs needed to re-instantiate the object

        Notes
        -----
        The name map and info are copied, all other values are the objects
        held by this instance.
        """
        kwargs = {"seqs_data": self._seqs_data, "moltype": self.moltype}
        kwargs["name_map"] = dict(self._name_map)
        kwargs["info"] = self.info.copy()
        kwargs["annotation_db"] = self._annotation_db
        kwargs["slice_record"] = self._slice_record
        kwargs["source"] = self.source
        return kwargs
4740

4741
    @singledispatchmethod
    def __getitem__(self, index) -> Self:
        # fallback for index types without a registered handler
        raise NotImplementedError(f"__getitem__ not implemented for {type(index)}")

    @__getitem__.register
    def _(self, index: str) -> Self:
        # a name selects a single member of self.seqs
        seqs = self.seqs
        return seqs[index]

    @__getitem__.register
    def _(self, index: int) -> Self:
        # an alignment position, the result shares the underlying storage
        sliced = self._slice_record[index]
        init_kwargs = {**self._get_init_kwargs(), "slice_record": sliced}
        return self.__class__(**init_kwargs)

    @__getitem__.register
    def _(self, index: slice) -> Self:
        sliced = self._slice_record[index]
        init_kwargs = {**self._get_init_kwargs(), "slice_record": sliced}
        if sliced.plus_step > 1:
            # we retain the annotation database only for "simple" slices
            init_kwargs.pop("annotation_db", None)

        return self.__class__(**init_kwargs)

    @__getitem__.register
    def _(self, index: FeatureMap) -> Self:
        mapped = self._mapped(index)
        return mapped

    @__getitem__.register
    def _(self, index: Feature):
        # only features whose parent is this exact object can be applied
        if index.parent is self:
            return index.get_slice()

        msg = "This feature applied to the wrong sequence / alignment"
        raise ValueError(msg)
6✔
4778

4779
    def __repr__(self) -> str:
        """summary with dimensions, moltype label and a preview of the seqs

        Notes
        -----
        At most 3 sequences are previewed, each truncated to 10 characters.
        """
        max_seqs = 3
        max_chars = 10
        names = self.names
        previews = []
        for name in names[:max_seqs]:
            text = str(self.seqs[name])
            if len(text) > max_chars:
                text = text[:max_chars] + "..."
            previews.append(f"{name}[{text}]")

        if len(names) > max_seqs:
            # indicates there are more sequences than displayed
            previews.append("...")

        seqs = ", ".join(previews)
        return f"{len(self.names)} x {len(self)} {self.moltype.label} alignment: {seqs}"
6✔
4794

4795
    def __len__(self) -> int:
        """the length of the slice record of the alignment"""
        record = self._slice_record
        return len(record)
6✔
4797

4798
    def __array__(
        self,
        dtype: numpy.dtype | None = None,
        copy: bool | None = None,
    ) -> numpy.ndarray[int]:
        """returns ``array_seqs``, cast to dtype if one is provided

        Parameters
        ----------
        dtype
            if not None, the result is ``array_seqs`` converted to this dtype
        copy
            accepted as part of the numpy array interface, not used
        """
        arr = self.array_seqs
        # compare against None, unstructured numpy.dtype instances are falsy
        if dtype is not None:
            arr = arr.astype(dtype)
        return arr
6✔
4804

4805
    def _make_aligned(self, seqid: str) -> Aligned:
        """returns the Aligned instance for seqid

        Notes
        -----
        The view is requested from the storage using the name seqid maps to
        in the name map (seqid itself if it is not in the map) and the slice
        record of this alignment. The result is named seqid and is assigned
        the annotation db of this alignment.
        """
        storage_name = self._name_map.get(seqid, seqid)
        view = self._seqs_data.get_view(
            storage_name,
            slice_record=self._slice_record,
        )
        result = Aligned(data=view, moltype=self.moltype, name=seqid)
        result.annotation_db = self._annotation_db
        return result
6✔
4813

4814
    @property
    def positions(self) -> list[list[str]]:
        """the alignment positions as lists of characters

        Notes
        -----
        Each entry of ``array_positions`` is converted using the most
        degenerate alphabet of the moltype.
        """
        # refactor: design
        # possibly rename to str_positions since we have array_positions
        alphabet = self.moltype.most_degen_alphabet()
        to_chars = alphabet.from_indices
        columns = []
        for column in self.array_positions:
            columns.append(list(to_chars(column)))
        return columns
6✔
4820

4821
    @property
    def array_seqs(self) -> numpy.ndarray:
        """Returns a numpy array of sequences, axis 0 is seqs in order
        corresponding to names

        Notes
        -----
        The array is computed on first access, made read-only and cached.
        """
        cached = self._array_seqs
        if cached is not None:
            return cached

        record = self._slice_record
        storage_names = [self._name_map[name] for name in self.names]
        seqs = self._seqs_data.get_pos_range(
            names=storage_names,
            start=record.plus_start,
            stop=record.plus_stop,
            step=record.plus_step,
        )
        if self.moltype.is_nucleic and record.is_reversed:
            to_rc = self.moltype.rc
            # work on a copy so the array from the storage is not modified
            seqs = seqs.copy()
            seqs.flags.writeable = True
            for row, seq in enumerate(seqs):
                seqs[row] = to_rc(seq)

        seqs.flags.writeable = False  # make sure data is immutable
        self._array_seqs = seqs
        return self._array_seqs
6✔
4845

4846
    @property
    def array_positions(self) -> numpy.ndarray:
        """Returns a numpy array of positions, axis 0 is alignment positions
        columns in order corresponding to names.

        Notes
        -----
        This is the transpose of ``array_seqs``.
        """
        by_seq = self.array_seqs
        return by_seq.transpose()
6✔
4851

4852
    def get_seq(
        self,
        seqname: str,
        copy_annotations: bool = False,
    ) -> c3_sequence.Sequence:
        """Return a Sequence object for the specified seqname.

        Parameters
        ----------
        seqname
            name of the sequence to return
        copy_annotations
            if True and this collection has an annotation db, the returned
            sequence is a copy with a new annotation db of the same type that
            is updated from this collections db for seqname only, so it is
            decoupled from this collection. Otherwise, the returned sequence
            is assigned this collections annotation db.
        """
        seq = self.seqs[seqname].seq
        if not (copy_annotations and self._annotation_db):
            seq.annotation_db = self._annotation_db
            return seq

        # we need to copy the sequence too to break the link to self.annotation_db
        decoupled = seq.copy(exclude_annotations=True)
        decoupled.annotation_db = type(self.annotation_db)()
        decoupled.annotation_db.update(annot_db=self.annotation_db, seqids=seqname)
        return decoupled
6✔
4879

4880
    def get_gapped_seq(
        self,
        seqname: str,
        recode_gaps: bool = False,
    ) -> c3_sequence.Sequence:
        """Return a gapped Sequence object for the specified seqname.

        Parameters
        ----------
        seqname
            sequence name
        recode_gaps
            if True, gap characters are replaced by the most general
            ambiguity code, e.g. N for DNA and RNA

        Notes
        -----
        This method breaks the connection to the annotation database.
        """
        moltype = self.moltype
        gapped = self.seqs[seqname].gapped_seq
        if not recode_gaps:
            return moltype.make_seq(seq=gapped, name=seqname)

        # the ambiguity code is derived from all the members of the moltype
        members = list(moltype)
        ambiguity = moltype.degenerate_from_seq(members)
        recoded = str(gapped)
        for gap_char in moltype.gaps:
            recoded = recoded.replace(gap_char, ambiguity)

        return moltype.make_seq(seq=recoded, name=seqname)
6✔
4909

4910
    def rc(self):
        """Returns the reverse complement of all sequences in the alignment.
        A synonym for reverse_complement.

        Notes
        -----
        The result is a new instance that differs only in having the reversed
        slice record.
        """
        reversed_record = self._slice_record[::-1]
        kwargs = {**self._get_init_kwargs(), "slice_record": reversed_record}
        return self.__class__(**kwargs)
6✔
4917

4918
    def alignment_quality(self, app_name: str = "ic_score", **kwargs):
        """
        Computes the alignment quality using the indicated app

        Parameters
        ----------
        app_name
            name of an alignment score calculating app, e.g. 'ic_score',
            'cogent3_score', 'sp_score'

        kwargs
            keyword arguments to be passed to the app. Use
            ``cogent3.app_help(app_name)`` to see the available options.

        Returns
        -------
        float or a NotCompleted instance if the score could not be computed
        """
        # the app is created per call and applied to this alignment
        return cogent3.get_app(app_name, **kwargs)(self)
6✔
4938

4939
    def rename_seqs(self, renamer: Callable[[str], str]):
        """Returns new alignment with renamed sequences.

        Notes
        -----
        If the sequence array of this alignment has already been computed, it
        is shared with the result.
        """
        renamed = super().rename_seqs(renamer)
        cached = self._array_seqs
        if cached is None:
            return renamed

        renamed._array_seqs = cached
        return renamed
6✔
4947

4948
    def iter_positions(
        self,
        pos_order: list | None = None,
    ) -> Iterator[list[str]]:
        """Iterates over positions in the alignment, in order.

        Parameters
        ----------
        pos_order
            indices specifying the column order, can be any iterable of
            integers (e.g. a list or numpy array). If None, the positions
            are iterated in order. If empty, nothing is yielded.

        Returns
        -------
        yields lists of elements for each position (column) in the alignment,
        with elements in the order of ``self.names``
        """
        # refactor: array
        # this could also iter columns of indices as a numpy array - could be an optional arg
        # refactor: add motif_length argument

        # test against None explicitly, the truth value of a numpy array with
        # more than one element is ambiguous and raises an exception
        if pos_order is None:
            pos_order = range(len(self))

        # make each sequence once instead of once per column
        seqs = [self[name] for name in self.names]
        for pos in pos_order:
            yield [str(seq[pos]) for seq in seqs]
6✔
4970

4971
    def get_position_indices(
        self,
        f: Callable[[str], bool],
        negate: bool = False,
    ) -> list[int]:
        """Returns list of column indices for which f(col) is True.

        Parameters
        ----------
        f
          function that returns true/false given an alignment position
        negate
          if True, not f() is used
        """
        # refactor:
        # type hint for f
        # implement native
        predicate = f
        if negate:
            predicate = negate_condition(f)

        # refactor: design
        # use array_positions here
        selected = []
        for index, column in enumerate(self.positions):
            if predicate(column):
                selected.append(index)
        return selected
6✔
4993

4994
    def take_positions(
        self,
        cols: list[int] | numpy.ndarray[int],
        negate: bool = False,
    ) -> Self:
        """Returns new Alignment containing only specified positions.

        Parameters
        ----------
        cols
            list of column indices to keep
        negate
            if True, all columns except those in cols are kept

        Notes
        -----
        The result is built on new storage created from the selected columns.
        It does not have the annotation db or the slice record of this
        alignment.
        """
        # refactor: array - use array operations throughout method
        if negate:
            excluded = set(cols)
            cols = [i for i in range(len(self)) if i not in excluded]

        # keyed by the name used in the storage
        selected = {}
        for aligned in self.seqs:
            storage_name = self.name_map[aligned.name]
            selected[storage_name] = numpy.array(aligned).take(cols)

        init_kwargs = self._get_init_kwargs()
        init_kwargs["seqs_data"] = self._seqs_data.from_seqs(
            data=selected,
            alphabet=self.moltype.most_degen_alphabet(),
        )
        for dropped in ("annotation_db", "slice_record"):
            init_kwargs.pop(dropped, None)
        return self.__class__(**init_kwargs)
6✔
5026

5027
    def take_positions_if(self, f: Callable[[str], bool], negate: bool = False) -> Self:
        """Returns new Alignment containing cols where f(col) is True.

        Parameters
        ----------
        f
            function that returns true/false given an alignment position
        negate
            passed to ``get_position_indices``
        """
        indices = self.get_position_indices(f, negate=negate)
        return self.take_positions(indices)
6✔
5030

5031
    def get_gap_array(self, include_ambiguity: bool = True) -> numpy.ndarray:
        """returns bool array with gap state True, False otherwise

        Parameters
        ----------
        include_ambiguity
            if True, positions equal to the missing index of the most
            degenerate alphabet are also True
        """
        alphabet = self.moltype.most_degen_alphabet()
        is_gap = self.array_seqs == alphabet.gap_index
        if not include_ambiguity:
            return is_gap

        is_missing = self.array_seqs == alphabet.missing_index
        return is_gap | is_missing
6✔
5045

5046
    def iupac_consensus(self, allow_gap: bool = True) -> str:
        """Returns string containing IUPAC consensus sequence of the alignment.

        Parameters
        ----------
        allow_gap
            if False, the gap characters of the moltype are removed from each
            column before the column is converted
        """
        to_degenerate = self.moltype.degenerate_from_seq
        ignored = set() if allow_gap else set(self.moltype.gaps)
        # one character per column, from the distinct states in that column
        return "".join(
            [
                to_degenerate("".join(set(column) - ignored))
                for column in self.iter_positions()
            ],
        )
6✔
5055

5056
    def majority_consensus(self) -> c3_sequence.Sequence:
        """Returns consensus sequence containing most frequent item at each
        position.

        Notes
        -----
        The state for a position is the mode of the characters in that
        column across all sequences.
        """
        # transpose the sequence strings into columns
        columns = zip(*(str(seq) for seq in self.seqs), strict=False)
        modes = [CategoryCounter(column).mode for column in columns]
        return self.moltype.make_seq(seq="".join(modes))
6✔
5066

5067
    def counts_per_pos(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        warn: bool = False,
    ) -> MotifCountsArray:
        """return MotifCountsArray of counts per position

        Parameters
        ----------
        motif_length
            number of elements per character.
        include_ambiguity
            if True, motifs containing ambiguous characters from the seq moltype
            are included. No expansion of those is attempted.
        allow_gap
            if True, motifs containing a gap character are included.
        warn
            warns if motif_length > 1 and alignment trimmed to produce
            motif columns

        Notes
        -----
        Motifs are counted in consecutive, non-overlapping windows of
        motif_length alignment columns. Trailing columns that do not fill a
        complete window are not counted.
        """
        # refactor: performance
        # use self.variable_positions and a numba decorated
        # function for counting k-mers the latter should allow returning the
        # first allowed state for when position is not variable

        align_len = len(self._slice_record)
        # the largest multiple of motif_length that fits, only used to
        # report how many columns are trimmed
        length = (align_len // motif_length) * motif_length
        if warn and align_len != length:
            warnings.warn(f"trimmed {align_len - length}", UserWarning, stacklevel=2)

        # presumably the sequences as strings -- to_dict() is defined elsewhere
        data = list(self.to_dict().values())
        alpha = self.moltype.alphabet.get_kmer_alphabet(motif_length)
        all_motifs = set()
        # motifs containing any of these characters are dropped from the result
        exclude_chars = set()
        if not allow_gap:
            exclude_chars.update(self.moltype.gap)

        if not include_ambiguity and self.moltype.degen_alphabet:
            # only codes that represent more than one state are excluded
            ambigs = [c for c, v in self.moltype.ambiguities.items() if len(v) > 1]
            exclude_chars.update(ambigs)

        result = []
        # one counter per complete window of motif_length columns
        for i in range(0, align_len - motif_length + 1, motif_length):
            counts = CategoryCounter([s[i : i + motif_length] for s in data])
            all_motifs.update(list(counts))
            result.append(counts)

        if all_motifs:
            # extend the motifs with those that are not in both the alphabet
            # and the observed motifs
            # NOTE(review): ^ is the symmetric difference, so it also contains
            # alphabet members that were never observed -- confirm this is
            # intended rather than the observed motifs missing from alpha
            alpha += tuple(sorted(set(alpha) ^ all_motifs))

        if exclude_chars:
            # this additional clause is required for the bytes moltype
            # That moltype includes '-' as a character
            alpha = [m for m in alpha if not (set(m) & exclude_chars)]

        # convert each counter into a list of counts ordered by alpha
        for i, counts in enumerate(result):
            result[i] = counts.tolist(alpha)

        return MotifCountsArray(result, alpha)
6✔
5128

5129
    def probs_per_pos(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        warn: bool = False,
    ) -> MotifFreqsArray:
        """Motif frequencies for each alignment position.

        Parameters
        ----------
        motif_length
            forwarded to ``counts_per_pos``
        include_ambiguity
            forwarded to ``counts_per_pos``
        allow_gap
            forwarded to ``counts_per_pos``
        warn
            forwarded to ``counts_per_pos``

        Returns
        -------
        The per-position counts converted with ``to_freq_array()``.
        """
        # all the counting work is delegated; only the conversion happens here
        options = {
            "motif_length": motif_length,
            "include_ambiguity": include_ambiguity,
            "allow_gap": allow_gap,
            "warn": warn,
        }
        return self.counts_per_pos(**options).to_freq_array()
5144

5145
    def entropy_per_pos(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        warn: bool = False,
    ) -> numpy.ndarray:
        """Shannon entropy for each alignment position.

        Parameters
        ----------
        motif_length
            forwarded to ``probs_per_pos``
        include_ambiguity
            forwarded to ``probs_per_pos``
        allow_gap
            forwarded to ``probs_per_pos``
        warn
            forwarded to ``probs_per_pos``

        Returns
        -------
        The value of ``entropy()`` called on the per-position frequencies.
        """
        options = {
            "motif_length": motif_length,
            "include_ambiguity": include_ambiguity,
            "allow_gap": allow_gap,
            "warn": warn,
        }
        return self.probs_per_pos(**options).entropy()
5163

5164
    def counts_per_seq(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        exclude_unobserved: bool = False,
        warn: bool = False,
    ) -> MotifCountsArray:
        """Counts of non-overlapping motifs for each sequence.

        Parameters
        ----------
        motif_length
            number of elements per character.
        include_ambiguity
            if True, motifs containing ambiguous characters
            from the seq moltype are included. No expansion of those is attempted.
        allow_gap
            if True, motifs containing a gap character are included.
        exclude_unobserved
            if False, every member of the moltype's k-mer alphabet is included
            as a column, whether or not it was observed.
        warn
            warns if motif_length > 1 and alignment trimmed to produce
            motif columns

        Returns
        -------
        A MotifCountsArray with one row per sequence name. If the alignment
        is shorter than motif_length, an all-zero array over the moltype's
        states is returned. If there are no motifs at all (nothing observed
        and exclude_unobserved=True), None is returned.
        """
        aln_len = len(self)
        usable = (aln_len // motif_length) * motif_length
        if usable == 0:
            # not even one complete motif, report zero counts
            states = list(self.moltype)
            zeros = numpy.zeros((len(self.names), len(states)), dtype=int)
            return MotifCountsArray(zeros, states, row_indices=self.names)

        if warn and aln_len != usable:
            warnings.warn(f"trimmed {aln_len - usable}", UserWarning, stacklevel=2)

        per_seq = [
            self.get_gapped_seq(name).counts(
                motif_length=motif_length,
                include_ambiguity=include_ambiguity,
                allow_gap=allow_gap,
            )
            for name in self.names
        ]

        # the column labels are the union of everything observed ...
        observed = set()
        for tally in per_seq:
            observed.update(tally.keys())

        # ... plus, unless excluded, the full k-mer alphabet
        if not exclude_unobserved:
            observed.update(self.moltype.alphabet.get_kmer_alphabet(motif_length))

        ordered = sorted(observed)
        if not ordered:
            return None

        rows = [tally.tolist(ordered) for tally in per_seq]
        return MotifCountsArray(rows, ordered, row_indices=self.names)
5221

5222
    def probs_per_seq(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        exclude_unobserved: bool = False,
        warn: bool = False,
    ) -> MotifFreqsArray:
        """Motif frequencies for each sequence.

        Parameters
        ----------
        motif_length
            number of characters per tuple.
        include_ambiguity
            if True, motifs containing ambiguous characters
            from the seq moltype are included. No expansion of those is attempted.
        allow_gap
            if True, motifs containing a gap character are included.
        exclude_unobserved
            if True, unobserved motif combinations are excluded.
        warn
            warns if motif_length > 1 and alignment trimmed to produce
            motif columns

        Returns
        -------
        The result of ``counts_per_seq`` converted with ``to_freq_array()``,
        or None when ``counts_per_seq`` returns None.
        """
        per_seq_counts = self.counts_per_seq(
            motif_length=motif_length,
            include_ambiguity=include_ambiguity,
            allow_gap=allow_gap,
            exclude_unobserved=exclude_unobserved,
            warn=warn,
        )
        if per_seq_counts is None:
            return None
        return per_seq_counts.to_freq_array()
5256

5257
    def entropy_per_seq(
        self,
        motif_length: int = 1,
        include_ambiguity: bool = False,
        allow_gap: bool = False,
        exclude_unobserved: bool = True,
        warn: bool = False,
    ) -> numpy.ndarray:
        """Shannon entropy for each sequence.

        Parameters
        ----------
        motif_length
            number of characters per tuple.
        include_ambiguity
            if True, motifs containing ambiguous characters
            from the seq moltype are included. No expansion of those is attempted.
        allow_gap
            if True, motifs containing a gap character are included.
        exclude_unobserved
            if True, unobserved motif combinations are excluded.
        warn
            warns if motif_length > 1 and alignment trimmed to produce
            motif columns

        Returns
        -------
        The value of ``entropy()`` called on the per-sequence frequencies, or
        None when ``probs_per_seq`` returns None.

        Notes
        -----
        For motif_length > 1, it's advisable to specify exclude_unobserved=True,
        this avoids unnecessary calculations.
        """
        per_seq_probs = self.probs_per_seq(
            motif_length=motif_length,
            include_ambiguity=include_ambiguity,
            allow_gap=allow_gap,
            exclude_unobserved=exclude_unobserved,
            warn=warn,
        )
        if per_seq_probs is None:
            return None
        return per_seq_probs.entropy()
5296

5297
    def count_gaps_per_pos(self, include_ambiguity: bool = True) -> DictArray:
        """Number of gaps at each alignment position, as a DictArray.

        Parameters
        ----------
        include_ambiguity
            if True, ambiguity characters that include the gap state are
            included

        Returns
        -------
        A DictArray keyed by position index (0 .. len(self) - 1).
        """
        # summing the gap array over axis 0 gives one total per column
        per_column = self.get_gap_array(include_ambiguity=include_ambiguity).sum(axis=0)
        return DictArrayTemplate(range(len(self))).wrap(per_column)
5311

5312
    def count_gaps_per_seq(
        self,
        induced_by: bool = False,
        unique: bool = False,
        include_ambiguity: bool = True,
        drawable: str | bool = False,
    ) -> DictArray:
        """return counts of gaps per sequence as a DictArray

        Parameters
        ----------
        induced_by
            a gapped column is considered to be induced by a seq if the seq
            has a non-gap character in that column.
        unique
            count is limited to gaps uniquely induced by each sequence.
            Takes precedence over induced_by.
        include_ambiguity
            if True, ambiguity characters that include the gap state are
            included
        drawable
            a plot type name, 'bar', 'box' or 'violin' (case-insensitive).
            The returned object is then bound to a drawable of that type.
            True is treated as 'bar'. A falsy value (the default) returns
            the plain counts.
        """
        gap_array = self.get_gap_array(include_ambiguity=include_ambiguity)
        gaps_per_col = gap_array.sum(axis=0)

        if unique:
            # columns where exactly one sequence has a non-gap character
            gap_cols = gaps_per_col == self.num_seqs - 1
        else:
            # columns containing at least one gap
            gap_cols = gaps_per_col > 0

        selected = gap_array[:, gap_cols]
        if unique or induced_by:
            # count the non-gap members of the selected columns, i.e. the
            # sequences responsible for the gaps in the others
            selected = selected == False  # noqa: E712

        result = DictArrayTemplate(self.names).wrap(selected.sum(axis=1))
        if not drawable:
            return result

        # only import the drawing machinery when it is actually needed
        from cogent3.draw.drawable import Drawable

        # the signature historically advertised a bool; accept True rather
        # than failing on ``True.lower()``
        plot_type = drawable.lower() if isinstance(drawable, str) else "bar"
        trace_name = pathlib.Path(self.source).name if self.source else None
        draw = Drawable("Gaps Per Sequence", showlegend=False)
        draw.layout |= {"yaxis": {"title": "Gap counts"}}
        if plot_type == "bar":
            trace = UnionDict(type="bar", y=result.array, x=self.names)
        else:
            trace = UnionDict(
                type=plot_type,
                y=result.array,
                text=self.names,
                name=trace_name,
            )

        draw.add_trace(trace)
        return draw.bound_to(result)
5373

5374
    def count_ambiguous_per_seq(self) -> DictArray:
        """Return the counts of ambiguous characters per sequence as a DictArray.

        A character is counted when its index in the most degenerate alphabet
        is greater than that alphabet's gap index.
        """
        degen_alpha = self.moltype.most_degen_alphabet()
        # row-wise total of entries lying beyond the gap index
        per_seq = (self.array_seqs > degen_alpha.gap_index).sum(axis=1)
        return DictArray.from_array_names(per_seq, self.names)
5382

5383
    def variable_positions(
        self,
        include_gap_motif: bool = True,
        include_ambiguity: bool = False,
        motif_length: int = 1,
    ) -> tuple[int, ...]:
        """Return a tuple of variable position indexes.

        Parameters
        ----------
        include_gap_motif
            if False, sequences with a gap motif in a column are ignored.
        include_ambiguity
            if True, all states are considered.
        motif_length
            if any position within a motif is variable, the entire motif is
            considered variable.

        Returns
        -------
        tuple of integers, if motif_length > 1, the returned positions are
        motif_length long sequential indices. An empty tuple is returned if
        the storage reports no variable positions.

        Notes
        -----
        Truncates alignment to be modulo motif_length.
        """
        align_len = len(self) // motif_length * motif_length
        sr = self._slice_record
        seqids = list(self.name_map.values())
        # NOTE(review): align_len is a length in the current view while
        # plus_stop comes from the slice record -- confirm the two are
        # comparable when the view does not start at position 0
        pos = self.storage.variable_positions(
            seqids,
            start=sr.plus_start,
            stop=min(sr.plus_stop, align_len),
            step=sr.plus_step,
        )
        if not pos.size:
            return ()

        alpha = self.storage.alphabet
        # when the alphabet has no gap / missing state, substitute an index
        # beyond the alphabet; explicit None checks so an index of 0 is kept
        gap_index = len(alpha) if alpha.gap_index is None else alpha.gap_index
        missing_index = (
            len(alpha) if alpha.missing_index is None else alpha.missing_index
        )

        if include_gap_motif and include_ambiguity:
            # allow all states
            func, kwargs = None, {}
        elif include_gap_motif and self.moltype.gapped_missing_alphabet:
            # allow canonical, gap, missing
            func = _var_pos_canonical_or_gap
            kwargs = {"gap_index": gap_index, "missing_index": missing_index}
        elif include_ambiguity and self.moltype.degen_alphabet:
            # anything but a gap
            func = _var_pos_not_gap
            kwargs = {"gap_index": gap_index}
        else:
            # canonical only
            func = _var_pos_canonical
            kwargs = {"gap_index": gap_index}

        indices = numpy.zeros(align_len, dtype=bool)
        if func is None:
            indices[pos] = True
        else:
            # re-examine only the candidate columns with the chosen rule
            array_seqs = self.storage.get_positions(seqids, pos)
            indices[pos] = func(array_seqs, **kwargs)

        if sr.is_reversed:
            # for reverse complement alignments
            # because we have a bool vector entire alignment length
            # just reversing the order is all we need to do since the
            # numpy.where statement will return positions in this new order
            indices = indices[::-1]

        if motif_length > 1:
            # a motif is variable if any of its columns is; expand that
            # verdict back out to every column of the motif
            indices = indices.reshape(-1, motif_length).any(axis=1).repeat(motif_length)

        return tuple(numpy.where(indices)[0].tolist())
5468

5469
    def omit_bad_seqs(self, quantile: OptFloat = None):
        """Returns new alignment without sequences with a number of uniquely
        introduced gaps exceeding quantile

        Uses count_gaps_per_seq(unique=True) to obtain the counts of gaps
        uniquely introduced by a sequence. The cutoff is the quantile of
        this distribution.

        Parameters
        ----------
        quantile
            sequences whose unique gap count is in a quantile larger than this
            cutoff are excluded. The default (None) is
            (num_seqs - 1) / num_seqs. A value of 0 is honoured and keeps
            only the sequences with the minimum count.
        """
        gap_counts = self.count_gaps_per_seq(unique=True)
        # test against None explicitly: 0 is a legitimate quantile and must
        # not be replaced by the default
        if quantile is None:
            quantile = (self.num_seqs - 1) / self.num_seqs
        cutoff = numpy.quantile(gap_counts.array, quantile)
        names = [name for name, count in gap_counts.items() if count <= cutoff]
        return self.take_seqs(names)
5488

5489
    def degap(self, storage_backend: str | None = None, **kwargs) -> SequenceCollection:
        """returns collection sequences without gaps or missing characters.

        Parameters
        ----------
        storage_backend
            name of the storage backend to use for the SeqsData object, defaults to
            cogent3 builtin.
        kwargs
            keyword arguments for the storage driver. Any ``annotation_db``
            or ``storage_backend`` entries are overridden by this method.

        Notes
        -----
        The annotation_db of this alignment is passed to the returned
        collection.
        """
        # because SequenceCollection does not track slice operations, we need
        # to apply any slice record to the underlying data
        sr = self._slice_record
        data, kw = self._seqs_data.get_ungapped(
            name_map=self._name_map,
            start=sr.plus_start,
            stop=sr.plus_stop,
            step=sr.plus_step,
        )
        # caller supplied values take precedence over those from the storage
        kwargs = kw | kwargs

        # the SeqsData classes will return the data corresponding to the slice,
        # however, will not complement the data if the step is negative. We do
        # this here.
        if sr.step < 0:
            rev_complement = self.moltype.rc
            data = {name: rev_complement(seq) for name, seq in data.items()}

        kwargs["annotation_db"] = self._annotation_db
        kwargs["storage_backend"] = storage_backend
        return make_unaligned_seqs(data, moltype=self.moltype, info=self.info, **kwargs)
5526

5527
    def get_degapped_relative_to(self, name: str) -> Self:
        """Drop every column in which the named sequence has a gap.

        Parameters
        ----------
        name
            sequence name

        Raises
        ------
        ValueError if name is not one of the alignment's sequence names.

        Notes
        -----
        The returned alignment will not retain an annotation_db if present.
        """
        names = self.names
        if name not in names:
            msg = f"Alignment missing sequence named {name!r}"
            raise ValueError(msg)

        degen_alpha = self.moltype.most_degen_alphabet()
        seq_arrays = self.array_seqs
        # True wherever the reference sequence is not a gap
        keep_cols = seq_arrays[names.index(name)] != degen_alpha.gap_index

        init_kwargs = self._get_init_kwargs()
        init_kwargs["seqs_data"] = self._seqs_data.from_names_and_array(
            names=self._name_map.values(),
            data=seq_arrays[:, keep_cols],
            alphabet=degen_alpha,
        )
        # the data is rebuilt, so neither the old annotations nor the old
        # slice record are carried over
        for stale in ("annotation_db", "slice_record"):
            init_kwargs.pop(stale, None)
        return self.__class__(**init_kwargs)
5559

5560
    def matching_ref(self, ref_name: str, gap_fraction: float, gap_run: int) -> Self:
        """Returns new alignment with seqs well aligned with a reference.

        Parameters
        ----------
        ref_name
            name of the sequence to use as the reference
        gap_fraction
            fraction of positions that either have a gap in the
            template but not in the seq or in the seq but not in the template
        gap_run
            number of consecutive gaps tolerated in query relative to
            sequence or sequence relative to query

        Returns
        -------
        The result of ``take_seqs_if`` using a filter built by
        ``make_gap_filter`` from the reference sequence.
        """
        reference = self.seqs[ref_name]
        return self.take_seqs_if(make_gap_filter(reference, gap_fraction, gap_run))
5577

5578
    def sliding_windows(
        self,
        window: int,
        step: int,
        start: OptInt = None,
        end: OptInt = None,
    ) -> typing.Generator[Self, None, None]:
        """Generator yielding new alignments of given length and interval.

        Parameters
        ----------
        window
            The length of each returned alignment.
        step
            The interval between the start of the successive windows.
        start
            first window start position, defaults to 0
        end
            exclusive upper bound on window start positions. Defaults to,
            and is never allowed to exceed, len(self) - window + 1 so that
            every window is full length.

        Notes
        -----
        Nothing is yielded if start is not less than the (clamped) end.
        """
        first = start or 0
        # largest exclusive bound for which each window still fits
        limit = len(self) - window + 1
        stop = limit if end is None else min(limit, end)
        if first >= stop:
            return

        for pos in range(first, stop, step):
            yield self[pos : pos + window]
5604

5605
    def gapped_by_map(self, keep: FeatureMap, **kwargs) -> Self:
        """Return a new alignment built by indexing every sequence with keep.

        Parameters
        ----------
        keep
            the map applied to each aligned sequence via ``seq[keep]``
        kwargs
            merged into the constructor arguments of the new alignment. Any
            ``seqs_data`` entry is replaced and any ``slice_record`` entry is
            discarded.

        Notes
        -----
        The annotation_db of this alignment is not passed on (unless one is
        given in kwargs).
        """
        # refactor: docstring
        # TODO: kath, not explicitly tested
        gapped = {
            self.name_map[seq.name]: numpy.array(seq[keep].gapped_seq)
            for seq in self.seqs
        }
        new_data = self._seqs_data.from_seqs(
            data=gapped,
            alphabet=self.moltype.most_degen_alphabet(),
        )

        init_kwargs = self._get_init_kwargs()
        init_kwargs.pop("annotation_db", None)
        init_kwargs.update(kwargs)
        init_kwargs["seqs_data"] = new_data
        init_kwargs.pop("slice_record", None)
        return self.__class__(**init_kwargs)
5623

5624
    def filtered(
        self,
        predicate: typing.Callable[[Self], bool],
        motif_length: int = 1,
        drop_remainder: bool = True,
        **kwargs,
    ) -> Self:
        """The alignment positions where predicate(column) is true.

        Parameters
        ----------
        predicate
            a callback function that is given a list with one motif_length
            slice per sequence and returns True/False
        motif_length
            length of the motifs the sequences should be split  into, eg. 3 for
            filtering aligned codons.
        drop_remainder
            If length is not modulo motif_length, allow dropping the terminal
            remaining columns
        kwargs
            accepted but not used by this method

        Raises
        ------
        ValueError if the length is not divisible by motif_length and
        drop_remainder is False.
        """
        # refactor: type hint for predicate
        total = len(self)
        leftover = total % motif_length
        if leftover and not drop_remainder:
            msg = f"aligned length not divisible by motif_length={motif_length}"
            raise ValueError(msg)

        usable = total - leftover
        keep = numpy.zeros(usable, dtype=bool)
        for begin in range(0, usable, motif_length):
            stop = begin + motif_length
            column = [seq[begin:stop] for seq in self.seqs]
            if predicate(column):
                # a motif is kept or dropped as a whole
                keep[begin:stop] = True

        return self.take_positions(numpy.where(keep)[0].tolist())
5663

5664
    def no_degenerates(self, motif_length: int = 1, allow_gap: bool = False) -> Self:
        """returns new alignment without degenerate characters

        Parameters
        ----------
        motif_length
            sequences are segmented into units of this size and the segments are
            excluded if they contain degenerate characters.
        allow_gap
            whether gaps are allowed or whether they are treated as a degenerate
            character (latter is default, as most evolutionary modelling treats
            gaps as N).

        Raises
        ------
        MolTypeError if the moltype has no degenerate alphabet.

        Notes
        -----
        The returned alignment does not carry over the annotation_db or the
        slice record. For motif_length > 1, trailing columns that do not form
        a complete motif are dropped.
        """
        moltype = self.moltype
        if moltype.degen_alphabet is None:
            msg = (
                f"Invalid MolType={moltype.label} (no degenerate characters), "
                "create the alignment using DNA, RNA or PROTEIN"
            )
            raise c3_moltype.MolTypeError(msg)

        by_position = self.array_positions
        # by design, char alphabets are organised such that the canonical
        # characters always occur first, followed by gap, then ambiguity
        # characters. so an index threshold separates what is acceptable:
        num_canonical = len(moltype)
        cutoff = num_canonical + 1 if allow_gap else num_canonical
        valid = (by_position < cutoff).all(axis=1)

        if motif_length > 1:
            num_motif, leftover = divmod(len(self), motif_length)
            if leftover:
                valid = valid[:-leftover]
                by_position = by_position[:-leftover]

            # a motif is only valid if all of its columns are
            motif_ok = valid.reshape(num_motif, motif_length).all(axis=1)
            valid = numpy.repeat(motif_ok, motif_length)

        init_kwargs = self._get_init_kwargs()
        init_kwargs["seqs_data"] = self._seqs_data.from_names_and_array(
            names=self._name_map.values(),
            data=by_position[valid].T,
            alphabet=moltype.most_degen_alphabet(),
        )
        for stale in ("annotation_db", "slice_record"):
            init_kwargs.pop(stale, None)
        return self.__class__(**init_kwargs)
5717

5718
    def _omit_gap_pos_single(
        self,
        gap_index: int,
        missing_index: OptInt,
        allowed_num: int,
    ) -> numpy.ndarray[bool]:
        """Evaluate ``_gap_ok_vector_single`` on every alignment column.

        Used for motif_length == 1. Returns a bool array of length len(self)
        holding the verdict for each column.
        """
        keep = numpy.empty(len(self), dtype=bool)
        for column_index, column in enumerate(self.array_seqs.T):
            keep[column_index] = _gap_ok_vector_single(
                column,
                gap_index,
                missing_index,
                allowed_num,
            )
        return keep
5734

5735
    def _omit_gap_pos_multi(
        self,
        gap_index: int,
        missing_index: OptInt,
        allowed_num: int,
        motif_length: int,
    ) -> numpy.ndarray[bool]:
        """Evaluate ``_gap_ok_vector_multi`` on every complete motif.

        Used for motif_length > 1. Returns a bool array whose length is the
        alignment length truncated to a multiple of motif_length; all columns
        of a motif share that motif's verdict.
        """
        usable = (len(self) // motif_length) * motif_length
        keep = numpy.empty(usable, dtype=bool)
        # fetch the column-major view once rather than on every iteration
        by_column = self.array_seqs.T
        # iterate over complete motifs only: a trailing partial motif has no
        # slot in ``keep`` so there is no point evaluating it
        for begin in range(0, usable, motif_length):
            stop = begin + motif_length
            keep[begin:stop] = _gap_ok_vector_multi(
                by_column[begin:stop].T,
                gap_index,
                missing_index,
                motif_length,
                allowed_num,
            )
        return keep
5756

5757
    def omit_gap_pos(
        self,
        allowed_gap_frac: float | None = None,
        motif_length: int = 1,
    ):
        """Returns new alignment where all cols (motifs) have <= allowed_gap_frac gaps.

        Parameters
        ----------
        allowed_gap_frac
            specifies proportion of gaps is allowed in each column. Set to 0 to
            exclude columns with any gaps, 1 to include all columns. Default is
            None which is equivalent to (num_seqs-1)/num_seqs and leads to
            elimination of columns that are only gaps.
        motif_length
            set's the "column" width, e.g. setting to 3 corresponds to codons.
            A motif that includes a gap at any position is included in the
            counting.

        Notes
        -----
        If the moltype's most degenerate alphabet has neither a gap nor a
        missing index, self is returned unchanged.
        """
        alpha = self.moltype.most_degen_alphabet()
        gap_index, missing_index = alpha.gap_index, alpha.missing_index
        if not (gap_index or missing_index):
            return self

        # the fraction is converted to a whole number of sequences
        if allowed_gap_frac is None:
            allowed_num = self.num_seqs - 1
        else:
            allowed_num = int(numpy.floor(self.num_seqs * allowed_gap_frac))

        # we are assuming gap and missing data have same value on other strand
        positions = self.array_positions
        if motif_length == 1:
            keep = self._omit_gap_pos_single(gap_index, missing_index, allowed_num)
        else:
            usable = motif_length * (len(positions) // motif_length)
            positions = positions[:usable]
            keep = self._omit_gap_pos_multi(
                gap_index,
                missing_index,
                allowed_num,
                motif_length,
            )

        init_kwargs = self._get_init_kwargs()
        init_kwargs["seqs_data"] = self._seqs_data.from_names_and_array(
            names=self._name_map.values(),
            data=positions[keep, :].T,
            alphabet=alpha,
        )
        # slice record now needs to be reset since we likely have disjoint positions
        init_kwargs.pop("slice_record", None)
        return self.__class__(**init_kwargs)
5812

5813
    def has_terminal_stop(
        self, gc: c3_genetic_code.GeneticCodeChoiceType = 1, strict: bool = False
    ) -> bool:
        """Returns True if any sequence has a terminal stop codon.

        Parameters
        ----------
        gc
            valid input to cogent3.get_code(), a genetic code object, number
            or name
        strict
            If True, raises an exception if a seq length not divisible by 3

        Notes
        -----
        Sequences are checked in the order of self.names and checking stops
        at the first one found to have a terminal stop.
        """
        return any(
            self.seqs[seq_name].seq.has_terminal_stop(gc=gc, strict=strict)
            for seq_name in self.names
        )
5831

5832
    def sample(
        self,
        *,
        n: int | None = None,
        with_replacement: bool = False,
        motif_length: int = 1,
        randint: typing.Callable[
            [int, int | None, int | None], numpy.ndarray
        ] = numpy.random.randint,
        permutation: typing.Callable[
            [numpy.ndarray], numpy.ndarray
        ] = numpy.random.permutation,
    ) -> Self:
        """Returns random sample of positions from self, e.g. to bootstrap.

        Parameters
        ----------
        n
            number of positions (motifs, if motif_length > 1) to sample. If
            None (or 0), all positions are sampled.
        with_replacement
            if True, samples with replacement.
        motif_length
            number of positions to sample as a single motif.
        randint
            random number generator, default is numpy.randint. Called as
            ``randint(0, population_size, n)``.
        permutation
            function to generate a random permutation of positions, default is
            numpy.permutation. Called as ``permutation(population_size)``.

        Raises
        ------
        ValueError if sampling without replacement and n exceeds the number
        of available positions (motifs).

        Notes
        -----
        By default (resampling all positions without replacement), generates
        a permutation of the positions of the alignment.

        Setting with_replacement to True and otherwise leaving parameters as
        defaults generates a standard bootstrap resampling of the alignment.
        """
        # refactor: type hint for randint, permutation
        population_size = len(self) // motif_length
        if not with_replacement and n and n > population_size:
            msg = f"cannot sample without replacement when {n=} > {population_size=}"
            raise ValueError(msg)

        n = n or population_size

        if with_replacement:
            locations = randint(0, population_size, n)
        else:
            locations = permutation(population_size)[:n]

        if motif_length == 1:
            positions = locations
        else:
            # scale each sampled motif index up to its first column, then
            # broadcast against the within-motif offsets to obtain every
            # column of every sampled motif, in sample order
            starts = numpy.asarray(locations, dtype=int) * motif_length
            positions = (starts[:, None] + numpy.arange(motif_length)).ravel()

        return self.take_positions(positions)
5900

5901
    def distance_matrix(
        self,
        calc: str = "pdist",
        drop_invalid: bool = False,
        parallel: bool = False,
    ):
        """Compute pairwise distances between the sequences.

        Parameters
        ----------
        calc
            name of a pairwise distance calculator, resolved through
            get_distance_calculator(). Presently only
            'pdist', 'jc69', 'tn93', 'hamming', 'paralinear' are supported.
        drop_invalid
            If True, sequences for which a pairwise distance could not be
            calculated are excluded. If False, an ArithmeticError is raised if
            a distance could not be computed on observed data.
        parallel
            forwarded unchanged to the distance calculator

        Raises
        ------
        ArithmeticError
            re-raised (chained to the original) with a hint to use
            drop_invalid=True when the calculator fails.
        """
        from cogent3.evolve.pairwise_distance_numba import get_distance_calculator

        calculator = get_distance_calculator(calc)
        try:
            # the calculator only raises on invalid values when we are NOT
            # going to drop them afterwards
            dists = calculator(
                self,
                invalid_raises=not drop_invalid,
                parallel=parallel,
            )
        except ArithmeticError as err:
            msg = "not all pairwise distances could be computed, try drop_invalid=True"
            raise ArithmeticError(msg) from err

        return dists.drop_invalid() if drop_invalid else dists
5932

5933
    def quick_tree(
        self,
        calc: str = "pdist",
        drop_invalid: bool = False,
        parallel: bool = False,
        use_hook: str | None = None,
    ) -> PhyloNode:
        """Build a phylogenetic tree from the pairwise distance matrix.

        Parameters
        ----------
        calc
            a pairwise distance calculator or name of one. For options see
            cogent3.evolve.fast_distance.available_distances
        drop_invalid
            If True, sequences for which a pairwise distance could not be
            calculated are excluded. If False, an ArithmeticError is raised if
            a distance could not be computed on observed data.
        parallel
            parallel execution of distance calculations
        use_hook
            name of a third-party package that implements the quick_tree
            hook. If not specified, defaults to the first available hook or
            the cogent3 quick_tree() app. To force default, set
            use_hook="cogent3".

        Returns
        -------
        a phylogenetic tree

        Notes
        -----
        Thin wrapper: calc, drop_invalid and parallel go to
        self.distance_matrix(); use_hook goes to the quick_tree() method of
        the resulting distance matrix.
        """
        distances = self.distance_matrix(
            calc=calc,
            drop_invalid=drop_invalid,
            parallel=parallel,
        )
        tree = distances.quick_tree(use_hook=use_hook)
        return tree
5969

5970
    def trim_stop_codons(
        self,
        gc: c3_genetic_code.GeneticCodeChoiceType = 1,
        strict: bool = False,
        **kwargs,
    ) -> Self:
        """Replace a terminal stop codon in each sequence with gap characters.

        Parameters
        ----------
        gc
            genetic code identifier; passed to has_terminal_stop() and
            resolved with c3_genetic_code.get_code()
        strict
            passed to has_terminal_stop()
        kwargs
            merged last into the constructor arguments of the returned
            instance, so they override the values taken from self

        Returns
        -------
        self, unchanged, when has_terminal_stop() reports no terminal stop.
        Otherwise a new instance of the same class in which, for every
        sequence ending in a stop codon (optionally followed by gap
        characters), the stop codon and those trailing characters are
        replaced by the same number of '-'.
        """
        # refactor: array
        if not self.has_terminal_stop(gc=gc, strict=strict):
            return self

        # regex: one of the stop codons, then any run of gap chars, at the end
        gc = c3_genetic_code.get_code(gc)
        gaps = "".join(self.moltype.gaps)
        terminal_stop = re.compile(f"({'|'.join(gc['*'])})[{gaps}]*$")

        def _mask_stop(seq: str) -> str:
            # length is preserved: the matched tail becomes all '-'
            found = terminal_stop.search(seq)
            if not found:
                return seq
            return terminal_stop.sub("-" * (len(seq) - found.start()), seq)

        # key the new data by the name_map value of each sequence name
        masked = {
            self.name_map[name]: _mask_stop(seq)
            for name, seq in self.to_dict().items()
        }

        seqs_data = self._seqs_data.from_seqs(
            data=masked,
            alphabet=self.moltype.most_degen_alphabet(),
        )
        init_kwargs = self._get_init_kwargs()
        init_kwargs["seqs_data"] = seqs_data
        # the new storage is not a slice of the old one
        init_kwargs.pop("slice_record", None)
        init_kwargs |= kwargs
        return self.__class__(**init_kwargs)
6004

6005
    @extend_docstring_from(SequenceCollection.get_translation)
    def get_translation(
        self,
        gc: c3_genetic_code.GeneticCodeChoiceType = 1,
        incomplete_ok: bool = False,
        include_stop: bool = False,
        trim_stop: bool = True,
        **kwargs,
    ) -> Self:
        # NOTE: no docstring here on purpose, the decorator supplies it
        if not self.moltype.is_nucleic:
            msg = f"moltype must be a DNA/RNA, not {self.moltype.name!r}"
            raise c3_moltype.MolTypeError(msg)

        # terminal stops are only masked when trimming is requested and the
        # stop codons are not meant to be kept in the translation
        if trim_stop and not include_stop:
            seqs = self.trim_stop_codons(gc=gc, strict=not incomplete_ok)
        else:
            seqs = self

        peptides = {}
        for label in seqs.names:
            gapped = seqs.get_gapped_seq(label)
            protein = gapped.get_translation(
                gc,
                incomplete_ok=incomplete_ok,
                include_stop=include_stop,
                trim_stop=trim_stop,
            )
            # stored under the name_map value for this sequence name
            peptides[self.name_map[label]] = numpy.array(protein)

        moltype_name = "protein_with_stop" if include_stop else "protein"
        pep_moltype = c3_moltype.get_moltype(moltype_name)
        seqs_data = self._seqs_data.from_seqs(
            data=peptides,
            alphabet=pep_moltype.most_degen_alphabet(),
            offset=None,
        )
        return self.__class__(
            seqs_data=seqs_data,
            moltype=pep_moltype,
            name_map=self._name_map,
            info=self.info,
            source=self.source,
            **kwargs,
        )
6050

6051
    def make_feature(
        self,
        *,
        feature: FeatureDataType,
        on_alignment: OptBool = None,
    ) -> Feature[Alignment]:
        """
        create a feature on named sequence, or on the alignment itself

        Parameters
        ----------
        feature
            a dict with all the necessary data to construct a feature.
            The 'seqid' key is optional.
        on_alignment
            the feature is in alignment coordinates, incompatible with setting
            'seqid'. If None, the value is taken from (and removed from)
            feature['on_alignment'] when present.

        Returns
        -------
        Feature

        Raises
        ------
        Whatever self.seqs raises when feature['seqid'] is not a known
        sequence (exact exception type depends on self.seqs).

        Notes
        -----
        To get a feature AND add it to annotation_db, use add_feature().

        The feature dict is modified in place ('on_alignment', 'strand' and
        'spans' are popped, 'seqid' and 'strand' are (re)assigned).
        """
        if on_alignment is None:
            on_alignment = feature.pop("on_alignment", None)

        # 'seqid' may legitimately be absent for alignment features, so use
        # .get() rather than indexing, which raised KeyError in that case
        seqid = feature.get("seqid")
        if not on_alignment and seqid:
            return self.seqs[seqid].make_feature(feature, self)

        feature["seqid"] = seqid
        # there's no sequence to bind to, the feature is directly on self
        revd = Strand.from_value(feature.pop("strand", None)) is Strand.MINUS
        feature["strand"] = Strand.MINUS.value if revd else Strand.PLUS.value
        fmap = FeatureMap.from_locations(
            locations=feature.pop("spans"),
            parent_length=len(self),
        )
        if revd:
            fmap = fmap.nucleic_reversed()
        return Feature(parent=self, map=fmap, **feature)
6098

6099
    def add_feature(
        self,
        *,
        biotype: str,
        name: str,
        spans: list[tuple[int, int]],
        seqid: OptStr = None,
        parent_id: OptStr = None,
        strand: str = "+",
        on_alignment: OptBool = None,
    ) -> Feature[Alignment]:
        """
        add feature on named sequence, or on the alignment itself

        Parameters
        ----------
        seqid
            sequence name, incompatible with on_alignment
        parent_id
            name of the parent feature
        biotype
            biological type, e.g. CDS
        name
            name of the feature
        spans
            plus strand coordinates of feature
        strand
            '+' (default) or '-'
        on_alignment
            the feature is in alignment coordinates, incompatible with setting
            seqid. Set to True if seqid not provided.

        Returns
        -------
        Feature

        Raises
        ------
        ValueError if define a seqid not on alignment or use seqid and
        on_alignment.
        """
        if on_alignment is None:
            # unspecified: an alignment feature exactly when no seqid given
            on_alignment = not seqid

        if seqid and on_alignment:
            msg = "seqid and on_alignment are incompatible"
            raise ValueError(msg)

        if seqid and seqid not in self.names:
            msg = f"unknown {seqid=}"
            raise ValueError(msg)

        # explicit record instead of locals() introspection, so that adding a
        # local variable later cannot leak into the stored feature
        feature = {
            "biotype": biotype,
            "name": name,
            "spans": spans,
            "seqid": seqid,
            "parent_id": parent_id,
            "strand": Strand.from_value(strand).value,
            "on_alignment": on_alignment,
        }
        # property ensures db is created
        self.annotation_db.add_feature(**feature)
        # these keys are stored in the db but are not passed on to make_feature
        for discard in ("on_alignment", "parent_id"):
            feature.pop(discard, None)
        return self.make_feature(feature=feature, on_alignment=on_alignment)
6160

6161
    def _get_seq_features(
        self,
        *,
        seqid: str | None = None,
        biotype: str | None = None,
        name: str | None = None,
        allow_partial: bool = False,
    ) -> Iterator[Feature[Alignment]]:
        """yields Feature instances bound to individual sequences

        Parameters
        ----------
        seqid
            limit search to features on this named sequence, defaults to search all
        biotype
            biotype of the feature, e.g. CDS, gene
        name
            name of the feature
        allow_partial
            allow features partially overlapping self

        Raises
        ------
        ValueError
            if seqid matches neither a sequence name nor a name_map value

        Notes
        -----
        Yields nothing if there is no annotation db.

        When dealing with a nucleic acid moltype, the returned features will
        yield a sequence segment that is consistently oriented irrespective
        of strand of the current instance.
        """
        if not self._annotation_db:
            return None

        # invert the name map: stored (parent) id -> current sequence name
        seqid_to_seqname = {
            parent: label for label, parent in self._name_map.items()
        }

        seqids = [seqid] if isinstance(seqid, str) else seqid
        if seqids is None:
            seqids = tuple(seqid_to_seqname)
        elif set(seqids) & set(self.names):
            # we've been given seq names, convert to parent names
            seqids = [self.seqs[label].parent_coordinates()[0] for label in seqids]
        elif not seqids or not set(seqids) <= seqid_to_seqname.keys():
            msg = f"unknown {seqid=}"
            raise ValueError(msg)

        for parent_name in seqids:
            aligned = self.seqs[seqid_to_seqname[parent_name]]
            # we use parent seqid
            parent_id, start, stop, _ = aligned.parent_coordinates(apply_offset=False)
            # we get the annotation offset from storage
            # because we need it to adjust the returned feature spans
            # to the alignment coordinates
            offset = self.storage.offset.get(parent_name, 0)

            matches = self.annotation_db.get_features_matching(
                seqid=parent_id,
                biotype=biotype,
                name=name,
                on_alignment=False,
                allow_partial=allow_partial,
                start=start + offset,
                stop=stop + offset,
            )
            for feature in matches:
                if offset:
                    shifted = numpy.array(feature["spans"]) - offset
                    feature["spans"] = shifted.tolist()
                # passing self only used when self is an Alignment
                yield aligned.make_feature(feature, self)
6226

6227
    def get_features(
        self,
        *,
        seqid: str | None = None,
        biotype: str | None = None,
        name: str | None = None,
        on_alignment: bool | None = None,
        allow_partial: bool = False,
    ) -> Iterator[Feature[Alignment]]:
        """yields Feature instances

        Parameters
        ----------
        seqid
            limit search to features on this named sequence, defaults to search all
        biotype
            biotype of the feature, e.g. CDS, gene
        name
            name of the feature
        on_alignment
            limit query to features on Alignment, ignores sequences. Ignored on
            SequenceCollection instances. If False, only sequence features
            are yielded; if None, sequence features then alignment features.
        allow_partial
            allow features partially overlapping self

        Notes
        -----
        Yields nothing if there is no annotation db or it is empty.

        When dealing with a nucleic acid moltype, the returned features will
        yield a sequence segment that is consistently oriented irrespective
        of strand of the current instance.
        """
        if not self._annotation_db or not len(self._annotation_db):
            return None

        # sequence-bound features are delegated; we only do on-alignment in here
        if not on_alignment:
            yield from self._get_seq_features(
                seqid=seqid,
                biotype=biotype,
                name=name,
                allow_partial=allow_partial,
            )

        if on_alignment == False:  # noqa
            return

        seq_map = None
        for feature in self.annotation_db.get_features_matching(
            biotype=biotype,
            name=name,
            on_alignment=on_alignment,
            allow_partial=allow_partial,
        ):
            if feature["seqid"]:
                # bound to a sequence, handled by _get_seq_features
                continue
            on_al = feature.pop("on_alignment", on_alignment)
            if seq_map is None:
                # computed once, on the first alignment feature
                seq_map = self.seqs[0].map.to_feature_map()
                *_, strand = self.seqs[0].seq.parent_coordinates()
            else:
                # NOTE(review): the first feature takes its strand from the
                # first sequence's parent coordinates, later ones from the
                # feature record itself -- confirm this asymmetry is intended
                strand = feature.pop("strand", None)

            spans = numpy.array(feature["spans"])
            spans = seq_map.relative_position(spans)
            feature["spans"] = spans.tolist()
            # and if i've been reversed...?
            feature["strand"] = Strand.from_value(strand).value
            yield self.make_feature(feature=feature, on_alignment=on_al)
6296

6297
    def get_projected_feature(self, *, seqid: str, feature: Feature) -> Feature:
        """projects an alignment feature onto the seqid sequence

        Parameters
        ----------
        seqid
            name of the sequence to project the feature onto
        feature
            a Feature, bound to self, that will be projected

        Returns
        -------
        a new Feature bound to seqid

        Raises
        ------
        ValueError
            if feature.parent is not this alignment

        Notes
        -----
        The alignment coordinates of feature are converted into the seqid
        sequence coordinates and the object is bound to that sequence.

        The record of the input feature (feature.to_dict()) is added to the
        annotation_db.
        """
        # looked up first, so an unknown seqid fails before anything else
        aligned = self.seqs[seqid]
        if feature.parent is self:
            projected = feature.remapped_to(aligned.seq, aligned.map)
            # property ensures db is created
            self.annotation_db.add_feature(**feature.to_dict())
            return projected

        msg = "Feature does not belong to this alignment"
        raise ValueError(msg)
6326

6327
    def get_projected_features(self, *, seqid: str, **kwargs) -> list[Feature]:
        """projects all features from other sequences onto seqid

        Parameters
        ----------
        seqid
            name of the sequence the features are projected onto
        kwargs
            passed to get_features() for every other sequence
        """
        # gather everything first: projecting adds records to the annotation
        # db, which get_features() reads
        collected = [
            feat
            for other in self.names
            if other != seqid
            for feat in self.get_features(seqid=other, **kwargs)
        ]
        projected = []
        for feat in collected:
            projected.append(self.get_projected_feature(seqid=seqid, feature=feat))
        return projected
6335

6336
    def get_drawables(
6✔
6337
        self,
6338
        *,
6339
        biotype: Optional[str, typing.Iterable[str]] = None,
6340
    ) -> dict:
6341
        """returns a dict of drawables, keyed by type
6342

6343
        Parameters
6344
        ----------
6345
        biotype
6346
            passed to get_features(biotype). Can be a single biotype or
6347
            series. Only features matching this will be included.
6348
        """
6349
        result = defaultdict(list)
6✔
6350
        for f in self.get_features(biotype=biotype, allow_partial=True):
6✔
6351
            result[f.biotype].append(f.get_drawable())
6✔
6352
        return result
6✔
6353

6354
    def get_drawable(
6✔
6355
        self,
6356
        *,
6357
        biotype: Optional[str, typing.Iterable[str]] = None,
6358
        width: int = 600,
6359
        vertical: int = False,
6360
        title: OptStr = None,
6361
    ):
6362
        """make a figure from sequence features
6363

6364
        Parameters
6365
        ----------
6366
        biotype
6367
            passed to get_features(biotype). Can be a single biotype or
6368
            series. Only features matching this will be included.
6369
        width
6370
            width in pixels
6371
        vertical
6372
            rotates the drawable
6373
        title
6374
            title for the plot
6375

6376
        Returns
6377
        -------
6378
        a Drawable instance
6379
        """
6380
        # TODO gah I think this needs to be modified to make row-blocks
6381
        # for each sequence in the alignment, or are we displaying the
6382
        # sequence name in the feature label?
6383
        from cogent3.draw.drawable import Drawable
6✔
6384

6385
        drawables = self.get_drawables(biotype=biotype)
6✔
6386
        if not drawables:
6✔
6387
            return None
6✔
6388
        # we order by tracks
6389
        top = 0
6✔
6390
        space = 0.25
6✔
6391
        annotes = []
6✔
6392
        for feature_type in drawables:
6✔
6393
            new_bottom = top + space
6✔
6394
            for i, annott in enumerate(drawables[feature_type]):
6✔
6395
                annott.shift(y=new_bottom - annott.bottom)
6✔
6396
                if i > 0:
6✔
6397
                    # refactor: design
6398
                    # modify the api on annott, we should not be using
6399
                    # a private attribute!
6400
                    annott._showlegend = False
6✔
6401
                annotes.append(annott)
6✔
6402

6403
            top = annott.top
6✔
6404

6405
        top += space
6✔
6406
        height = max((top / len(self)) * width, 300)
6✔
6407
        xaxis = {"range": [0, len(self)], "zeroline": False, "showline": True}
6✔
6408
        yaxis = {
6✔
6409
            "range": [0, top],
6410
            "visible": False,
6411
            "zeroline": True,
6412
            "showline": True,
6413
        }
6414

6415
        if vertical:
6✔
6416
            all_traces = [t.T.as_trace() for t in annotes]
6✔
6417
            width, height = height, width
6✔
6418
            xaxis, yaxis = yaxis, xaxis
6✔
6419
        else:
6420
            all_traces = [t.as_trace() for t in annotes]
6✔
6421

6422
        drawer = Drawable(title=title, traces=all_traces, width=width, height=height)
6✔
6423
        drawer.layout.update(xaxis=xaxis, yaxis=yaxis)
6✔
6424
        return drawer
6✔
6425

6426
    def seqlogo(
        self,
        width: float = 700,
        height: float = 100,
        wrap: OptInt = None,
        vspace: float = 0.005,
        colours: dict | None = None,
    ):
        """returns Drawable sequence logo using mutual information

        Parameters
        ----------
        width, height
            plot dimensions in pixels
        wrap
            number of alignment columns per row
        vspace
            vertical separation between rows, as a proportion of total plot
        colours
            mapping of characters to colours. If not provided, defaults to
            custom for everything except protein, which uses protein moltype
            colours.

        Notes
        -----
        Computes MI based on log2 and includes the gap state, so the maximum
        possible value is -log2(1/num_states)
        """
        assert 0 <= vspace <= 1, "vertical space must be in range 0, 1"
        # per-position frequencies, gaps counted as a state
        freqs = self.counts_per_pos(allow_gap=True).to_freq_array()

        wants_moltype_colours = colours is None and "protein" in self.moltype.label
        if wants_moltype_colours:
            colours = self.moltype._colors

        logo_kwargs = {
            "width": width,
            "height": height,
            "wrap": wrap,
            "vspace": vspace,
            "colours": colours,
        }
        return freqs.logo(**logo_kwargs)
6466

6467
    def coevolution(
        self,
        stat: str = "nmi",
        segments: list[tuple[int, int]] | None = None,
        drawable: OptStr = None,
        show_progress: bool = False,
        parallel: bool = False,
        par_kw: OptDict = None,
    ):
        """performs pairwise coevolution measurement

        Parameters
        ----------
        stat
            coevolution metric, defaults to 'nmi' (Normalized Mutual
            Information). Valid choices are 'rmi' (Resampled Mutual Information)
            and 'mi', mutual information. Case insensitive.
        segments
            coordinates of the form [(start, end), ...] where all possible
            pairs of alignment positions within and between segments are
            examined.
        drawable
            Result object is capable of plotting data specified type. str value
            must be one of plot type 'box', 'heatmap', 'violin'.
        show_progress
            shows a progress bar.
        parallel
            run in parallel, according to arguments in par_kw.
        par_kw
            dict of values for configuring parallel execution.

        Returns
        -------
        DictArray of results with lower-triangular values. Upper triangular
        elements and estimates that could not be computed for numerical reasons
        are set as nan. When drawable is set, the result is bound to the
        corresponding Drawable.
        """
        from cogent3.draw.drawable import AnnotatedDrawable, Drawable
        from cogent3.evolve import coevolution as coevo
        from cogent3.util.union_dict import UnionDict

        # refactor: design
        # These graphical representations of matrices should be separate functions
        # in the drawing submodule somewhere
        stat = stat.lower()
        if segments:
            # each (start, end) pair becomes the range of positions it spans
            segments = [range(*bounds) for bounds in segments]

        matrix = coevo.coevolution_matrix(
            alignment=self,
            stat=stat,
            positions=segments,
            show_progress=show_progress,
            parallel=parallel,
            par_kw=par_kw,
        )
        if drawable is None:
            return matrix

        trace_name = pathlib.Path(self.source).name if self.source else None

        if drawable in ("box", "violin"):
            # distribution plot of all matrix values
            distribution = UnionDict(
                type=drawable,
                y=matrix.array.flatten(),
                showlegend=False,
                name="",
            )
            figure = Drawable(
                width=500,
                height=500,
                title=trace_name,
                ytitle=stat.upper(),
            )
            figure.add_trace(distribution)
            return figure.bound_to(matrix)

        if not drawable:
            return matrix

        # any other truthy value produces a heatmap
        axis_title = "Alignment Position"
        axis_args = {
            "showticklabels": True,
            "mirror": True,
            "showgrid": False,
            "showline": True,
            "zeroline": False,
        }
        side = 500
        figure = Drawable(
            width=side,
            height=side,
            xtitle=axis_title,
            ytitle=axis_title,
            title=trace_name,
        )
        heatmap = UnionDict(
            type="heatmap",
            z=matrix.array,
            colorbar={"title": {"text": stat.upper(), "font": {"size": 16}}},
        )
        figure.add_trace(heatmap)
        figure.layout.xaxis.update(axis_args)
        figure.layout.yaxis.update(axis_args)

        # feature tracks are optional decoration
        try:
            bottom = self.get_drawable()
            left = self.get_drawable(vertical=True)
        except AttributeError:
            bottom = False

        if bottom and drawable != "box":
            # widen the figure so the legend fits beside the heatmap
            xlim = 1.2
            figure.layout.width = side * xlim
            figure = AnnotatedDrawable(
                figure,
                left_track=left,
                bottom_track=bottom,
                xtitle=axis_title,
                ytitle=axis_title,
                xrange=[0, len(self)],
                yrange=[0, len(self)],
                layout={"legend": {"x": xlim, "y": 1}},
            )

        return figure.bound_to(matrix)
6594

6595
    def information_plot(
        self,
        width: OptInt = None,
        height: OptInt = None,
        window: OptInt = None,
        stat: str = "median",
        include_gap: bool = True,
    ):
        """plot information per position

        Parameters
        ----------
        width
            figure width in pixels
        height
            figure height in pixels
        window
            used for smoothing line, defaults to sqrt(length)
        stat
            'mean' or 'median', used as the summary statistic for each window
        include_gap
            whether to include gap counts, shown on right y-axis

        Raises
        ------
        ValueError
            if stat is not 'mean' or 'median'
        """
        from cogent3.draw.drawable import AnnotatedDrawable, Drawable

        # refactor: design
        # These graphical representations of matrices should be separate functions
        # in the drawing submodule somewhere
        window = window or numpy.sqrt(len(self))
        window = int(window)
        y = self.entropy_per_pos()
        nan_indices = numpy.isnan(y)
        if nan_indices.all():  # assuming 1D array
            y.fill(0.0)
            # the mask must be refreshed, otherwise no position is selected
            # below and max() of an empty array raises ValueError
            nan_indices = numpy.isnan(y)
        max_entropy = y[~nan_indices].max()
        y = max_entropy - y  # convert to information
        # now make all nan's 0
        y[nan_indices] = 0
        stats = {"mean": numpy.mean, "median": numpy.median}
        if stat not in stats:
            msg = 'stat must be either "mean" or "median"'
            raise ValueError(msg)
        calc_stat = stats[stat]
        # summarise sliding windows of the per-position values
        num = len(y) - window
        v = [calc_stat(y[i : i + window]) for i in range(num)]
        x = numpy.arange(num)
        x += window // 2  # shift x-coordinates to middle of window
        trace_line = UnionDict(
            type="scatter",
            x=x,
            y=v,
            mode="lines",
            name=f"smoothed {stat}",
            line={"shape": "spline", "smoothing": 1.3},
        )
        trace_marks = UnionDict(
            type="scatter",
            x=numpy.arange(y.shape[0]),
            y=y,
            mode="markers",
            opacity=0.5,
            name="per position",
        )
        layout = UnionDict(
            title="Information per position",
            width=width,
            height=height,
            showlegend=True,
            yaxis={"range": [0, max(y) * 1.2], "showgrid": False},
            xaxis={
                "showgrid": False,
                "range": [0, len(self)],
                "mirror": True,
                "showline": True,
            },
        )

        traces = [trace_marks, trace_line]
        if include_gap:
            # smoothed gap counts, drawn against a second (right) y-axis
            gap_counts = self.count_gaps_per_pos()
            y = [calc_stat(gap_counts[i : i + window]) for i in range(num)]
            trace_g = UnionDict(
                type="scatter",
                x=x,
                y=y,
                yaxis="y2",
                name="Gaps",
                mode="lines",
                line={"shape": "spline", "smoothing": 1.3},
            )
            traces += [trace_g]
            layout.yaxis2 = {
                "title": "Count",
                "side": "right",
                "overlaying": "y",
                "range": [0, max(gap_counts) * 1.2],
                "showgrid": False,
                "showline": True,
            }

        draw = Drawable(
            title="Information per position",
            xtitle="Position",
            ytitle=f"Information (window={window})",
        )
        draw.traces.extend(traces)
        draw.layout |= layout
        draw.layout.legend = {"x": 1.1, "y": 1}

        # a feature track is added underneath when features are available
        try:
            drawable = self.get_drawable()
        except AttributeError:
            drawable = False

        if drawable:
            draw = AnnotatedDrawable(
                draw,
                bottom_track=drawable,
                xtitle="position",
                ytitle=f"Information (window={window})",
                layout=layout,
            )

        return draw
6719

6720
    def to_phylip(self) -> str:
        """Return the alignment as a PHYLIP formatted string.

        Notes
        -----
        Formatting is delegated to the "phylip" sequence format writer
        plugin. An exception is raised if sequences do not all have the
        same length.
        """
        writer = cogent3._plugin.get_seq_format_writer_plugin(  # noqa: SLF001
            format_name="phylip",
        )
        return writer.formatted(self)
6✔
6730

6731
    def to_pretty(
6✔
6732
        self,
6733
        name_order: list[str] | None = None,
6734
        wrap: int | None = None,
6735
    ) -> str:
6736
        """returns a string representation of the alignment in pretty print format
6737

6738
        Parameters
6739
        ----------
6740
        name_order
6741
            order of names for display.
6742
        wrap
6743
            maximum number of printed bases
6744
        """
6745
        names, output = self._get_raw_pretty(name_order=name_order)
6✔
6746
        label_width = max(list(map(len, names)))
6✔
6747
        name_template = f"{{:>{label_width}}}"
6✔
6748
        display_names = {n: name_template.format(n) for n in names}
6✔
6749

6750
        def make_line(label, seq) -> str:
6✔
6751
            return f"{label}    {seq}"
6✔
6752

6753
        if wrap is None:
6✔
6754
            result = [make_line(display_names[n], "".join(output[n])) for n in names]
6✔
6755
            return "\n".join(result)
6✔
6756

6757
        align_length = len(self)
6✔
6758
        result = []
6✔
6759
        for start in range(0, align_length, wrap):
6✔
6760
            for n in names:
6✔
6761
                result.append(
6✔
6762
                    make_line(
6763
                        display_names[n],
6764
                        "".join(output[n][start : start + wrap]),
6765
                    ),
6766
                )
6767

6768
            result.append("")
6✔
6769

6770
        if result and not result[-1]:
6✔
6771
            del result[-1]
6✔
6772

6773
        return "\n".join(result)
6✔
6774

6775
    def to_html(
6✔
6776
        self,
6777
        name_order: typing.Sequence[str] | None = None,
6778
        wrap: int = 60,
6779
        limit: int | None = None,
6780
        ref_name: str = "longest",
6781
        colors: Mapping[str, str] | None = None,
6782
        font_size: int = 12,
6783
        font_family: str = "Lucida Console",
6784
    ) -> str:
6785
        """returns html with embedded styles for sequence colouring
6786

6787
        Parameters
6788
        ----------
6789
        name_order
6790
            order of names for display.
6791
        wrap
6792
            number of alignment columns per row
6793
        limit
6794
            truncate alignment to this length
6795
        ref_name
6796
            Name of an existing sequence or 'longest'. If the latter, the
6797
            longest sequence (excluding gaps and ambiguities) is selected as the
6798
            reference.
6799
        colors
6800
            {character
6801
            moltype.
6802
        font_size
6803
            in points. Affects labels and sequence and line spacing
6804
            (proportional to value)
6805
        font_family
6806
            string denoting font family
6807

6808
        Examples
6809
        --------
6810

6811
        In a jupyter notebook, this code is used to provide the representation.
6812

6813
        .. code-block:: python
6814

6815
            aln  # is rendered by jupyter
6816

6817
        You can directly use the result for display in a notebook as
6818

6819
        .. code-block:: python
6820

6821
            from IPython.core.display import HTML
6822

6823
            HTML(aln.to_html())
6824
        """
6825
        css, styles = self.moltype.get_css_style(
6✔
6826
            colors=colors,
6827
            font_size=font_size,
6828
            font_family=font_family,
6829
        )
6830
        if name_order:
6✔
6831
            selected = self.take_seqs(name_order)
×
6832
            name_order = list(name_order)
×
6833
        else:
6834
            name_order = list(self.names)
6✔
6835
            ref_name = ref_name or "longest"
6✔
6836
            selected = self
6✔
6837

6838
        if ref_name == "longest":
6✔
6839
            lengths = selected.get_lengths(include_ambiguity=False, allow_gap=False)
6✔
6840

6841
            length_names = defaultdict(list)
6✔
6842
            for n, l in lengths.items():
6✔
6843
                length_names[l].append(n)
6✔
6844

6845
            longest = max(length_names)
6✔
6846
            ref = sorted(length_names[longest])[0]
6✔
6847

6848
        elif ref_name:
6✔
6849
            if ref_name not in selected.names:
6✔
6850
                msg = f"Unknown sequence name {ref_name}"
×
6851
                raise ValueError(msg)
×
6852
            ref = ref_name
6✔
6853

6854
        name_order.remove(ref)
6✔
6855
        name_order.insert(0, ref)
6✔
6856

6857
        if limit is None:
6✔
6858
            names, output = selected._get_raw_pretty(name_order)
6✔
6859
        else:
6860
            names, output = selected[:limit]._get_raw_pretty(name_order)
×
6861

6862
        refname = names[0]
6✔
6863
        refseq = output[refname]
6✔
6864
        seqlen = len(refseq)
6✔
6865

6866
        if selected.moltype.gaps:
6✔
6867
            gaps = "".join(selected.moltype.gaps)
6✔
6868
            start_gap = re.search(f"^[{gaps}]+", "".join(refseq))
6✔
6869
            end_gap = re.search(f"[{gaps}]+$", "".join(refseq))
6✔
6870
            start = 0 if start_gap is None else start_gap.end()
6✔
6871
            end = seqlen if end_gap is None else end_gap.start()
6✔
6872
        else:
6873
            start = 0
6✔
6874
            end = seqlen
6✔
6875

6876
        seq_style = []
6✔
6877
        template = '<span class="%s">%%s</span>'
6✔
6878
        styled_seqs = defaultdict(list)
6✔
6879
        for i in range(seqlen):
6✔
6880
            char = refseq[i]
6✔
6881
            if i < start or i >= end:
6✔
6882
                style = f"terminal_ambig_{selected.moltype.label}"
6✔
6883
            else:
6884
                style = styles[char]
6✔
6885

6886
            seq_style.append(template % style)
6✔
6887
            styled_seqs[refname].append(seq_style[-1] % char)
6✔
6888

6889
        for name in names:
6✔
6890
            if name == refname:
6✔
6891
                continue
6✔
6892

6893
            seq = []
6✔
6894
            for i, c in enumerate(output[name]):
6✔
6895
                if c == ".":
6✔
6896
                    s = seq_style[i] % c
6✔
6897
                else:
6898
                    s = template % (styles[c])
6✔
6899
                    s = s % c
6✔
6900
                seq.append(s)
6✔
6901

6902
            styled_seqs[name] = seq
6✔
6903

6904
        # make a html table
6905
        seqs = numpy.array([styled_seqs[n] for n in names], dtype="O")
6✔
6906
        table = ["<table>"]
6✔
6907
        seq_ = "<td>%s</td>"
6✔
6908
        label_ = '<td class="label">%s</td>'
6✔
6909
        num_row_ = '<tr class="num_row"><td></td><td><b>{:,d}</b></td></tr>'
6✔
6910
        for i in range(0, seqlen, wrap):
6✔
6911
            table.append(num_row_.format(i))
6✔
6912
            seqblock = seqs[:, i : i + wrap].tolist()
6✔
6913
            for n, s in zip(names, seqblock, strict=False):
6✔
6914
                s = "".join(s)
6✔
6915
                row = "".join([label_ % n, seq_ % s])
6✔
6916
                table.append(f"<tr>{row}</tr>")
6✔
6917
        table.append("</table>")
6✔
6918
        if (limit and limit < len(selected)) or (
6✔
6919
            name_order and len(name_order) < len(selected.names)
6920
        ):
6921
            summary = (
×
6922
                f"{self.num_seqs} x {len(self)} (truncated to "
6923
                f"{len(name_order) if name_order else len(selected.names)} x "
6924
                f"{limit or len(selected)}) {selected.moltype.label} alignment"
6925
            )
6926
        else:
6927
            summary = (
6✔
6928
                f"{self.num_seqs} x {len(self)} {selected.moltype.label} alignment"
6929
            )
6930

6931
        text = [
6✔
6932
            "<style>",
6933
            ".c3align table {margin: 10px 0;}",
6934
            ".c3align td { border: none !important; text-align: left !important; }",
6935
            ".c3align tr:not(.num_row) td span {margin: 0 2px;}",
6936
            ".c3align tr:nth-child(even) {background: #f7f7f7;}",
6937
            ".c3align .num_row {background-color:rgba(161, 195, 209, 0.5) !important; border-top: solid 1px black; }",
6938
            ".c3align .label { font-size: %dpt ; text-align: right !important; "
6939
            "color: black !important; padding: 0 4px; display: table-cell !important; "
6940
            "font-weight: normal !important; }" % font_size,
6941
            "\n".join([f".c3align {style}" for style in css]),
6942
            "</style>",
6943
            '<div class="c3align">',
6944
            "\n".join(table),
6945
            f"<p><i>{summary}</i></p>",
6946
            "</div>",
6947
        ]
6948
        return "\n".join(text)
6✔
6949

6950
    def _get_raw_pretty(self, name_order):
6✔
6951
        """returns dict {name: seq, ...} for pretty print"""
6952
        if name_order is not None:
6✔
6953
            assert set(name_order) <= set(self.names), "names don't match"
6✔
6954

6955
        output = defaultdict(list)
6✔
6956
        names = name_order or self.names
6✔
6957
        num_seqs = len(names)
6✔
6958

6959
        seqs = [str(self.seqs[name]) for name in names]
6✔
6960
        positions = list(zip(*seqs, strict=False))
6✔
6961

6962
        for position in positions:
6✔
6963
            ref = position[0]
6✔
6964
            output[names[0]].append(ref)
6✔
6965
            for seq_num in range(1, num_seqs):
6✔
6966
                val = "." if position[seq_num] == ref else position[seq_num]
6✔
6967
                output[names[seq_num]].append(val)
6✔
6968

6969
        return names, output
6✔
6970

6971
    def _repr_html_(self) -> str:
        """returns the html representation used by jupyter

        Notes
        -----
        Display settings come from a copy of self._repr_policy, updated by
        any values obtained from the COGENT3_ALIGNMENT_REPR_POLICY
        environment setting.
        """
        policy = self._repr_policy.copy()
        policy.update(
            get_setting_from_environ(
                "COGENT3_ALIGNMENT_REPR_POLICY",
                {"num_seqs": int, "num_pos": int, "wrap": int, "ref_name": str},
            ),
        )
        shown_names = self.names[: policy["num_seqs"]]
        return self.to_html(
            name_order=shown_names,
            ref_name=policy["ref_name"],
            limit=policy["num_pos"],
            wrap=policy["wrap"],
        )
6984

6985
    def _mapped(self, slicemap) -> Self:
        """returns a new instance with every sequence sliced by slicemap

        Notes
        -----
        The sliced sequences and their gap maps are used to build new
        underlying sequence data. The slice record and annotation db of
        self are not passed to the new instance.
        """
        seqs, maps = {}, {}
        for aligned in self.seqs:
            sliced_seq, indel_map = aligned.slice_with_map(slicemap)
            parent = self.name_map[aligned.name]
            seqs[parent], maps[parent] = sliced_seq, indel_map

        init_kwargs_data = self._seqs_data.from_seqs_and_gaps(
            seqs=seqs,
            gaps=maps,
            alphabet=self.moltype.most_degen_alphabet(),
        )
        init_kwargs = self._get_init_kwargs()
        init_kwargs["seqs_data"] = init_kwargs_data
        for dropped in ("slice_record", "annotation_db"):
            init_kwargs.pop(dropped, None)
        return self.__class__(**init_kwargs)
6✔
7004

7005
    def apply_scaled_gaps(
        self,
        other: SequenceCollection,
        aa_to_codon: bool = True,
    ) -> Self:
        """applies gaps in self to ungapped sequences

        Parameters
        ----------
        other
            collection of ungapped sequences (the destination) with the
            same names as self
        aa_to_codon
            if True, gap coordinates of self are scaled by 3 and other must
            have a nucleic acid moltype while self must not. If False, gap
            coordinates are scaled by 1/3 and self must have a nucleic acid
            moltype while other must not.

        Raises
        ------
        ValueError
            if the moltypes of self and other are inconsistent with
            aa_to_codon, or the length of a destination sequence is not the
            scaled length of the corresponding sequence in self
        """
        assert set(other.names) == set(self.names), "Names must match"
        if aa_to_codon and not all(
            (not self.moltype.is_nucleic, other.moltype.is_nucleic),
        ):
            # the destination is other, as for the aa_to_codon=False message
            msg = f"aa_to_codon True requires nucleic moltype destination not {other.moltype.name!r}"
            raise ValueError(
                msg,
            )
        if not aa_to_codon and not all(
            (self.moltype.is_nucleic, not other.moltype.is_nucleic),
        ):
            msg = f"aa_to_codon False requires protein moltype destination not {other.moltype.name!r}"
            raise ValueError(
                msg,
            )

        assert aa_to_codon is not None
        gaps = {}
        seqs = {}
        scale = 3 if aa_to_codon else 1 / 3

        for seq in self.seqs:
            # the underlying data is keyed by the parent name
            parent_name = self._name_map[seq.name]
            gaps[parent_name] = seq.map.scaled(scale).array
            ungapped = numpy.array(other.seqs[seq.name])
            seqs[parent_name] = ungapped
            seq_len = self._seqs_data.get_seq_length(parent_name)
            if len(ungapped) != int(seq_len * scale):
                msg = f"Destination sequence for {seq.name!r} != {scale:.2f} x {seq_len} sequence"
                raise ValueError(
                    msg,
                )

        seq_data = self._seqs_data.from_seqs_and_gaps(
            seqs=seqs,
            gaps=gaps,
            alphabet=other.moltype.most_degen_alphabet(),
        )
        init_args = self._get_init_kwargs()
        init_args.pop("slice_record")  # slice is realised
        init_args["moltype"] = other.moltype
        init_args["seqs_data"] = seq_data
        init_args["annotation_db"] = other.annotation_db
        return self.__class__(**init_args)
6✔
7055

7056
    def copy(self, copy_annotations: bool = False) -> Self:
        """returns a new instance built from the init arguments of self

        Parameters
        ----------
        copy_annotations
            if True, the new instance gets a deep copy of the annotation
            db, otherwise the init arguments are used unchanged
        """
        init_kwargs = self._get_init_kwargs()
        if copy_annotations:
            init_kwargs.update(annotation_db=copy.deepcopy(self._annotation_db))
        return self.__class__(**init_kwargs)
6✔
7062

7063
    def deepcopy(self, **kwargs) -> Self:
        """returns deep copy of self

        Notes
        -----
        Reduced to sliced sequences in self, kwargs are ignored.
        Annotation db is not copied if the alignment has been sliced.
        """
        import copy

        init_kwargs = self._get_init_kwargs()
        init_kwargs.pop("seqs_data")
        # the annotation db is only retained when self spans the full length
        # of the underlying data
        if len(self) == self._seqs_data.align_len:
            init_kwargs["annotation_db"] = copy.deepcopy(self._annotation_db)
        else:
            init_kwargs["annotation_db"] = None

        # extract just the gapped data covered by the current slice
        sliced = {}
        for parent_name in self._name_map.values():
            sliced[parent_name] = self._seqs_data.get_gapped_seq_array(
                seqid=parent_name,
                start=self._slice_record.plus_start,
                stop=self._slice_record.plus_stop,
                step=self._slice_record.plus_step,
            )

        realised = self._seqs_data.from_seqs(
            data=sliced,
            alphabet=self.moltype.most_degen_alphabet(),
        )
        init_kwargs["seqs_data"] = realised

        # a reversed alignment remains reversed, over the new data
        slice_record = None
        if self._slice_record.is_reversed:
            slice_record = c3_sequence.SliceRecord(
                parent_len=realised.align_len,
                step=-1,
            )
        init_kwargs["slice_record"] = slice_record
        return self.__class__(**init_kwargs)
6✔
7100

7101
    def with_masked_annotations(
        self,
        biotypes: PySeqStr,
        mask_char: str = "?",
        shadow: bool = False,
        seqid: str | None = None,
    ) -> Self:
        """returns an alignment with regions replaced by mask_char

        Parameters
        ----------
        biotypes
            annotation type(s)
        mask_char
            must be a character valid for the moltype. The default value is
            the most ambiguous character, eg. '?' for DNA
        shadow
            If True, masks everything but the biotypes
        seqid
            name of sequence to mask, defaults to all
        """
        # Converting a sequence to a numpy array applies the slice_record,
        # producing data that differs from the underlying storage. So each
        # gapped sequence is split into its gaps and its (possibly masked)
        # ungapped sequence, new storage is built from those and the
        # current slice_record is discarded.
        indel_arrays, raw_seqs = {}, {}
        for aligned in self.seqs:
            parent = self._name_map[aligned.name]
            indel_arrays[parent] = aligned.map.array
            target = aligned.seq
            if seqid is None or aligned.name == seqid:
                target = target.with_masked_annotations(biotypes, mask_char, shadow)
            raw_seqs[parent] = numpy.array(target)

        masked_data = self._seqs_data.from_seqs_and_gaps(
            seqs=raw_seqs,
            gaps=indel_arrays,
            alphabet=self.moltype.most_degen_alphabet(),
        )
        init_kwargs = self._get_init_kwargs()
        # we have to drop the slice record since we have changed the data
        init_kwargs.update(seqs_data=masked_data, slice_record=None)
        return self.__class__(**init_kwargs)
6✔
7148

7149
    def to_rich_dict(self) -> dict[str, str | dict[str, str]]:
        """returns a json serialisable dict

        Notes
        -----
        The result has keys "init_args", "type", "version" and "seqs". The
        sequences are stored as strings keyed by their parent names. The
        slice record, annotation db, offset and sequence storage are not
        included in "init_args".
        """
        init_args = self._get_init_kwargs()
        init_args.pop("slice_record")  # slice is realised
        init_args["moltype"] = self.moltype.label
        # we dont serialise the annotation db, which also means there is no
        # need for offset
        for unserialised in ("annotation_db", "offset"):
            init_args.pop(unserialised, None)
        init_args.pop("seqs_data")

        seqs = {}
        for aligned in self.seqs:
            seqs[self._name_map[aligned.name]] = str(aligned)

        rich_dict = {"init_args": init_args}
        rich_dict["type"] = get_object_provenance(self)
        rich_dict["version"] = __version__
        rich_dict["seqs"] = seqs
        return rich_dict
7168

7169
    @classmethod
    def from_rich_dict(cls, data: dict[str, str | dict[str, str]]) -> Alignment:
        """returns an alignment constructed from a rich dict

        Parameters
        ----------
        data
            dict with a "seqs" entry and an "init_args" entry, the latter
            being passed as keyword arguments to make_aligned_seqs

        Notes
        -----
        Any "annotation_db" entry in init_args is ignored. data is not
        modified.
        """
        # filter into a new dict so the caller's data is left untouched
        init_args = {
            key: value
            for key, value in data["init_args"].items()
            if key != "annotation_db"
        }
        return make_aligned_seqs(data["seqs"], **init_args)
6✔
7173

7174
    def is_ragged(self) -> bool:
        """always False, since an Alignment cannot be ragged by definition"""
        return False
6✔
7177

7178
    def strand_symmetry(self, motif_length: int = 1) -> dict[str, TestResult]:
        """returns {name: strand symmetry test result} for each ungapped seq

        Parameters
        ----------
        motif_length
            passed to the strand_symmetry() method of each ungapped sequence
        """
        results = {}
        for aligned in self.seqs:
            results[aligned.name] = aligned.seq.strand_symmetry(
                motif_length=motif_length,
            )
        return results
7183

7184
    def duplicated_seqs(self) -> list[list[str]]:
        """returns the names of duplicated sequences

        Notes
        -----
        The gapped sequence is used. In a zero-length alignment with two
        or more sequences, all sequences are reported as one group of
        duplicates.
        """
        if len(self):
            return super().duplicated_seqs()

        # all have zero lengths
        if self.num_seqs < 2:
            return []
        return [list(self.names)]
6✔
7196

7197
    def to_dict(self, as_array: bool = False) -> dict[str, str | numpy.ndarray]:
        """Return a dictionary of sequences.

        Parameters
        ----------
        as_array
            if True, sequences are returned as numpy arrays, otherwise as strings

        Notes
        -----
        Rows of self.array_seqs are matched to self.names by position.
        String values are produced by the from_indices() method of the
        storage alphabet.
        """
        rows = self.array_seqs
        result = {}
        for index, name in enumerate(self.names):
            row = rows[index]
            result[name] = row if as_array else self.storage.alphabet.from_indices(row)
        return result
7213

7214

7215
@register_deserialiser(
    get_object_provenance(Alignment),
    "cogent3.core.alignment.Alignment",
    "cogent3.core.c3_alignment.Alignment",
)
def deserialise_alignment(data: dict[str, str | dict[str, str]]) -> Alignment:
    """deserialise a rich dict as an Alignment

    Notes
    -----
    Data with an "init_args" entry is handled by Alignment.from_rich_dict().
    Anything else is treated as a serialised old type Alignment.
    """
    if "init_args" in data:
        return Alignment.from_rich_dict(data)
    # old type Alignment class
    return deserialise_alignment_to_new_type_alignment(data)
6✔
7225

7226

7227
@register_deserialiser("cogent3.core.alignment.ArrayAlignment")
def deserialise_array_align_to_new_type_alignment(
    data: dict[str, str | dict[str, str]],
) -> Alignment:
    """deserialise old type ArrayAlignment as a new type alignment.

    Notes
    -----
    A "bytes" moltype is converted to "text". If present, "source" is
    removed from the info data and passed on separately. A sequence record
    is either the sequence string itself or a nested rich dict holding it
    under ["init_args"]["seq"].
    """
    moltype_name = data["moltype"]
    if moltype_name == "bytes":
        moltype_name = "text"

    info_data = data["info"]
    source = None
    if info_data:
        source = info_data.pop("source", None)

    seq_data = {}
    for seqid, record in data["seqs"].items():
        seq = record["seq"]
        seq_data[seqid] = seq if isinstance(seq, str) else seq["init_args"]["seq"]

    return make_aligned_seqs(
        seq_data,
        moltype=moltype_name,
        info=info_data,
        source=source,
    )
7249

7250

7251
def deserialise_alignment_to_new_type_alignment(
    data: dict[str, str | dict[str, str]],
) -> Alignment:
    """deserialise old type Alignment as a new type alignment

    Parameters
    ----------
    data
        rich dict of an old type Alignment, with "moltype", "info" and
        "seqs" entries. Each value in "seqs" has a "seq_init" entry, holding
        the ungapped sequence under ["seq"]["init_args"]["seq"], and a
        "map_init" entry from which the gaps are deserialised.

    Notes
    -----
    If present, "source" is removed from the info data and passed on
    separately.
    """
    from cogent3 import get_moltype
    from cogent3.core.location import deserialise_indelmap

    moltype_name = data["moltype"]
    mt = get_moltype(moltype_name)
    alpha = mt.most_degen_alphabet()
    info_data = data["info"]
    # as for the ArrayAlignment deserialiser, the info can be None or empty
    source = info_data.pop("source", None) if info_data else None

    seqs = {}
    gaps = {}
    for name, record in data["seqs"].items():
        seq_data = record["seq_init"]
        minit = record["map_init"]
        imap = deserialise_indelmap(minit)
        raw_seq = seq_data["seq"]["init_args"]["seq"]
        seqs[name] = raw_seq
        gaps[name] = imap.array

    asd = AlignedSeqsData.from_seqs_and_gaps(seqs=seqs, gaps=gaps, alphabet=alpha)
    return make_aligned_seqs(asd, moltype=moltype_name, info=info_data, source=source)
6✔
7276

7277

7278
def make_aligned_storage(
    data: dict[str, bytes | numpy.ndarray[int]],
    *,
    moltype: MolTypes,
    label_to_name: OptRenamerCallable = None,
    offset: DictStrInt | None = None,
    reversed_seqs: set[str] | None = None,
    storage_backend: str | None = None,
    **kwargs,
) -> AlignedSeqsDataABC:
    """makes the aligned storage instance for Alignment class

    Parameters
    ----------
    data
        {seq name: sequence, ...}
    moltype
        label for looking up the molecular type
    label_to_name
        renamer function
    offset
        mapping of {name: annotation offset}. If not provided, the value
        obtained while preparing data is used.
    reversed_seqs
        name of sequences that are reverse complemented relative to their
        parent. If not provided, the value obtained while preparing data
        is used.
    storage_backend
        name of a third-party storage driver to provide storage functionality
    kwargs
        additional keyword arguments for the storage driver, these take
        precedence over the arguments assembled by this function

    Notes
    -----
    This function is intended for use primarily by make_aligned_seqs function.
    """
    namer = _SeqNamer(name_func=label_to_name)
    moltype = c3_moltype.get_moltype(moltype)
    alphabet = moltype.most_degen_alphabet()
    seqs_data, found_offset, found_reversed = prep_for_seqs_data(
        data,
        moltype,
        namer,
    )
    driver_kwargs = dict(
        alphabet=alphabet,
        offset=offset or found_offset,
        reversed_seqs=reversed_seqs or found_reversed,
        data=seqs_data,
    )
    driver_kwargs.update(kwargs)
    # plugin module is private only to exclude users, not developers
    driver = cogent3._plugin.get_aligned_storage_driver(storage_backend)  # noqa: SLF001
    return driver.from_seqs(**driver_kwargs)
6✔
7326

7327

7328
@singledispatch
def make_aligned_seqs(
    data: dict[str, StrORBytesORArray] | list | AlignedSeqsDataABC,
    *,
    moltype: MolTypes,
    label_to_name: OptRenamerCallable = None,
    info: OptDict = None,
    source: OptPathType = None,
    annotation_db: SupportsFeatures | None = None,
    offset: DictStrInt | None = None,
    name_map: DictStrStr | None = None,
    is_reversed: OptBool = None,
    reversed_seqs: set[str] | None = None,
    storage_backend: str | None = None,
    **kwargs,
) -> Alignment:
    """Initialise an aligned collection of sequences.

    Parameters
    ----------
    data
        sequence data, an AlignedSeqsData, a dict {name: seq, ...}, an iterable of sequences
    moltype
        string representation of the moltype, e.g., 'dna', 'protein'.
    label_to_name
        function for converting original names into other names.
    info
        a dict from which to make an info object
    source
        origins of this data, defaults to 'unknown'. Converted to a string
        and added to info["source"].
    annotation_db
        annotation database to attach to the collection
    offset
        a dict mapping names to annotation offsets
    name_map
        a dict mapping sequence names to "parent" sequence names. The parent
        name will be used for querying a annotation_db.
    is_reversed
        entire collection has been reverse complemented
    reversed_seqs
        set of names that are on the reverse strand of the parent sequence
    storage_backend
        name of the storage backend to use for the SeqsData object, defaults to
        cogent3 builtin.
    kwargs
        keyword arguments for the AlignedSeqsData constructor

    Raises
    ------
    ValueError
        if data is empty

    Notes
    -----
    If no annotation_db is provided, but the sequences are annotated, an
    annotation_db is created by merging any annotation db's found in the sequences.
    If the sequences are annotated AND an annotation_db is provided, only the
    annotation_db is used.
    """
    if not len(data):
        msg = "data must be at least one sequence."
        raise ValueError(msg)

    moltype = c3_moltype.get_moltype(moltype)
    annotation_db = kwargs.pop("annotation_db", annotation_db)
    if not annotation_db:
        annotation_db = merged_db_collection(data)

    # if we have Sequences, we need to construct the name map before we construct
    # the AlignedSeqsData object - however, if a name_map is provided, we assume that it
    # corrects for any naming differences in data and skip this step
    namer = _SeqNamer(name_func=label_to_name)
    data = _make_name_seq_mapping(data, namer)
    if name_map is None:
        name_map = make_name_map(data) or None

    storage = make_aligned_storage(
        data,
        moltype=moltype,
        label_to_name=label_to_name,
        offset=offset,
        reversed_seqs=reversed_seqs,
        storage_backend=storage_backend,
        **kwargs,
    )
    # offset, label_to_name, reversed_seqs and storage_backend were handled
    # above, when making the storage, so they are not passed on
    return make_aligned_seqs(
        storage,
        moltype=moltype,
        info=info,
        annotation_db=annotation_db,
        source=source,
        name_map=name_map,
        is_reversed=is_reversed,
    )
7423

7424

7425
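# NOTE: this remark concerns the numba-jitted ``_var_pos_*`` functions defined
# further down, not the ``make_aligned_seqs`` overload that immediately follows.
# The alternate variants of those functions are explicitly encoded so that
# numba can cache the byte-compiled functions.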
7427
@make_aligned_seqs.register
def _(
    data: AlignedSeqsDataABC,
    *,
    moltype: MolTypes,
    label_to_name: OptRenamerCallable = None,
    info: OptDict = None,
    source: OptPathType = None,
    annotation_db: SupportsFeatures | None = None,
    offset: DictStrInt | None = None,
    name_map: DictStrStr | None = None,
    is_reversed: OptBool = None,
    **kwargs,
) -> Alignment:
    """makes an Alignment from an existing aligned storage instance

    Parameters
    ----------
    data
        the aligned sequence storage
    moltype
        the moltype, must be compatible with the alphabet of data
    label_to_name
        if provided, used to rename the sequences of the new alignment
    info
        a dict from which to make an info object, anything that is not a
        dict is replaced by an empty dict
    source
        origins of this data. If not provided, it is taken (and removed)
        from info["source"], defaulting to 'unknown'.
    annotation_db
        annotation database to attach to the alignment
    offset
        not supported for this type of data
    name_map
        a dict mapping sequence names to "parent" sequence names
    is_reversed
        entire alignment has been reverse complemented
    kwargs
        ignored

    Raises
    ------
    ValueError
        if moltype is not compatible with the alphabet of data, or if
        offset is provided
    """
    moltype = c3_moltype.get_moltype(moltype)
    if not moltype.is_compatible_alphabet(data.alphabet):
        # a single message, so str() of the exception is a readable sentence
        msg = (
            f"Provided moltype: {moltype.label} is not compatible with "
            f"AlignedSeqsData alphabet: {data.alphabet}"
        )
        raise ValueError(msg)

    # we cannot set offset when creating from an AlignedSeqsData
    if offset:
        msg = f"Setting offset is not supported for {data=}"
        raise ValueError(msg)

    info = info if isinstance(info, dict) else {}
    source = str(source) if source else str(info.pop("source", "unknown"))
    # a reversed alignment is represented by a slice record with step -1
    sr = (
        c3_sequence.SliceRecord(parent_len=data.align_len, step=-1)
        if is_reversed
        else None
    )
    aln = Alignment(
        seqs_data=data,
        moltype=moltype,
        info=info,
        source=source,
        annotation_db=annotation_db,
        name_map=name_map,
        slice_record=sr,
    )
    if label_to_name:
        aln = aln.rename_seqs(label_to_name)
    return aln
6✔
7475

7476

7477
@numba.jit(cache=True)
def _var_pos_canonical_or_gap(
    arr: numpy.ndarray,
    gap_index: int,
    missing_index: int,
) -> numpy.ndarray:  # pragma: no cover
    """return boolean array indicating the variable columns

    Only states that are <= gap_index, or equal to missing_index, are
    considered. A column is variable if it has at least two different such
    states.

    Parameters
    ----------
    arr
        a 2D array with rows being sequences and columns positions
    gap_index
        value of gap state
    missing_index
        value of missing state

    Returns
    -------
    a boolean array
    """
    # relying on consistent ordering of gap as num canonical + 1
    num_seqs, num_pos = arr.shape
    variable = numpy.zeros(num_pos, dtype=numpy.bool_)
    for col in range(num_pos):
        # -1 indicates no state of interest has been seen in this column
        first = -1
        for row in range(num_seqs):
            state = arr[row, col]
            if state > gap_index and state != missing_index:
                continue

            if first == -1:
                first = state
            elif state != first:
                variable[col] = True
                break

    return variable
7513

7514

7515
@numba.jit(cache=True)
def _var_pos_canonical(
    arr: numpy.ndarray,
    gap_index: int,
) -> numpy.ndarray:  # pragma: no cover
    """return boolean array indicating the variable columns

    Only states that are < gap_index are considered. A column is variable
    if it has at least two different such states.

    Parameters
    ----------
    arr
        a 2D array with rows being sequences and columns positions
    gap_index
        value of gap state

    Returns
    -------
    a boolean array
    """
    # relying on consistent ordering of gap as num canonical + 1
    num_seqs, num_pos = arr.shape
    variable = numpy.zeros(num_pos, dtype=numpy.bool_)
    for col in range(num_pos):
        # -1 indicates no state of interest has been seen in this column
        first = -1
        for row in range(num_seqs):
            state = arr[row, col]
            if state >= gap_index:
                continue

            if first == -1:
                first = state
            elif state != first:
                variable[col] = True
                break

    return variable
7548

7549

7550
@numba.jit(cache=True)
def _var_pos_not_gap(
    arr: numpy.ndarray,
    gap_index: int,
) -> numpy.ndarray:  # pragma: no cover
    """return boolean array flagging columns with more than one distinct
    non-gap state

    Parameters
    ----------
    arr
        a 2D array with rows being sequences and columns positions
    gap_index
        value of gap state

    Returns
    -------
    a boolean array with one element per column, True where at least two
    different states other than gap_index occur in that column

    Notes
    -----
    Only the state equal to gap_index is skipped, every other value is
    compared. -1 is used as the "nothing seen yet" marker, so states are
    assumed to be non-negative.
    """
    m, n = arr.shape
    result = numpy.zeros(n, dtype=numpy.bool_)
    # range() avoids allocating an index array for every column
    for pos in range(n):
        last = -1
        for seq in range(m):
            state = arr[seq, pos]
            if state != gap_index:
                if last == -1:
                    # first non-gap state seen in this column
                    last = state
                elif state != last:
                    # a second distinct state makes the column variable,
                    # no need to inspect the remaining sequences
                    result[pos] = True
                    break

    return result
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc