9412281846

Committed 07 Jun 2024 02:10AM UTC coverage: 90.379%. First build

Build # 9412281846

Build Type

push

github

Committed by

web-flow

Commit Message

Merge pull request #1890 from KatherineCaley/seq-collections-refactor

SequenceCollection refactor - general and annotation db methods

Run Details

191 of 265 new or added lines in 3 files covered. (72.08%)

31788 of 35172 relevant lines covered (90.38%)

10.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.55

/src/cogent3/core/new_moltype.py

import dataclasses
import functools
import itertools
import typing

from collections import defaultdict
from string import ascii_letters

import numpy

from cogent3.core import new_alphabet, new_sequence


# Type aliases used by the annotations in this module.
# a string, or None
OptStr = typing.Optional[str]
# a callable, or None
OptCallable = typing.Optional[typing.Callable]
# a list or tuple of strings
SeqStrType = typing.Union[list[str], tuple[str, ...]]
# text given either as str or as bytes
StrORBytes = typing.Union[str, bytes]
# sequence data given as str, bytes or a numpy array
StrORBytesORArray = typing.Union[str, bytes, numpy.ndarray]
# a list or tuple whose elements are str or bytes
SeqStrBytesType = typing.Union[list[StrORBytes], tuple[StrORBytes, ...]]
# sequence data given as str or as a numpy array
StrORArray = typing.Union[str, numpy.ndarray]

# default characters for an alignment gap and for a missing state
IUPAC_gap = "-"

IUPAC_missing = "?"

# canonical DNA nucleotides, in the order used to build the DNA monomers
IUPAC_DNA_chars = ("T", "C", "A", "G")
# degenerate DNA symbol -> set of canonical nucleotides it can stand for
IUPAC_DNA_ambiguities = {
    "N": frozenset("ACTG"),
    "R": frozenset("AG"),
    "Y": frozenset("CT"),
    "W": frozenset("AT"),
    "S": frozenset("CG"),
    "K": frozenset("TG"),
    "M": frozenset("CA"),
    "B": frozenset("CTG"),
    "D": frozenset("ATG"),
    "H": frozenset("ACT"),
    "V": frozenset("ACG"),
}
# Complement of every DNA symbol: canonical bases, gap, ambiguity codes and
# missing. The two strings are aligned position by position (symbol above,
# its complement below).
# "X" is not technically an IUPAC ambiguity, but is used by repeatmasker.
IUPAC_DNA_ambiguities_complements = dict(
    zip(
        "ACGT-MKNRYWSXVBHD?",
        "TGCA-KMNYRWSXBVDH?",
    )
)

# complements restricted to the canonical bases plus the gap
IUPAC_DNA_complements = {
    symbol: IUPAC_DNA_ambiguities_complements[symbol] for symbol in "ACGT-"
}
# Standard DNA pairing: only Watson-Crick pairs count as pairs
DNA_STANDARD_PAIRS = dict.fromkeys((frozenset("AT"), frozenset("CG")), True)

# note change in standard order from DNA
IUPAC_RNA_chars = list("UCAG")
# degenerate RNA symbol -> set of canonical nucleotides it can stand for
IUPAC_RNA_ambiguities = {
    "N": frozenset("ACUG"),
    "R": frozenset("AG"),
    "Y": frozenset("CU"),
    "W": frozenset("AU"),
    "S": frozenset("CG"),
    "K": frozenset("UG"),
    "M": frozenset("CA"),
    "B": frozenset("CUG"),
    "D": frozenset("AUG"),
    "H": frozenset("ACU"),
    "V": frozenset("ACG"),
}

# Complement of every RNA symbol: canonical bases, gap, ambiguity codes and
# missing. The two strings are aligned position by position (symbol above,
# its complement below).
# "X" is not technically an IUPAC ambiguity, but is used by repeatmasker.
IUPAC_RNA_ambiguities_complements = dict(
    zip(
        "ACGU-MKNRYWSXVBHD?",
        "UGCA-KMNYRWSXBVDH?",
    )
)

# complements restricted to the canonical bases plus the gap
IUPAC_RNA_complements = {
    symbol: IUPAC_RNA_ambiguities_complements[symbol] for symbol in "ACGU-"
}

# All pairing tables below are keyed by frozenset, so a pair is unordered:
# frozenset(("A", "U")) and frozenset(("U", "A")) are the same key. Each pair
# is therefore listed exactly once. The value is True when the two symbols
# 'always' pair and False when they only 'sometimes' (weakly) pair.

# Standard RNA pairing: GU pairs count as 'weak' pairs
RNA_STANDARD_PAIRS = {
    frozenset(("A", "U")): True,
    frozenset(("C", "G")): True,
    frozenset(("G", "U")): False,
}

# Watson-Crick RNA pairing only: GU pairs don't count as pairs
RNA_W_C_PAIRS = {
    frozenset(("A", "U")): True,
    frozenset(("C", "G")): True,
}

# RNA pairing with GU counted as standard pairs
RNA_G_U_PAIRS = {
    frozenset(("A", "U")): True,
    frozenset(("C", "G")): True,
    frozenset(("G", "U")): True,
}

# RNA pairing with GU, AA, GA, CA and UU mismatches allowed as weak pairs.
# A symbol paired with itself collapses to a one-element frozenset.
RNA_EXTENDED_PAIRS = {
    frozenset({"A", "U"}): True,
    frozenset({"C", "G"}): True,
    frozenset({"G", "U"}): False,
    frozenset({"A"}): False,
    frozenset({"A", "G"}): False,
    frozenset({"A", "C"}): False,
    frozenset({"U"}): False,
}


def make_pairs(
    *,
    pairs: typing.Optional[dict[frozenset[str], bool]] = None,
    monomers: typing.Optional[tuple[str, ...]] = None,
    gaps: typing.Optional[str] = None,
    degenerates: typing.Optional[dict[str, typing.Iterable[str]]] = None,
) -> dict[frozenset[str], bool]:
    """Makes a dict of symbol pairs frozenset((i, j)) -> strictness.

    Expands pairs into all possible pairs using degen symbols.
    Strictness is True if i and j always pair, and False if they 'weakly' pair
    (e.g. GU pairs or if it is possible that they pair).

    Parameters
    ----------
    pairs
        base pairing rules, keyed by the frozenset of the two symbols. The
        keys are unordered, so each pair only needs to be given once.
    monomers
        the non-degenerate symbols to combine with the degenerate ones
    gaps
        gap characters; every gap/gap combination is recorded as a weak pair
    degenerates
        maps each degenerate symbol to the symbols it can represent

    Returns
    -------
    A new dict; ``pairs`` is not modified. Entries derived from gaps or
    degenerate symbols are always False and replace an entry in ``pairs``
    that has the same key.

    Notes
    -----
    If you want to make GU pairs count as 'always matching', pass in pairs
    that map frozenset(("G", "U")) to True rather than False.
    """
    pairs = pairs or {}
    monomers = monomers or ()
    gaps = gaps or ()
    degenerates = degenerates or {}

    # start from a copy so the caller's dict is left untouched
    result = dict(pairs)
    result.update({frozenset((i, j)): False for i in gaps for j in gaps})

    # a monomer weakly pairs with a degenerate symbol if it pairs with at
    # least one of the symbols the degenerate can represent
    for b, d in itertools.product(monomers, degenerates):
        if any(frozenset((b, e)) in pairs for e in degenerates[d]):
            result[frozenset((b, d))] = False

    # two degenerate symbols (possibly the same one) weakly pair if any
    # combination of the symbols they represent pairs
    for d1, d2 in itertools.combinations_with_replacement(degenerates, 2):
        if any(
            frozenset((e1, e2)) in pairs
            for e1 in degenerates[d1]
            for e2 in degenerates[d2]
        ):
            result[frozenset((d1, d2))] = False

    return result


# RNA_PAIRING_RULES is a dict of {name: (base_pairs, degen_pairs)} where
# base_pairs is the dict of pairing rules defined above and degen_pairs is the
# dict returned by make_pairs for those rules.
# NOTE(review): make_pairs is called with ``pairs`` only (no monomers, gaps or
# degenerates), so degen_pairs is currently a copy of base_pairs with no
# degenerate expansion -- confirm whether that is intended.
def _build_pairing_rules() -> (
    dict[str, tuple[dict[frozenset[str], bool], dict[frozenset[str], bool]]]
):
    """Return the named RNA pairing rules as (base_pairs, degen_pairs) tuples."""
    base_rules = {
        "Standard": RNA_STANDARD_PAIRS,
        "WC": RNA_W_C_PAIRS,
        "GU": RNA_G_U_PAIRS,
        "Extended": RNA_EXTENDED_PAIRS,
    }
    return {
        rule_name: (base_pairs, make_pairs(pairs=base_pairs))
        for rule_name, base_pairs in base_rules.items()
    }


RNA_PAIRING_RULES = _build_pairing_rules()

# protein letters & ambiguity codes
# one-letter code -> full name of the residue ("*" denotes a stop)
IUPAC_PROTEIN_code_aa = {
    "A": "Alanine",
    "C": "Cysteine",
    "D": "Aspartic Acid",
    "E": "Glutamic Acid",
    "F": "Phenylalanine",
    "G": "Glycine",
    "H": "Histidine",
    "I": "Isoleucine",
    "K": "Lysine",
    "L": "Leucine",
    "M": "Methionine",
    "N": "Asparagine",
    "P": "Proline",
    "Q": "Glutamine",
    "R": "Arginine",
    "S": "Serine",
    "T": "Threonine",
    "V": "Valine",
    "W": "Tryptophan",
    "Y": "Tyrosine",
    "*": "STOP",
}

# one-letter protein states, in alphabetical order (includes "U")
IUPAC_PROTEIN_chars = tuple("ACDEFGHIKLMNPQRSTUVWY")

# the same states followed by the stop character
PROTEIN_WITH_STOP_chars = (*IUPAC_PROTEIN_chars, "*")

# ambiguity code -> the states it can stand for; "X" matches any state
IUPAC_PROTEIN_ambiguities = {
    "B": list("ND"),
    "X": IUPAC_PROTEIN_chars,
    "Z": list("QE"),
}

PROTEIN_WITH_STOP_ambiguities = {
    "B": list("ND"),
    "X": PROTEIN_WITH_STOP_chars,
    "Z": list("QE"),
}

# styling for moltype display


def _expand_colors(base, colors):
    """Return a copy of base with one colour entry added per character.

    Each key of colors is a string of one or more characters; every
    character in that string is mapped to the key's colour. base itself
    is not modified.
    """
    expanded = base.copy()
    for chars, clr in colors.items():
        for ch in chars:
            expanded[ch] = clr
    return expanded


class _DefaultValue:
    """Callable that returns the value it was created with.

    Instances are passed to defaultdict as the default factory, so that any
    missing key yields the stored value.
    """

    # NOTE(review): presumably a class rather than a lambda so the resulting
    # defaultdicts can be pickled -- confirm.
    def __init__(self, value):
        self.value = value

    def __call__(self):
        return self.value


# any character without an explicit colour is displayed in gray
_gray = _DefaultValue("gray")
_base_colors = defaultdict(_gray)

# nucleotide colours; T and U share a colour (multi-character keys are
# expanded to one entry per character by _expand_colors)
NT_COLORS = _expand_colors(
    _base_colors,
    {"A": "#FF0102", "C": "black", "G": "green", "TU": "blue"},
)

# amino acid colours, one colour per group of residues
AA_COLORS = _expand_colors(
    _base_colors,
    dict(
        GAVLI="#009999",
        FYW="#ff6600",
        CM="orange",
        ST="#009900",
        KRH="#FF0102",
        DE="blue",
        NQ="#993300",
        P="#cc0099",
    ),
)


@dataclasses.dataclass
class MolType:
    """A molecular type: a named set of monomers and its derived alphabets.

    Parameters
    ----------
    name
        name of the moltype, also available as ``label``
    monomers
        the canonical (non-degenerate, non-gap) characters, as str or bytes
    make_seq
        callable used by ``make_seq()`` to build a sequence; it is called
        with ``moltype``, ``seq`` and ``name`` keyword arguments
    gap
        the gap character, or None for no gapped alphabets
    missing
        the missing-data character, or None
    complements
        maps every character of the degenerate gapped alphabet to its
        complement; when given, ``complement()`` and ``rc()`` are usable
    ambiguities
        maps each degenerate character to the characters it represents
    colors
        maps characters to CSS colours; defaults to black for everything
    pairing_rules
        pairing rules for the moltype; stored as given
    """

    name: str
    monomers: dataclasses.InitVar[StrORBytes]
    # NOTE(review): a method named ``make_seq`` is defined further down in the
    # class body; the value given here is stored on ``self._make_seq`` by
    # __post_init__.
    make_seq: dataclasses.InitVar[typing.Type]
    gap: OptStr = IUPAC_gap
    missing: OptStr = IUPAC_missing
    complements: dataclasses.InitVar[typing.Optional[dict[str, str]]] = None
    ambiguities: typing.Optional[dict[str, tuple[str, ...]]] = None
    colors: dataclasses.InitVar[typing.Optional[dict[str, str]]] = None
    pairing_rules: typing.Optional[dict[str, dict[frozenset[str], bool]]] = None

    # private attributes to be delivered via properties
    _monomers: new_alphabet.CharAlphabet = dataclasses.field(init=False)
    _gapped: new_alphabet.CharAlphabet = dataclasses.field(init=False)
    _degen: new_alphabet.CharAlphabet = dataclasses.field(init=False)
    _degen_gapped: new_alphabet.CharAlphabet = dataclasses.field(init=False)
    _colors: dict[str, str] = dataclasses.field(init=False)

    # how to connect this to the sequence constructor and avoid
    # circular imports
    _make_seq: typing.Callable = dataclasses.field(init=False)
    _complement: OptCallable = dataclasses.field(init=False, default=None)

    def __post_init__(
        self,
        monomers: StrORBytes,
        make_seq: typing.Type,
        complements: typing.Optional[dict[str, str]],
        colors: typing.Optional[dict[str, str]],
    ):
        """Build the alphabets and, if complements are given, the complementer."""
        self._colors = colors or defaultdict(_DefaultValue("black"))
        self._make_seq = make_seq
        # gap, missing and ambiguity characters are coerced so they can be
        # concatenated with monomers
        gap = new_alphabet._coerce_to_type(monomers, self.gap or "")
        missing = new_alphabet._coerce_to_type(monomers, self.missing or "")
        ambigs = new_alphabet._coerce_to_type(monomers, "".join(self.ambiguities or ""))

        self._monomers = new_alphabet.make_alphabet(
            chars=monomers, gap=None, moltype=self
        )
        # the remaining alphabets are None when the characters they need
        # were not provided
        self._degen = (
            new_alphabet.make_alphabet(chars=monomers + ambigs, gap=None, moltype=self)
            if ambigs
            else None
        )
        self._gapped = (
            new_alphabet.make_alphabet(chars=monomers + gap, gap=self.gap, moltype=self)
            if gap
            else None
        )
        # character order is monomers, gap, ambiguity codes, missing; the
        # index comparisons in is_degenerate() and get_degenerate_positions()
        # rely on this order
        self._degen_gapped = (
            new_alphabet.make_alphabet(
                chars=monomers + gap + ambigs + missing, gap=self.gap, moltype=self
            )
            if ambigs and gap
            else None
        )
        if complements:
            # assume we have a nucleic acid moltype
            dest = "".join(complements[c] for c in self.degen_gapped_alphabet)
            self._complement = new_alphabet.convert_alphabet(
                self.degen_gapped_alphabet.as_bytes(),
                dest.encode("utf8"),
            )

    def __repr__(self):
        name = self.__class__.__name__
        return f"{name}({self.alphabet})"

    # instances are compared and hashed by identity, not by field values
    def __hash__(self):
        return id(self)

    def __eq__(self, other):
        return self is other

    def __len__(self) -> int:
        """number of monomers"""
        return len(self._monomers)

    def __iter__(self):
        """iterates over the monomers"""
        yield from self._monomers

    @property
    def label(self):
        """synonym for name"""
        return self.name

    @property
    def alphabet(self):
        """monomers"""
        return self._monomers

    @property
    def degen_alphabet(self):
        """monomers + ambiguous characters, None if there are no ambiguities"""
        return self._degen

    @property
    def gapped_alphabet(self):
        """monomers + gap, None if there is no gap character"""
        return self._gapped

    @property
    def degen_gapped_alphabet(self):
        """monomers + gap + ambiguous characters + missing character

        None unless both a gap character and ambiguities are defined.
        """
        return self._degen_gapped

    def is_valid(self, seq: StrORArray) -> bool:
        """checks against most degenerate alphabet"""
        alpha = next(
            alpha
            for alpha in (self._degen_gapped, self._degen, self._gapped, self._monomers)
            if alpha
        )
        return alpha.is_valid(seq)

    def iter_alphabets(self):
        """yield the different defined alphabets"""
        alphas = (self._monomers, self._gapped, self._degen, self._degen_gapped)
        yield from (a for a in alphas if a)

    def is_compatible_alphabet(
        self, alphabet: new_alphabet.CharAlphabet, strict: bool = True
    ) -> bool:
        """checks that characters in alphabet are equal to a bound alphabet

        Parameters
        ----------
        alphabet
            an Alphabet instance
        strict
            the order of elements must match
        """
        if not strict:
            query = set(alphabet)
            return any(set(alpha) == query for alpha in self.iter_alphabets())

        return any(alpha == alphabet for alpha in self.iter_alphabets())

    def make_seq(self, *, seq: str, name: OptStr = None, check_seq=True, **kwargs):
        """creates a sequence of this moltype

        Parameters
        ----------
        seq
            the sequence data
        name
            name of the sequence
        check_seq
            if True, asserts that seq is valid for this moltype
        **kwargs
            passed through to the sequence constructor

        Raises
        ------
        AssertionError
            if check_seq is True and seq is not valid for this moltype
        """
        if check_seq:
            assert self.is_valid(
                seq
            ), f"{seq[:4]!r} not valid for moltype {self.name!r}"
        return self._make_seq(moltype=self, seq=seq or "", name=name, **kwargs)

    @functools.singledispatchmethod
    def complement(self, seq: StrORBytesORArray) -> str:
        """converts a string, bytes or array into its nucleic acid complement"""
        raise TypeError(f"{type(seq)} not supported")

    @complement.register
    def _(self, seq: str) -> str:
        return self.complement(seq.encode("utf8"))

    @complement.register
    def _(self, seq: bytes) -> str:
        return self._complement(seq).decode("utf8")

    @complement.register
    def _(self, seq: numpy.ndarray) -> str:
        return self.complement(self.degen_gapped_alphabet.array_to_bytes(seq))

    def rc(self, seq: str) -> str:
        """reverse complement of a sequence"""
        return self.complement(seq)[::-1]

    @functools.singledispatchmethod
    def is_degenerate(self, seq: StrORBytesORArray) -> bool:
        """checks if a sequence contains degenerate characters"""
        raise TypeError(f"{type(seq)} not supported")

    @is_degenerate.register
    def _(self, seq: bytes) -> bool:
        return self.is_degenerate(self.degen_gapped_alphabet.to_indices(seq))

    @is_degenerate.register
    def _(self, seq: str) -> bool:
        return self.is_degenerate(self.degen_gapped_alphabet.to_indices(seq))

    @is_degenerate.register
    def _(self, seq: numpy.ndarray) -> bool:
        # what index is the first degenerate character
        for index, val in enumerate(self.degen_gapped_alphabet):
            if val in self.ambiguities:
                break
        else:
            return False
        # every index from the first degenerate character onwards counts
        return (seq >= index).any()

    @functools.singledispatchmethod
    def is_gapped(self, seq) -> bool:
        """checks if a sequence contains gaps"""
        raise TypeError(f"{type(seq)} not supported")

    @is_gapped.register
    def _(self, seq: str) -> bool:
        return self.gapped_alphabet.gap_char in seq

    @is_gapped.register
    def _(self, seq: bytes) -> bool:
        return self.is_gapped(seq.decode("utf8"))

    @is_gapped.register
    def _(self, seq: numpy.ndarray) -> bool:
        return (seq == self.degen_gapped_alphabet.gap_index).any()

    @functools.singledispatchmethod
    def get_degenerate_positions(self, seq, include_gap=True) -> numpy.ndarray:
        """returns a boolean array indicating degenerate positions

        Parameters
        ----------
        seq
            the sequence as a str, bytes or array of alphabet indices
        include_gap
            if True, gap positions are also flagged
        """
        raise TypeError(f"{type(seq)} not supported")

    @get_degenerate_positions.register
    def _(self, seq: numpy.ndarray, include_gap=True) -> numpy.ndarray:
        # find the index of the first gap (if included) or degenerate
        # character; all indices from there onwards are flagged
        for index, val in enumerate(self.degen_gapped_alphabet):
            if include_gap and val in self.gap or val in self.ambiguities:
                break
        return seq >= index

    @get_degenerate_positions.register
    def _(self, seq: str, include_gap=True) -> numpy.ndarray:
        return self.get_degenerate_positions(
            self.degen_gapped_alphabet.to_indices(seq), include_gap
        )

    @get_degenerate_positions.register
    def _(self, seq: bytes, include_gap=True) -> numpy.ndarray:
        return self.get_degenerate_positions(
            self.degen_gapped_alphabet.to_indices(seq), include_gap
        )

    def get_css_style(
        self,
        colors: typing.Optional[dict[str, str]] = None,
        font_size: int = 12,
        font_family="Lucida Console",
    ):
        """returns string of CSS classes and {character: <CSS class name>, ...}

        Parameters
        ----------
        colors
             A dictionary mapping characters to CSS color values. It is also
             looked up with the keys "terminal_ambig" and "ambig".
        font_size
            Font size in points.
        font_family
            Name of a monospace font.

        Returns
        -------
        A list of CSS rule strings and the dict mapping each character to
        its CSS class name.
        """
        colors = colors or self._colors
        # !important required to stop some browsers over-riding the style sheet ...!!
        template = (
            '.%s_%s{font-family: "%s",monospace !important; '
            "font-size: %dpt !important; color: %s; }"
        )
        label = self.label or ""
        # copy so the module-level defaults are not modified
        styles = _STYLE_DEFAULTS[label].copy()
        styles.update(
            {c: "_".join([c, label]) for c in list(self.alphabet) + ["terminal_ambig"]}
        )

        css = [
            template % (char, label, font_family, font_size, colors[char])
            for char in list(styles) + ["ambig"]
        ]

        return css, styles


def _make_moltype_dict() -> dict[str, MolType]:
    """Collect every MolType instance bound in this module, keyed by its name."""
    return {
        candidate.name: candidate
        for candidate in globals().values()
        if isinstance(candidate, MolType)
    }


def get_moltype(name: typing.Union[str, MolType]) -> MolType:
    """returns the moltype with the matching name attribute

    A MolType instance is returned unchanged. A string is matched
    case-insensitively; ValueError is raised if no moltype has that name.
    """
    if isinstance(name, MolType):
        return name

    key = name.lower()
    if key in _moltypes:
        return _moltypes[key]

    raise ValueError(f"unknown moltype {key!r}")


def available_moltypes():
    """returns Table listing the available moltypes"""
    from cogent3.util.table import Table

    def _as_row(abbrev, moltype):
        # the text of moltypes with more than 10 states is truncated
        num_states = len(list(moltype))
        text = str(moltype)
        if num_states > 10:
            text = f"{text[:39]}..."
        return [abbrev, num_states, text]

    table = Table(
        header=["Abbreviation", "Number of states", "Moltype"],
        data=[_as_row(n, m) for n, m in _moltypes.items()],
        title="Specify a moltype by the Abbreviation (case insensitive).",
        index_name="Abbreviation",
    )
    table = table.sorted(columns=["Number of states", "Abbreviation"])
    table.format_column("Abbreviation", repr)
    return table


# constant instances of the core molecular types
# A default type for text read from a file etc. when we don't
# want to prematurely assume DNA or Protein.
ASCII = MolType(
    name="text",
    monomers=ascii_letters,
    make_seq=new_sequence.Sequence,
)

DNA = MolType(
    name="dna",
    monomers="".join(IUPAC_DNA_chars),
    make_seq=new_sequence.DnaSequence,
    complements=IUPAC_DNA_ambiguities_complements,
    ambiguities=IUPAC_DNA_ambiguities,
    colors=NT_COLORS,
)

# NOTE(review): the pairing_rules field is annotated as a dict keyed by str,
# while RNA_STANDARD_PAIRS is keyed by frozenset -- confirm which is intended.
RNA = MolType(
    name="rna",
    monomers="".join(IUPAC_RNA_chars),
    make_seq=new_sequence.RnaSequence,
    complements=IUPAC_RNA_ambiguities_complements,
    ambiguities=IUPAC_RNA_ambiguities,
    colors=NT_COLORS,
    pairing_rules=RNA_STANDARD_PAIRS,
)

PROTEIN = MolType(
    name="protein",
    monomers="".join(IUPAC_PROTEIN_chars),
    make_seq=new_sequence.ProteinSequence,
    ambiguities=IUPAC_PROTEIN_ambiguities,
    colors=AA_COLORS,
)

PROTEIN_WITH_STOP = MolType(
    name="protein_with_stop",
    monomers="".join(PROTEIN_WITH_STOP_chars),
    make_seq=new_sequence.ProteinWithStopSequence,
    ambiguities=PROTEIN_WITH_STOP_ambiguities,
    colors=AA_COLORS,
)

# A default type for arbitrary chars read from a file etc. when we don't
# want to prematurely assume _anything_ about the data.
# The monomers are all 256 single-byte values; no gap or missing character.
BYTES = MolType(
    name="bytes",
    monomers=bytes(range(256)),
    make_seq=new_sequence.ByteSequence,
    gap=None,
    missing=None,
)

# Default CSS class names per moltype label: any character without its own
# entry maps to "ambig_<label>".
# the None value catches cases where a moltype has no label attribute (it
# contributes the "" key)
_STYLE_DEFAULTS = {
    label: defaultdict(_DefaultValue(f"ambig_{label}"))
    for mt in (ASCII, BYTES, DNA, RNA, PROTEIN, PROTEIN_WITH_STOP, None)
    for label in (getattr(mt, "label", ""),)
}

# build this at end of file, after every MolType instance has been defined
_moltypes = _make_moltype_dict()

1	import dataclasses	12✔
2	import functools	12✔
3	import itertools	12✔
4	import typing	12✔
5
6	from collections import defaultdict	12✔
7	from string import ascii_letters	12✔
8
9	import numpy	12✔
10
11	from cogent3.core import new_alphabet, new_sequence	12✔
12
13
14	OptStr = typing.Optional[str]	12✔
15	OptCallable = typing.Optional[typing.Callable]	12✔
16	SeqStrType = typing.Union[list[str], tuple[str, ...]]	12✔
17	StrORBytes = typing.Union[str, bytes]	12✔
18	StrORBytesORArray = typing.Union[str, bytes, numpy.ndarray]	12✔
19	SeqStrBytesType = typing.Union[list[StrORBytes], tuple[StrORBytes, ...]]	12✔
20	StrORArray = typing.Union[str, numpy.ndarray]	12✔
21
22	IUPAC_gap = "-"	12✔
23
24	IUPAC_missing = "?"	12✔
25
26	IUPAC_DNA_chars = "T", "C", "A", "G"	12✔
27	IUPAC_DNA_ambiguities = {	12✔
28	"N": frozenset(("A", "C", "T", "G")),
29	"R": frozenset(("A", "G")),
30	"Y": frozenset(("C", "T")),
31	"W": frozenset(("A", "T")),
32	"S": frozenset(("C", "G")),
33	"K": frozenset(("T", "G")),
34	"M": frozenset(("C", "A")),
35	"B": frozenset(("C", "T", "G")),
36	"D": frozenset(("A", "T", "G")),
37	"H": frozenset(("A", "C", "T")),
38	"V": frozenset(("A", "C", "G")),
39	}
40	IUPAC_DNA_ambiguities_complements = {	12✔
41	"A": "T",
42	"C": "G",
43	"G": "C",
44	"T": "A",
45	"-": "-",
46	"M": "K",
47	"K": "M",
48	"N": "N",
49	"R": "Y",
50	"Y": "R",
51	"W": "W",
52	"S": "S",
53	"X": "X", # not technically an IUPAC ambiguity, but used by repeatmasker
54	"V": "B",
55	"B": "V",
56	"H": "D",
57	"D": "H",
58	"?": "?",
59	}
60
61	IUPAC_DNA_complements = {"A": "T", "C": "G", "G": "C", "T": "A", "-": "-"}	12✔
62	# Standard DNA pairing: only Watson-Crick pairs count as pairs
63	DNA_STANDARD_PAIRS = {	12✔
64	frozenset(("A", "T")): True,
65	frozenset(("C", "G")): True,
66	}
67
68	# note change in standard order from DNA
69	IUPAC_RNA_chars = ["U", "C", "A", "G"]	12✔
70	IUPAC_RNA_ambiguities = {	12✔
71	"N": frozenset(("A", "C", "U", "G")),
72	"R": frozenset(("A", "G")),
73	"Y": frozenset(("C", "U")),
74	"W": frozenset(("A", "U")),
75	"S": frozenset(("C", "G")),
76	"K": frozenset(("U", "G")),
77	"M": frozenset(("C", "A")),
78	"B": frozenset(("C", "U", "G")),
79	"D": frozenset(("A", "U", "G")),
80	"H": frozenset(("A", "C", "U")),
81	"V": frozenset(("A", "C", "G")),
82	}
83
84	IUPAC_RNA_ambiguities_complements = {	12✔
85	"A": "U",
86	"C": "G",
87	"G": "C",
88	"U": "A",
89	"-": "-",
90	"M": "K",
91	"K": "M",
92	"N": "N",
93	"R": "Y",
94	"Y": "R",
95	"W": "W",
96	"S": "S",
97	"X": "X", # not technically an IUPAC ambiguity, but used by repeatmasker
98	"V": "B",
99	"B": "V",
100	"H": "D",
101	"D": "H",
102	"?": "?",
103	}
104
105	IUPAC_RNA_complements = {"A": "U", "C": "G", "G": "C", "U": "A", "-": "-"}	12✔
106
107	# Standard RNA pairing: GU pairs count as 'weak' pairs
108	RNA_STANDARD_PAIRS = {	12✔
109	frozenset(("A", "U")): True, # True vs False for 'always' vs 'sometimes' pairing
110	frozenset(("C", "G")): True,
111	frozenset(("G", "U")): False,
112	}
113
114	# Watson-Crick RNA pairing only: GU pairs don't count as pairs
115	RNA_W_C_PAIRS = {	12✔
116	frozenset(("A", "U")): True,
117	frozenset(("C", "G")): True,
118	frozenset(("U", "A")): True,
119	}
120
121	# RNA pairing with GU counted as standard pairs
122	RNA_G_U_PAIRS = {	12✔
123	frozenset(("A", "U")): True,
124	frozenset(("C", "G")): True,
125	frozenset(("G", "C")): True,
126	frozenset(("U", "A")): True,
127	frozenset(("G", "U")): True,
128	frozenset(("U", "G")): True,
129	}
130
131	# RNA pairing with GU, AA, GA, CA and UU mismatches allowed as weak pairs
132	RNA_EXTENDED_PAIRS = {	12✔
133	frozenset({"A", "U"}): True,
134	frozenset({"C", "G"}): True,
135	frozenset({"G", "U"}): False,
136	frozenset({"A"}): False,
137	frozenset({"A", "G"}): False,
138	frozenset({"A", "C"}): False,
139	frozenset({"U"}): False,
140	}
141
142
143	def make_pairs(	12✔
144	*,
145	pairs: dict[tuple[str, str], bool] = None,
146	monomers: tuple[str, ...] = None,
147	gaps: str = None,
148	degenerates: dict[str, set[str]] = None,
149	) -> dict[frozenset[str], bool]:
150	"""Makes a dict of symbol pairs (i,j) -> strictness.
151
152	Expands pairs into all possible pairs using degen symbols.
153	Strictness is True if i and j always pair, and False if they 'weakly' pair
154	(e.g. GU pairs or if it is possible that they pair).
155
156	If you want to make GU pairs count as 'always matching', pass in pairs
157	that have (G,U) and (U, G) mapped to True rather than False.
158	"""
159	result = {}	12✔
160	pairs = pairs or {}	12✔
161	monomers = monomers or ()	12✔
162	gaps = gaps or ()	12✔
163	degenerates = degenerates or {}	12✔
164
165	result |= pairs	12✔
166	result |= {frozenset((i, j)): False for i in gaps for j in gaps}	12✔
167	for b, d in itertools.product(monomers, degenerates):	12✔
168	if any(frozenset((b, e)) in pairs for e in degenerates[d]):	12✔
169	result[frozenset((b, d))] = False	12✔
170
171	for d1, d2 in itertools.combinations_with_replacement(degenerates, 2):	12✔
172	if any(	12✔
173	frozenset((e1, e2)) in pairs
174	for e1 in degenerates[d1]
175	for e2 in degenerates[d2]
176	):
177	result[frozenset((d1, d2))] = False	12✔
178
179	return result	12✔
180
181
182	# RNA_PAIRING_RULES is a dict of {name:(base_pairs,degen_pairs)} where base_pairs
183	# is a dict with the non-degenerate pairing rules and degen_pairs is a dict with
184	# both the degenerate and non-degenerate pairing rules.
185	# NOTE: uses make_pairs to augment the initial dict after construction.
186	def _build_pairing_rules() -> dict[frozenset[str], bool]:	12✔
187	pairing_rules = {	12✔
188	"Standard": RNA_STANDARD_PAIRS,
189	"WC": RNA_W_C_PAIRS,
190	"GU": RNA_G_U_PAIRS,
191	"Extended": RNA_EXTENDED_PAIRS,
192	}
193	for k, v in list(pairing_rules.items()):	12✔
194	pairing_rules[k] = (v, make_pairs(pairs=v))	12✔
195	return pairing_rules	12✔
196
197
198	RNA_PAIRING_RULES = _build_pairing_rules()	12✔
199
200	# protein letters & ambiguity codes
201	IUPAC_PROTEIN_code_aa = {	12✔
202	"A": "Alanine",
203	"C": "Cysteine",
204	"D": "Aspartic Acid",
205	"E": "Glutamic Acid",
206	"F": "Phenylalanine",
207	"G": "Glycine",
208	"H": "Histidine",
209	"I": "Isoleucine",
210	"K": "Lysine",
211	"L": "Leucine",
212	"M": "Methionine",
213	"N": "Asparagine",
214	"P": "Proline",
215	"Q": "Glutamine",
216	"R": "Arginine",
217	"S": "Serine",
218	"T": "Threonine",
219	"V": "Valine",
220	"W": "Tryptophan",
221	"Y": "Tyrosine",
222	"*": "STOP",
223	}
224
225	IUPAC_PROTEIN_chars = (	12✔
226	"A",
227	"C",
228	"D",
229	"E",
230	"F",
231	"G",
232	"H",
233	"I",
234	"K",
235	"L",
236	"M",
237	"N",
238	"P",
239	"Q",
240	"R",
241	"S",
242	"T",
243	"U",
244	"V",
245	"W",
246	"Y",
247	)
248
249	PROTEIN_WITH_STOP_chars = (	12✔
250	"A",
251	"C",
252	"D",
253	"E",
254	"F",
255	"G",
256	"H",
257	"I",
258	"K",
259	"L",
260	"M",
261	"N",
262	"P",
263	"Q",
264	"R",
265	"S",
266	"T",
267	"U",
268	"V",
269	"W",
270	"Y",
271	"*",
272	)
273
274	IUPAC_PROTEIN_ambiguities = {"B": ["N", "D"], "X": IUPAC_PROTEIN_chars, "Z": ["Q", "E"]}	12✔
275
276	PROTEIN_WITH_STOP_ambiguities = {	12✔
277	"B": ["N", "D"],
278	"X": PROTEIN_WITH_STOP_chars,
279	"Z": ["Q", "E"],
280	}
281
282	# styling for moltype display
283
284
285	def _expand_colors(base, colors):	12✔
286	base = base.copy()	12✔
287	base.update({ch: clr for chars, clr in colors.items() for ch in chars})	12✔
288	return base	12✔
289
290
291	class _DefaultValue:	12✔
292	def __init__(self, value):	12✔
293	self.value = value	12✔
294
295	def __call__(self):	12✔
296	return self.value	12✔
297
298
299	_gray = _DefaultValue("gray")	12✔
300	_base_colors = defaultdict(_gray)	12✔
301
302	NT_COLORS = _expand_colors(	12✔
303	_base_colors, {"A": "#FF0102", "C": "black", "G": "green", "T": "blue", "U": "blue"}
304	)
305
306	AA_COLORS = _expand_colors(	12✔
307	_base_colors,
308	{
309	"GAVLI": "#009999",
310	"FYW": "#ff6600",
311	"CM": "orange",
312	"ST": "#009900",
313	"KRH": "#FF0102",
314	"DE": "blue",
315	"NQ": "#993300",
316	"P": "#cc0099",
317	},
318	)
319
320
321	@dataclasses.dataclass	12✔
322	class MolType:	12✔
323	name: str	12✔
324	monomers: dataclasses.InitVar[StrORBytes]	12✔
325	make_seq: dataclasses.InitVar[typing.Type]	12✔
326	gap: OptStr = IUPAC_gap	12✔
327	missing: OptStr = IUPAC_missing	12✔
328	complements: dataclasses.InitVar[typing.Optional[dict[str, str]]] = None	12✔
329	ambiguities: typing.Optional[dict[str, tuple[str, ...]]] = None	12✔
330	colors: dataclasses.InitVar[typing.Optional[dict[str, str]]] = None	12✔
331	pairing_rules: typing.Optional[dict[str, dict[frozenset[str], bool]]] = None	12✔
332
333	# private attributes to be delivered via properties
334	_monomers: new_alphabet.CharAlphabet = dataclasses.field(init=False)	12✔
335	_gapped: new_alphabet.CharAlphabet = dataclasses.field(init=False)	12✔
336	_degen: new_alphabet.CharAlphabet = dataclasses.field(init=False)	12✔
337	_degen_gapped: new_alphabet.CharAlphabet = dataclasses.field(init=False)	12✔
338	_colors: dict[str, str] = dataclasses.field(init=False)	12✔
339
340	# how to connect this to the sequence constructor and avoid
341	# circular imports
342	_make_seq: typing.Callable = dataclasses.field(init=False)	12✔
343	_complement: OptCallable = dataclasses.field(init=False, default=None)	12✔
344
345	def __post_init__(	12✔
346	self,
347	monomers: StrORBytes,
348	make_seq: typing.Type,
349	complements: typing.Optional[dict[str, str]],
350	colors: typing.Optional[dict[str, str]],
351	):
352	self._colors = colors or defaultdict(_DefaultValue("black"))	12✔
353	self._make_seq = make_seq	12✔
354	gap = new_alphabet._coerce_to_type(monomers, self.gap or "")	12✔
355	missing = new_alphabet._coerce_to_type(monomers, self.missing or "")	12✔
356	ambigs = new_alphabet._coerce_to_type(monomers, "".join(self.ambiguities or ""))	12✔
357
358	self._monomers = new_alphabet.make_alphabet(	12✔
359	chars=monomers, gap=None, moltype=self
360	)
361	self._degen = (	12✔
362	new_alphabet.make_alphabet(chars=monomers + ambigs, gap=None, moltype=self)
363	if ambigs
364	else None
365	)
366	self._gapped = (	12✔
367	new_alphabet.make_alphabet(chars=monomers + gap, gap=self.gap, moltype=self)
368	if gap
369	else None
370	)
371	self._degen_gapped = (	12✔
372	new_alphabet.make_alphabet(
373	chars=monomers + gap + ambigs + missing, gap=self.gap, moltype=self
374	)
375	if ambigs and gap
376	else None
377	)
378	if complements:	12✔
379	# assume we have a nucleic acid moltype
380	dest = "".join(complements[c] for c in self.degen_gapped_alphabet)	12✔
381	self._complement = new_alphabet.convert_alphabet(	12✔
382	self.degen_gapped_alphabet.as_bytes(),
383	dest.encode("utf8"),
384	)
385
386	def __repr__(self):	12✔
387	name = self.__class__.__name__	12✔
388	return f"{name}({self.alphabet})"	12✔
389
390	def __hash__(self):	12✔
391	return id(self)	×
392
393	def __eq__(self, other):	12✔
394	return id(self) == id(other)	×
395
396	def __len__(self) -> int:	12✔
397	return len(self._monomers)	12✔
398
399	def __iter__(self):	12✔
400	yield from self._monomers	12✔
401
402	@property	12✔
403	def label(self):	12✔
404	"""synonym for name"""
405	return self.name	12✔
406
407	@property	12✔
408	def alphabet(self):	12✔
409	"""monomers"""
410	return self._monomers	12✔
411
412	@property	12✔
413	def degen_alphabet(self):	12✔
414	"""monomers + ambiguous characters"""
415	return self._degen	×
416
417	@property	12✔
418	def gapped_alphabet(self):	12✔
419	"""monomers + gap"""
420	return self._gapped	12✔
421
422	@property	12✔
423	def degen_gapped_alphabet(self):	12✔
424	"""monomers + gap + ambiguous characters"""
425	return self._degen_gapped	12✔
426
def is_valid(self, seq: StrORArray) -> bool:
    """Validate ``seq`` against the most degenerate alphabet that is defined.

    Alphabets are tried from most to least permissive; the first one that
    is truthy performs the check.
    """
    candidates = (self._degen_gapped, self._degen, self._gapped, self._monomers)
    most_degenerate = next(filter(None, candidates))
    return most_degenerate.is_valid(seq)
435
def iter_alphabets(self):
    """Yield each alphabet that is defined for this moltype.

    Order is monomers, gapped, degenerate, degenerate + gapped; alphabets
    that are falsy (e.g. ``None``) are skipped.
    """
    for alpha in (self._monomers, self._gapped, self._degen, self._degen_gapped):
        if alpha:
            yield alpha
440
def is_compatible_alphabet(
    self, alphabet: new_alphabet.CharAlphabet, strict: bool = True
) -> bool:
    """Check whether ``alphabet`` matches one of this moltype's alphabets.

    Parameters
    ----------
    alphabet
        an Alphabet instance
    strict
        if True, ``alphabet`` must compare equal to a bound alphabet, so
        the order of elements must match. If False, only the sets of
        elements are compared.
    """
    if strict:
        return any(alpha == alphabet for alpha in self.iter_alphabets())

    wanted = set(alphabet)
    return any(set(alpha) == wanted for alpha in self.iter_alphabets())
458
def make_seq(self, *, seq: str, name: OptStr = None, check_seq=True, **kwargs):
    """Create a sequence object of this molecular type.

    All arguments are keyword-only.

    Parameters
    ----------
    seq
        the sequence data
    name
        optional name for the sequence
    check_seq
        if True, ``seq`` is validated with ``self.is_valid`` before the
        sequence is constructed
    **kwargs
        passed through unchanged to the sequence constructor stored in
        ``self._make_seq``

    Raises
    ------
    AssertionError
        if ``check_seq`` is True and ``seq`` is not valid for this moltype
    """
    # Raise explicitly rather than via an ``assert`` statement so the check
    # is not stripped when Python runs with -O; the exception type and the
    # message are the same as the assert produced.
    if check_seq and not self.is_valid(seq):
        raise AssertionError(f"{seq[:4]!r} not valid for moltype {self.name!r}")
    return self._make_seq(moltype=self, seq=seq or "", name=name, **kwargs)
465
@functools.singledispatchmethod
def complement(self, seq: StrORBytesORArray) -> str:
    """Return the nucleic acid complement of ``seq`` as a string.

    Dispatches on the type of ``seq``; handlers are registered below for
    str, bytes and numpy arrays. Any other type raises a TypeError.
    """
    raise TypeError(f"{type(seq)} not supported")

@complement.register
def _(self, seq: str) -> str:
    # text is encoded, then handled by the bytes implementation
    encoded = seq.encode("utf8")
    return self.complement(encoded)

@complement.register
def _(self, seq: bytes) -> str:
    complemented = self._complement(seq)
    return complemented.decode("utf8")

@complement.register
def _(self, seq: numpy.ndarray) -> str:
    # arrays are converted to bytes via the degenerate + gapped alphabet
    as_bytes = self.degen_gapped_alphabet.array_to_bytes(seq)
    return self.complement(as_bytes)
482
def rc(self, seq: str) -> str:
    """Return the reverse complement of a sequence."""
    complemented = self.complement(seq)
    return complemented[::-1]
486
@functools.singledispatchmethod
def is_degenerate(self, seq: StrORBytesORArray) -> bool:
    """Report whether ``seq`` contains degenerate characters.

    Dispatches on the type of ``seq``; handlers are registered below for
    bytes, str and numpy arrays. Any other type raises a TypeError.
    """
    raise TypeError(f"{type(seq)} not supported")

@is_degenerate.register
def _(self, seq: bytes) -> bool:
    indices = self.degen_gapped_alphabet.to_indices(seq)
    return self.is_degenerate(indices)

@is_degenerate.register
def _(self, seq: str) -> bool:
    indices = self.degen_gapped_alphabet.to_indices(seq)
    return self.is_degenerate(indices)

@is_degenerate.register
def _(self, seq: numpy.ndarray) -> bool:
    # Find the index of the first ambiguity character in the alphabet;
    # every element with an index at or beyond it counts as degenerate.
    ambiguities = self.ambiguities
    first_degen = next(
        (
            position
            for position, char in enumerate(self.degen_gapped_alphabet)
            if char in ambiguities
        ),
        None,
    )
    if first_degen is None:
        return False
    return (seq >= first_degen).any()
509
@functools.singledispatchmethod
def is_gapped(self, seq) -> bool:
    """Report whether ``seq`` contains the gap character.

    Dispatches on the type of ``seq``; handlers are registered below for
    str, bytes and numpy arrays. Any other type raises a TypeError.
    """
    raise TypeError(f"{type(seq)} not supported")

@is_gapped.register
def _(self, seq: str) -> bool:
    return self.gapped_alphabet.gap_char in seq

# NOTE: the bytes handler used to be registered twice with identical
# bodies; the second registration silently replaced the first, so only
# one is kept.
@is_gapped.register
def _(self, seq: bytes) -> bool:
    return self.is_gapped(seq.decode("utf8"))

@is_gapped.register
def _(self, seq: numpy.ndarray) -> bool:
    # array elements are compared against the gap's index in the
    # degenerate + gapped alphabet
    return (seq == self.degen_gapped_alphabet.gap_index).any()
530
@functools.singledispatchmethod
def get_degenerate_positions(self, seq, include_gap=True) -> numpy.ndarray:
    """Return a boolean array indicating degenerate positions.

    Dispatches on the type of ``seq``; handlers are registered below for
    numpy arrays, str and bytes. Any other type raises a TypeError.

    Parameters
    ----------
    seq
        the sequence to examine
    include_gap
        if True, gap characters are also flagged as degenerate
    """
    raise TypeError(f"{type(seq)} not supported")

@get_degenerate_positions.register
def _(self, seq: numpy.ndarray, include_gap=True) -> numpy.ndarray:
    # Locate the first alphabet element that counts as degenerate; every
    # array element with an index at or beyond it is flagged.
    for index, val in enumerate(self.degen_gapped_alphabet):
        if (include_gap and val in self.gap) or val in self.ambiguities:
            break
    else:
        # No gap / ambiguity character in the alphabet: nothing can be
        # degenerate. Without this branch the last alphabet element would
        # be wrongly flagged (and an empty alphabet would leave ``index``
        # unbound). Mirrors the for/else in is_degenerate.
        return numpy.zeros(seq.shape, dtype=bool)
    return seq >= index

@get_degenerate_positions.register
def _(self, seq: str, include_gap=True) -> numpy.ndarray:
    return self.get_degenerate_positions(
        self.degen_gapped_alphabet.to_indices(seq), include_gap
    )

@get_degenerate_positions.register
def _(self, seq: bytes, include_gap=True) -> numpy.ndarray:
    return self.get_degenerate_positions(
        self.degen_gapped_alphabet.to_indices(seq), include_gap
    )
555
def get_css_style(
    self,
    colors: typing.Optional[dict[str, str]] = None,
    font_size: int = 12,
    font_family="Lucida Console",
):
    """Build CSS rules for displaying sequences of this moltype.

    Parameters
    ----------
    colors
        A dictionary mapping characters to CSS color values. Falls back
        to ``self._colors`` when not provided (or empty).
    font_size
        Font size in points.
    font_family
        Name of a monospace font.

    Returns
    -------
    A list of CSS rule strings and a mapping of
    {character: <CSS class name>, ...}.
    """
    palette = colors or self._colors
    label = self.label or ""

    # start from a copy of this label's defaults, then add one CSS class
    # name per monomer plus the "terminal_ambig" entry
    styles = _STYLE_DEFAULTS[label].copy()
    for char in [*self.alphabet, "terminal_ambig"]:
        styles[char] = "_".join((char, label))

    # !important required to stop some browsers over-riding the style sheet ...!!
    template = (
        '.%s_%s{font-family: "%s",monospace !important; '
        "font-size: %dpt !important; color: %s; }"
    )
    css = []
    for char in [*styles, "ambig"]:
        css.append(template % (char, label, font_family, font_size, palette[char]))

    return css, styles
592
593
def _make_moltype_dict() -> dict[str, MolType]:
    """Map the ``name`` of every module-level MolType instance to the instance."""
    return {
        obj.name: obj for obj in globals().values() if isinstance(obj, MolType)
    }
604
605
def get_moltype(name: typing.Union[str, MolType]) -> MolType:
    """Return the moltype whose name attribute matches ``name``.

    A MolType instance is returned unchanged. A string is lower-cased
    before lookup; an unknown name raises a ValueError.
    """
    if isinstance(name, MolType):
        return name

    key = name.lower()
    if key in _moltypes:
        return _moltypes[key]
    raise ValueError(f"unknown moltype {key!r}")
614
615
def available_moltypes():
    """Return a Table listing the available moltypes.

    One row per registered moltype: its abbreviation, its number of
    states and its string form (truncated when there are more than 10
    states). Rows are sorted by number of states, then abbreviation.
    """
    from cogent3.util.table import Table

    rows = []
    for abbrev, moltype in _moltypes.items():
        text = str(moltype)
        num_states = len(moltype)
        display = f"{text[:39]}..." if num_states > 10 else text
        rows.append([abbrev, num_states, display])

    table = Table(
        header=["Abbreviation", "Number of states", "Moltype"],
        data=rows,
        title="Specify a moltype by the Abbreviation (case insensitive).",
        index_name="Abbreviation",
    )
    table = table.sorted(columns=["Number of states", "Abbreviation"])
    table.format_column("Abbreviation", repr)
    return table
635
636
# Module-level constant instances of the core molecular types.

# Generic text: a default type for text read from a file etc. when we
# don't want to prematurely assume DNA or Protein.
ASCII = MolType(
    name="text",
    monomers=ascii_letters,
    make_seq=new_sequence.Sequence,
)

DNA = MolType(
    name="dna",
    monomers="".join(IUPAC_DNA_chars),
    ambiguities=IUPAC_DNA_ambiguities,
    complements=IUPAC_DNA_ambiguities_complements,
    colors=NT_COLORS,
    make_seq=new_sequence.DnaSequence,
)

RNA = MolType(
    name="rna",
    monomers="".join(IUPAC_RNA_chars),
    ambiguities=IUPAC_RNA_ambiguities,
    complements=IUPAC_RNA_ambiguities_complements,
    colors=NT_COLORS,
    make_seq=new_sequence.RnaSequence,
    pairing_rules=RNA_STANDARD_PAIRS,
)

PROTEIN = MolType(
    name="protein",
    monomers="".join(IUPAC_PROTEIN_chars),
    ambiguities=IUPAC_PROTEIN_ambiguities,
    colors=AA_COLORS,
    make_seq=new_sequence.ProteinSequence,
)

PROTEIN_WITH_STOP = MolType(
    name="protein_with_stop",
    monomers="".join(PROTEIN_WITH_STOP_chars),
    ambiguities=PROTEIN_WITH_STOP_ambiguities,
    colors=AA_COLORS,
    make_seq=new_sequence.ProteinWithStopSequence,
)

# Arbitrary bytes: a default type for arbitrary chars read from a file
# etc. when we don't want to prematurely assume _anything_ about the
# data. All 256 byte values are monomers; no gap or missing character.
BYTES = MolType(
    name="bytes",
    monomers=bytes(range(256)),
    gap=None,
    missing=None,
    make_seq=new_sequence.ByteSequence,
)
689
# One defaultdict of CSS class names per moltype label. The trailing None
# has no label attribute, so it contributes the "" key, which covers
# moltypes without a label.
_STYLE_DEFAULTS = {
    label: defaultdict(_DefaultValue(f"ambig_{label}"))
    for label in (
        getattr(mt, "label", "")
        for mt in (ASCII, BYTES, DNA, RNA, PROTEIN, PROTEIN_WITH_STOP, None)
    )
}

# Must stay at the end of the file: the lookup table is built by scanning
# the module globals, so every MolType constant has to exist already.
_moltypes = _make_moltype_dict()

rmcar17 / cogent3 / 9412281846

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous