• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

rmcar17 / cogent3 / 9412281846

07 Jun 2024 02:10AM UTC coverage: 90.379%. First build
9412281846

push

github

web-flow
Merge pull request #1890 from KatherineCaley/seq-collections-refactor

SequenceCollection refactor - general and annotation db methods

191 of 265 new or added lines in 3 files covered. (72.08%)

31788 of 35172 relevant lines covered (90.38%)

10.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.55
/src/cogent3/core/new_moltype.py
1
import dataclasses
12✔
2
import functools
12✔
3
import itertools
12✔
4
import typing
12✔
5

6
from collections import defaultdict
12✔
7
from string import ascii_letters
12✔
8

9
import numpy
12✔
10

11
from cogent3.core import new_alphabet, new_sequence
12✔
12

13

14
# type aliases used in annotations throughout this module
OptStr = typing.Optional[str]
OptCallable = typing.Optional[typing.Callable]
SeqStrType = typing.Union[list[str], tuple[str, ...]]
StrORBytes = typing.Union[str, bytes]
StrORBytesORArray = typing.Union[str, bytes, numpy.ndarray]
SeqStrBytesType = typing.Union[list[StrORBytes], tuple[StrORBytes, ...]]
StrORArray = typing.Union[str, numpy.ndarray]

IUPAC_gap = "-"

IUPAC_missing = "?"

IUPAC_DNA_chars = "T", "C", "A", "G"
# each ambiguity code maps to the set of canonical bases it can represent
IUPAC_DNA_ambiguities = {
    "N": frozenset(("A", "C", "T", "G")),
    "R": frozenset(("A", "G")),
    "Y": frozenset(("C", "T")),
    "W": frozenset(("A", "T")),
    "S": frozenset(("C", "G")),
    "K": frozenset(("T", "G")),
    "M": frozenset(("C", "A")),
    "B": frozenset(("C", "T", "G")),
    "D": frozenset(("A", "T", "G")),
    "H": frozenset(("A", "C", "T")),
    "V": frozenset(("A", "C", "G")),
}
# complement of every character, including gap, missing and ambiguity codes
IUPAC_DNA_ambiguities_complements = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "-": "-",
    "M": "K",
    "K": "M",
    "N": "N",
    "R": "Y",
    "Y": "R",
    "W": "W",
    "S": "S",
    "X": "X",  # not technically an IUPAC ambiguity, but used by repeatmasker
    "V": "B",
    "B": "V",
    "H": "D",
    "D": "H",
    "?": "?",
}

IUPAC_DNA_complements = {"A": "T", "C": "G", "G": "C", "T": "A", "-": "-"}
# Standard DNA pairing: only Watson-Crick pairs count as pairs
DNA_STANDARD_PAIRS = {
    frozenset(("A", "T")): True,
    frozenset(("C", "G")): True,
}

# note change in standard order from DNA
IUPAC_RNA_chars = ["U", "C", "A", "G"]
IUPAC_RNA_ambiguities = {
    "N": frozenset(("A", "C", "U", "G")),
    "R": frozenset(("A", "G")),
    "Y": frozenset(("C", "U")),
    "W": frozenset(("A", "U")),
    "S": frozenset(("C", "G")),
    "K": frozenset(("U", "G")),
    "M": frozenset(("C", "A")),
    "B": frozenset(("C", "U", "G")),
    "D": frozenset(("A", "U", "G")),
    "H": frozenset(("A", "C", "U")),
    "V": frozenset(("A", "C", "G")),
}

IUPAC_RNA_ambiguities_complements = {
    "A": "U",
    "C": "G",
    "G": "C",
    "U": "A",
    "-": "-",
    "M": "K",
    "K": "M",
    "N": "N",
    "R": "Y",
    "Y": "R",
    "W": "W",
    "S": "S",
    "X": "X",  # not technically an IUPAC ambiguity, but used by repeatmasker
    "V": "B",
    "B": "V",
    "H": "D",
    "D": "H",
    "?": "?",
}

IUPAC_RNA_complements = {"A": "U", "C": "G", "G": "C", "U": "A", "-": "-"}

# The pairing tables below are keyed by an unordered frozenset of the two
# symbols, so each pair is listed once only: frozenset(("A", "U")) and
# frozenset(("U", "A")) are the same key.

# Standard RNA pairing: GU pairs count as 'weak' pairs
RNA_STANDARD_PAIRS = {
    frozenset(("A", "U")): True,  # True vs False for 'always' vs 'sometimes' pairing
    frozenset(("C", "G")): True,
    frozenset(("G", "U")): False,
}

# Watson-Crick RNA pairing only: GU pairs don't count as pairs
RNA_W_C_PAIRS = {
    frozenset(("A", "U")): True,
    frozenset(("C", "G")): True,
}

# RNA pairing with GU counted as standard pairs
RNA_G_U_PAIRS = {
    frozenset(("A", "U")): True,
    frozenset(("C", "G")): True,
    frozenset(("G", "U")): True,
}

# RNA pairing with GU, AA, GA, CA and UU mismatches allowed as weak pairs
# (a single-element frozenset is a symbol paired with itself)
RNA_EXTENDED_PAIRS = {
    frozenset({"A", "U"}): True,
    frozenset({"C", "G"}): True,
    frozenset({"G", "U"}): False,
    frozenset({"A"}): False,
    frozenset({"A", "G"}): False,
    frozenset({"A", "C"}): False,
    frozenset({"U"}): False,
}
141

142

143
def make_pairs(
    *,
    pairs: typing.Optional[dict[frozenset[str], bool]] = None,
    monomers: typing.Optional[tuple[str, ...]] = None,
    gaps: typing.Optional[str] = None,
    degenerates: typing.Optional[dict[str, set[str]]] = None,
) -> dict[frozenset[str], bool]:
    """Makes a dict of symbol pairs {i, j} -> strictness.

    Expands pairs into all possible pairs using degen symbols.
    Strictness is True if i and j always pair, and False if they 'weakly' pair
    (e.g. GU pairs or if it is possible that they pair).

    If you want to make GU pairs count as 'always matching', pass in pairs
    that have the G,U pair mapped to True rather than False.

    Parameters
    ----------
    pairs
        the known pairs. Keys may be any iterable of the two symbols (a
        frozenset, or a tuple such as ("G", "U")); they are normalised to
        frozensets so symbol order does not matter.
    monomers
        the canonical symbols, paired against each degenerate symbol
    gaps
        the gap symbols; every gap-gap combination is recorded as a weak pair
    degenerates
        maps each degenerate symbol to the symbols it can represent

    Returns
    -------
    A new dict keyed by frozenset. The input ``pairs`` is not modified.
    """
    # normalise keys so the membership tests below, which all use frozensets,
    # also work when the caller supplies tuple keys
    pairs = {frozenset(key): strict for key, strict in (pairs or {}).items()}
    monomers = monomers or ()
    gaps = gaps or ()
    degenerates = degenerates or {}

    result = dict(pairs)
    result.update({frozenset((i, j)): False for i in gaps for j in gaps})

    # a monomer weakly pairs with a degenerate symbol if it pairs with any of
    # the symbols the degenerate can stand for
    for b, d in itertools.product(monomers, degenerates):
        if any(frozenset((b, e)) in pairs for e in degenerates[d]):
            result[frozenset((b, d))] = False

    # likewise for two degenerate symbols (including a symbol with itself)
    for d1, d2 in itertools.combinations_with_replacement(degenerates, 2):
        if any(
            frozenset((e1, e2)) in pairs
            for e1 in degenerates[d1]
            for e2 in degenerates[d2]
        ):
            result[frozenset((d1, d2))] = False

    return result
12✔
180

181

182
# RNA_PAIRING_RULES is a dict of {name: (base_pairs, degen_pairs)} where
# base_pairs is a dict with the non-degenerate pairing rules and degen_pairs is
# the result of passing base_pairs through make_pairs.
# NOTE: make_pairs is called with the pairs only (no monomers, gaps or
# degenerates), so degen_pairs currently holds the same entries as base_pairs.
def _build_pairing_rules() -> (
    dict[str, tuple[dict[frozenset[str], bool], dict[frozenset[str], bool]]]
):
    """Return the named RNA pairing rules as (base_pairs, degen_pairs) tuples."""
    named_rules = {
        "Standard": RNA_STANDARD_PAIRS,
        "WC": RNA_W_C_PAIRS,
        "GU": RNA_G_U_PAIRS,
        "Extended": RNA_EXTENDED_PAIRS,
    }
    return {
        name: (base_pairs, make_pairs(pairs=base_pairs))
        for name, base_pairs in named_rules.items()
    }


RNA_PAIRING_RULES = _build_pairing_rules()
12✔
199

200
# protein letters & ambiguity codes
# NOTE(review): "U" appears in IUPAC_PROTEIN_chars below but has no entry in
# this name table -- confirm whether that omission is intentional.
IUPAC_PROTEIN_code_aa = {
    "A": "Alanine",
    "C": "Cysteine",
    "D": "Aspartic Acid",
    "E": "Glutamic Acid",
    "F": "Phenylalanine",
    "G": "Glycine",
    "H": "Histidine",
    "I": "Isoleucine",
    "K": "Lysine",
    "L": "Leucine",
    "M": "Methionine",
    "N": "Asparagine",
    "P": "Proline",
    "Q": "Glutamine",
    "R": "Arginine",
    "S": "Serine",
    "T": "Threonine",
    "V": "Valine",
    "W": "Tryptophan",
    "Y": "Tyrosine",
    "*": "STOP",
}

IUPAC_PROTEIN_chars = (
    "A",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "K",
    "L",
    "M",
    "N",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "Y",
)

# the protein characters followed by the stop character
PROTEIN_WITH_STOP_chars = IUPAC_PROTEIN_chars + ("*",)

# each ambiguity code maps to the characters it can represent
IUPAC_PROTEIN_ambiguities = {"B": ["N", "D"], "X": IUPAC_PROTEIN_chars, "Z": ["Q", "E"]}

PROTEIN_WITH_STOP_ambiguities = {
    "B": ["N", "D"],
    "X": PROTEIN_WITH_STOP_chars,
    "Z": ["Q", "E"],
}
281

282
# styling for moltype display
283

284

285
def _expand_colors(base, colors):
12✔
286
    base = base.copy()
12✔
287
    base.update({ch: clr for chars, clr in colors.items() for ch in chars})
12✔
288
    return base
12✔
289

290

291
class _DefaultValue:
12✔
292
    def __init__(self, value):
12✔
293
        self.value = value
12✔
294

295
    def __call__(self):
12✔
296
        return self.value
12✔
297

298

299
# characters without an explicit color are displayed in gray
_gray = _DefaultValue("gray")
_base_colors = defaultdict(_gray)

# nucleotide colors, shared by the DNA and RNA moltypes
NT_COLORS = _expand_colors(
    _base_colors, {"A": "#FF0102", "C": "black", "G": "green", "T": "blue", "U": "blue"}
)

# amino acid colors; each key is a group of residues sharing one color
AA_COLORS = _expand_colors(
    _base_colors,
    {
        "GAVLI": "#009999",
        "FYW": "#ff6600",
        "CM": "orange",
        "ST": "#009900",
        "KRH": "#FF0102",
        "DE": "blue",
        "NQ": "#993300",
        "P": "#cc0099",
    },
)
319

320

321
@dataclasses.dataclass
class MolType:
    """A molecular type: its characters, alphabets and sequence constructor.

    Parameters
    ----------
    name
        label for the moltype
    monomers
        the canonical characters, as a str or bytes
    make_seq
        callable used by the make_seq() method to construct a sequence
    gap
        the gap character, or None if the moltype has no gap state
    missing
        the missing data character, or None
    complements
        maps every character to its complement. Only provide this for
        nucleic acid moltypes.
    ambiguities
        maps each ambiguity code to the characters it represents
    colors
        maps characters to colors for display. Defaults to black for
        every character.
    pairing_rules
        pairing rules; stored as given
    """

    name: str
    monomers: dataclasses.InitVar[StrORBytes]
    # NOTE(review): the make_seq() method defined further down shares this
    # name, so dataclasses appears to treat that method as this field's
    # default value -- confirm before renaming either.
    make_seq: dataclasses.InitVar[typing.Type]
    gap: OptStr = IUPAC_gap
    missing: OptStr = IUPAC_missing
    complements: dataclasses.InitVar[typing.Optional[dict[str, str]]] = None
    ambiguities: typing.Optional[dict[str, tuple[str, ...]]] = None
    colors: dataclasses.InitVar[typing.Optional[dict[str, str]]] = None
    pairing_rules: typing.Optional[dict[str, dict[frozenset[str], bool]]] = None

    # private attributes to be delivered via properties
    _monomers: new_alphabet.CharAlphabet = dataclasses.field(init=False)
    _gapped: new_alphabet.CharAlphabet = dataclasses.field(init=False)
    _degen: new_alphabet.CharAlphabet = dataclasses.field(init=False)
    _degen_gapped: new_alphabet.CharAlphabet = dataclasses.field(init=False)
    _colors: dict[str, str] = dataclasses.field(init=False)

    # how to connect this to the sequence constructor and avoid
    # circular imports
    _make_seq: typing.Callable = dataclasses.field(init=False)
    _complement: OptCallable = dataclasses.field(init=False, default=None)

    def __post_init__(
        self,
        monomers: StrORBytes,
        make_seq: typing.Type,
        complements: typing.Optional[dict[str, str]],
        colors: typing.Optional[dict[str, str]],
    ):
        """Build the alphabets and, if complements are given, the complement
        converter."""
        self._colors = colors or defaultdict(_DefaultValue("black"))
        self._make_seq = make_seq
        gap = new_alphabet._coerce_to_type(monomers, self.gap or "")
        missing = new_alphabet._coerce_to_type(monomers, self.missing or "")
        # joining a dict joins its keys, i.e. the ambiguity codes
        ambigs = new_alphabet._coerce_to_type(monomers, "".join(self.ambiguities or ""))

        self._monomers = new_alphabet.make_alphabet(
            chars=monomers, gap=None, moltype=self
        )
        # the remaining alphabets are None when the moltype lacks the
        # characters they require
        self._degen = (
            new_alphabet.make_alphabet(chars=monomers + ambigs, gap=None, moltype=self)
            if ambigs
            else None
        )
        self._gapped = (
            new_alphabet.make_alphabet(chars=monomers + gap, gap=self.gap, moltype=self)
            if gap
            else None
        )
        # character order here is monomers, gap, ambiguity codes, missing;
        # is_degenerate() and get_degenerate_positions() rely on that order
        self._degen_gapped = (
            new_alphabet.make_alphabet(
                chars=monomers + gap + ambigs + missing, gap=self.gap, moltype=self
            )
            if ambigs and gap
            else None
        )
        if complements:
            # assume we have a nucleic acid moltype
            dest = "".join(complements[c] for c in self.degen_gapped_alphabet)
            self._complement = new_alphabet.convert_alphabet(
                self.degen_gapped_alphabet.as_bytes(),
                dest.encode("utf8"),
            )

    def __repr__(self):
        name = self.__class__.__name__
        return f"{name}({self.alphabet})"

    # hashing and equality are by identity
    def __hash__(self):
        return id(self)

    def __eq__(self, other):
        return id(self) == id(other)

    def __len__(self) -> int:
        """number of monomers"""
        return len(self._monomers)

    def __iter__(self):
        """iterates over the monomers"""
        yield from self._monomers

    @property
    def label(self):
        """synonym for name"""
        return self.name

    @property
    def alphabet(self):
        """monomers"""
        return self._monomers

    @property
    def degen_alphabet(self):
        """monomers + ambiguous characters"""
        return self._degen

    @property
    def gapped_alphabet(self):
        """monomers + gap"""
        return self._gapped

    @property
    def degen_gapped_alphabet(self):
        """monomers + gap + ambiguous characters + missing"""
        return self._degen_gapped

    def is_valid(self, seq: StrORArray) -> bool:
        """checks against most degenerate alphabet"""
        # the monomer alphabet always exists, so next() always finds one
        alpha = next(
            alpha
            for alpha in (self._degen_gapped, self._degen, self._gapped, self._monomers)
            if alpha
        )
        return alpha.is_valid(seq)

    def iter_alphabets(self):
        """yield the different defined alphabets"""
        alphas = (self._monomers, self._gapped, self._degen, self._degen_gapped)
        yield from (a for a in alphas if a)

    def is_compatible_alphabet(
        self, alphabet: new_alphabet.CharAlphabet, strict: bool = True
    ) -> bool:
        """checks that characters in alphabet are equal to a bound alphabet

        Parameters
        ----------
        alphabet
            an Alphabet instance
        strict
            the order of elements must match
        """
        if not strict:
            query = set(alphabet)
            return any(set(alpha) == query for alpha in self.iter_alphabets())

        return any(alpha == alphabet for alpha in self.iter_alphabets())

    def make_seq(self, *, seq: str, name: OptStr = None, check_seq=True, **kwargs):
        """returns a sequence made by the constructor given at creation

        Parameters
        ----------
        seq
            the sequence data
        name
            the sequence name
        check_seq
            whether to check seq is valid for this moltype
        kwargs
            passed through to the sequence constructor
        """
        # NOTE: validation uses assert, so it is skipped when python runs
        # with -O; callers receive AssertionError on invalid input
        if check_seq:
            assert self.is_valid(
                seq
            ), f"{seq[:4]!r} not valid for moltype {self.name!r}"
        return self._make_seq(moltype=self, seq=seq or "", name=name, **kwargs)

    @functools.singledispatchmethod
    def complement(self, seq: StrORBytesORArray) -> str:
        """converts a string or bytes into its nucleic acid complement"""
        raise TypeError(f"{type(seq)} not supported")

    @complement.register
    def _(self, seq: str) -> str:
        return self.complement(seq.encode("utf8"))

    @complement.register
    def _(self, seq: bytes) -> str:
        if self._complement is None:
            # no complements were provided when this moltype was created
            raise TypeError(f"moltype {self.name!r} does not support complement")
        return self._complement(seq).decode("utf8")

    @complement.register
    def _(self, seq: numpy.ndarray) -> str:
        return self.complement(self.degen_gapped_alphabet.array_to_bytes(seq))

    def rc(self, seq: str) -> str:
        """reverse complement of a sequence"""
        return self.complement(seq)[::-1]

    @functools.singledispatchmethod
    def is_degenerate(self, seq: StrORBytesORArray) -> bool:
        """checks if a sequence contains degenerate characters

        Returns False if the moltype has no ambiguity codes.
        """
        raise TypeError(f"{type(seq)} not supported")

    @is_degenerate.register
    def _(self, seq: bytes) -> bool:
        if not self.ambiguities:
            return False
        return self.is_degenerate(self.degen_gapped_alphabet.to_indices(seq))

    @is_degenerate.register
    def _(self, seq: str) -> bool:
        if not self.ambiguities:
            return False
        return self.is_degenerate(self.degen_gapped_alphabet.to_indices(seq))

    @is_degenerate.register
    def _(self, seq: numpy.ndarray) -> bool:
        if not self.ambiguities:
            return False
        # what index is the first degenerate character; every state at or
        # after it in the alphabet is an ambiguity code or missing
        for index, val in enumerate(self.degen_gapped_alphabet):
            if val in self.ambiguities:
                break
        else:
            return False
        return (seq >= index).any()

    @functools.singledispatchmethod
    def is_gapped(self, seq) -> bool:
        """checks if a sequence contains gaps

        Returns False if the moltype has no gap character.
        """
        raise TypeError(f"{type(seq)} not supported")

    @is_gapped.register
    def _(self, seq: str) -> bool:
        gapped = self.gapped_alphabet
        if not gapped:
            return False
        return gapped.gap_char in seq

    @is_gapped.register
    def _(self, seq: bytes) -> bool:
        return self.is_gapped(seq.decode("utf8"))

    @is_gapped.register
    def _(self, seq: numpy.ndarray) -> bool:
        # a moltype with a gap but no ambiguities has no degen gapped
        # alphabet; both alphabets start with the monomers followed by the gap
        alpha = self._degen_gapped or self._gapped
        if not alpha:
            return False
        return (seq == alpha.gap_index).any()

    @functools.singledispatchmethod
    def get_degenerate_positions(self, seq, include_gap=True) -> numpy.ndarray:
        """returns a boolean array indicating degenerate positions

        Parameters
        ----------
        seq
            the sequence as a str, bytes or array of alphabet indices
        include_gap
            whether gap positions are also reported
        """
        raise TypeError(f"{type(seq)} not supported")

    @get_degenerate_positions.register
    def _(self, seq: numpy.ndarray, include_gap=True) -> numpy.ndarray:
        # find the first state to report; every later state is also reported
        for index, val in enumerate(self.degen_gapped_alphabet):
            if (include_gap and val == self.gap) or val in self.ambiguities:
                break
        else:
            # no gap or ambiguity state in the alphabet, so nothing to report
            return numpy.zeros(seq.shape, dtype=bool)
        return seq >= index

    @get_degenerate_positions.register
    def _(self, seq: str, include_gap=True) -> numpy.ndarray:
        return self.get_degenerate_positions(
            self.degen_gapped_alphabet.to_indices(seq), include_gap
        )

    @get_degenerate_positions.register
    def _(self, seq: bytes, include_gap=True) -> numpy.ndarray:
        return self.get_degenerate_positions(
            self.degen_gapped_alphabet.to_indices(seq), include_gap
        )

    def get_css_style(
        self,
        colors: typing.Optional[dict[str, str]] = None,
        font_size: int = 12,
        font_family="Lucida Console",
    ):
        """returns list of CSS classes and {character: <CSS class name>, ...}

        Parameters
        ----------
        colors
             A dictionary mapping characters to CSS color values. It must
             provide a value for every character, plus "ambig" and
             "terminal_ambig".
        font_size
            Font size in points.
        font_family
            Name of a monospace font.

        """
        colors = colors or self._colors
        # !important required to stop some browsers over-riding the style sheet ...!!
        template = (
            '.%s_%s{font-family: "%s",monospace !important; '
            "font-size: %dpt !important; color: %s; }"
        )
        label = self.label or ""
        defaults = _STYLE_DEFAULTS.get(label)
        if defaults is None:
            # a moltype that is not in _STYLE_DEFAULTS gets an equivalent
            # default built for its own label
            defaults = defaultdict(_DefaultValue(f"ambig_{label}"))
        styles = defaults.copy()
        styles.update(
            {c: "_".join([c, label]) for c in list(self.alphabet) + ["terminal_ambig"]}
        )

        css = [
            template % (char, label, font_family, font_size, colors[char])
            for char in list(styles) + ["ambig"]
        ]

        return css, styles
12✔
592

593

594
def _make_moltype_dict() -> dict[str, MolType]:
    """make a dictionary of local name space molecular types

    Every MolType instance bound to a module-level name at the time of the
    call is included, keyed by its name attribute.
    """
    return {
        obj.name: obj for obj in globals().values() if isinstance(obj, MolType)
    }
12✔
604

605

606
def get_moltype(name: typing.Union[str, MolType]) -> MolType:
    """returns the moltype with the matching name attribute

    Parameters
    ----------
    name
        a moltype name (case insensitive), or a MolType instance which is
        returned unchanged

    Raises
    ------
    ValueError
        if no moltype has that name
    """
    if isinstance(name, MolType):
        return name
    name = name.lower()
    if name not in _moltypes:
        raise ValueError(f"unknown moltype {name!r}")
    return _moltypes[name]
12✔
614

615

616
def available_moltypes():
    """returns Table listing the available moltypes"""
    # imported here rather than at module level
    from cogent3.util.table import Table

    rows = []
    for abbrev, moltype in _moltypes.items():
        display = str(moltype)
        num_states = len(moltype)
        if num_states > 10:
            # shorten the display of moltypes with many states
            display = f"{display[:39]}..."
        rows.append([abbrev, num_states, display])

    header = ["Abbreviation", "Number of states", "Moltype"]
    title = "Specify a moltype by the Abbreviation (case insensitive)."

    result = Table(header=header, data=rows, title=title, index_name="Abbreviation")
    result = result.sorted(columns=["Number of states", "Abbreviation"])
    result.format_column("Abbreviation", repr)
    return result
12✔
635

636

637
# constant instances of the core molecular types
ASCII = MolType(
    # A default type for text read from a file etc. when we don't
    # want to prematurely assume DNA or Protein.
    monomers="".join(ascii_letters),
    name="text",
    make_seq=new_sequence.Sequence,
)

DNA = MolType(
    monomers="".join(IUPAC_DNA_chars),
    ambiguities=IUPAC_DNA_ambiguities,
    name="dna",
    complements=IUPAC_DNA_ambiguities_complements,
    colors=NT_COLORS,
    make_seq=new_sequence.DnaSequence,
)

RNA = MolType(
    monomers="".join(IUPAC_RNA_chars),
    ambiguities=IUPAC_RNA_ambiguities,
    name="rna",
    complements=IUPAC_RNA_ambiguities_complements,
    colors=NT_COLORS,
    make_seq=new_sequence.RnaSequence,
    pairing_rules=RNA_STANDARD_PAIRS,
)

PROTEIN = MolType(
    monomers="".join(IUPAC_PROTEIN_chars),
    ambiguities=IUPAC_PROTEIN_ambiguities,
    name="protein",
    colors=AA_COLORS,
    make_seq=new_sequence.ProteinSequence,
)

PROTEIN_WITH_STOP = MolType(
    monomers="".join(PROTEIN_WITH_STOP_chars),
    ambiguities=PROTEIN_WITH_STOP_ambiguities,
    name="protein_with_stop",
    colors=AA_COLORS,
    make_seq=new_sequence.ProteinWithStopSequence,
)

BYTES = MolType(
    # A default type for arbitrary chars read from a file etc. when we don't
    # want to prematurely assume _anything_ about the data.
    monomers=bytes(bytearray(range(2**8))),
    name="bytes",
    gap=None,
    missing=None,
    make_seq=new_sequence.ByteSequence,
)

# maps a moltype label to its default CSS class names; the trailing None
# adds an entry under the empty-string label, used when a label is empty
_STYLE_DEFAULTS = {
    getattr(mt, "label", ""): defaultdict(
        _DefaultValue(f"ambig_{getattr(mt, 'label', '')}")
    )
    for mt in (ASCII, BYTES, DNA, RNA, PROTEIN, PROTEIN_WITH_STOP, None)
}

# build this at end of file, after every MolType instance has been created
_moltypes = _make_moltype_dict()
12✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc