• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Edinburgh-Genome-Foundry / DnaChisel / 11713171410

06 Nov 2024 10:48PM UTC coverage: 90.13% (+0.2%) from 89.915%
11713171410

push

github

web-flow
Merge pull request #90 from Edinburgh-Genome-Foundry/zulko/fix-missing-bio

Zulko/fix missing bio

2 of 2 new or added lines in 1 file covered. (100.0%)

8 existing lines in 1 file now uncovered.

2977 of 3303 relevant lines covered (90.13%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.62
/dnachisel/SequencePattern/MotifPssmPattern.py
1
from Bio.Seq import Seq
1✔
2
from Bio import motifs
1✔
3
from .SequencePattern import SequencePattern
1✔
4
import numpy as np
1✔
5

6

7
class MotifPssmPattern(SequencePattern):
1✔
8
    """Motif pattern represented by a Position-Specific Scoring Matrix (PSSM).
9

10
    This class is better instantiated by creating a pattern from a set of
11
    sequence strings with ``MotifPssmPattern.from_sequences()``,  or reading
12
    pattern(s) for a file with ``MotifPssmPattern.from_file()``.
13

14
    Parameters
15
    ----------
16

17
    pssm
18
      A `Bio.motifs.Motif` object.
19

20
    threshold
21
     locations of the sequence with a PSSM score above this value will be
22
     considered matches. For convenience, a relative_threshold can be
23
     given instead.
24

25
    relative_threshold
26
      Value between 0 and 1 from which the threshold will be auto-computed.
27
      0 means "match everything", 1 means "only match the one (or several)
28
      sequence(s) with the absolute highest possible score".
29
    """
30

31
    def __init__(self, pssm, threshold=None, relative_threshold=None):
1✔
32
        if not isinstance(pssm, motifs.Motif):
1✔
UNCOV
33
            raise ValueError(
×
34
                f"Expected PSSM type of `Bio.motifs.Motif`, but {type(pssm)} was passed"
35
            )
36
        self.name = pssm.name
1✔
37
        self.pssm = pssm.pssm
1✔
38
        if relative_threshold is not None:
1✔
39
            mini, maxi = self.pssm.min, self.pssm.max
1✔
40
            threshold = mini + relative_threshold * (maxi - mini)
1✔
41
        self.threshold = threshold
1✔
42
        self.relative_threshold = relative_threshold
1✔
43
        self.size = pssm.length
1✔
44
        self.pssm_matrix = np.array([self.pssm[n] for n in "ATGC"])
1✔
45
        self.is_palyndromic = False
1✔
46

47
    @classmethod
1✔
48
    def apply_pseudocounts(cls, motif, pseudocounts):
1✔
49
        """Add pseudocounts to the motif's pssm matrix.
50

51
        Add nothing if pseudocounts is None, auto-compute pseudocounts if
52
        pseudocounts="jaspar", else just attribute the pseudocounts as-is
53
        to the motif.
54
        """
55
        if pseudocounts is not None:
1✔
56
            if pseudocounts == "jaspar":
1✔
57
                pseudocounts = motifs.jaspar.calculate_pseudocounts(motif)
1✔
58
            motif.pseudocounts = pseudocounts
1✔
59

60
    def find_matches_in_string(self, sequence):
1✔
61
        """Find all positions where the PSSM score is above threshold."""
62

63
        # NOTE: Before, I made my PSSM searches with Biopython. It was looong!
64
        # Now I use Numpy and np.choice(), and I never looked back
65
        # sequence = Seq(sequence, alphabet=alphabet)
66
        # search = self.pssm.search(
67
        #     sequence, threshold=self.threshold, both=False
68
        # )
69
        indices = find_pssm_matches_with_numpy(
1✔
70
            pssm_matrix=self.pssm_matrix, sequence=sequence, threshold=self.threshold
71
        )
72
        return [(i, i + self.size, 1) for i in indices]
1✔
73

74
    @classmethod
1✔
75
    def from_sequences(
1✔
76
        cls,
77
        sequences,
78
        name="unnamed",
79
        pseudocounts="jaspar",
80
        threshold=None,
81
        relative_threshold=None,
82
    ):
83
        """Return a PSSM pattern computed from same-length sequences.
84

85
        Parameters
86
        ----------
87

88
        sequences
89
          A list of same-length sequences.
90

91
        name
92
          Name to give to the pattern (will appear in reports etc.).
93

94
        pseudocounts
95
          Either a dict {"A": 0.01, "T": ...} or "jaspar" for automatic
96
          pseudocounts from the Biopython.motifs.jaspar module (recommended),
97
          or None for no pseudocounts at all (not recommended!).
98

99
        threshold
100
          locations of the sequence with a PSSM score above this value will be
101
          considered matches. For convenience, a relative_threshold can be
102
          given instead.
103

104
        relative_threshold
105
          Value between 0 and 1 from which the threshold will be auto-computed.
106
          0 means "match everything", 1 means "only match the one (or several)
107
          sequence(s) with the absolute highest possible score".
108
        """
109
        sequences = [Seq(s) for s in sequences]
1✔
110
        motif = motifs.create(sequences)
1✔
111
        cls.apply_pseudocounts(motif, pseudocounts)
1✔
112
        motif.name = name
1✔
113
        pssm = motif
1✔
114
        return MotifPssmPattern(
1✔
115
            pssm=pssm, threshold=threshold, relative_threshold=relative_threshold
116
        )
117

118
    @classmethod
1✔
119
    def list_from_file(
1✔
120
        cls,
121
        motifs_file,
122
        file_format,
123
        threshold=None,
124
        pseudocounts="jaspar",
125
        relative_threshold=None,
126
    ):
127
        """Return a list of PSSM patterns from a file in JASPAR, MEME, etc.
128

129
        Parameters
130
        ----------
131

132
        motifs_file
133
          Path to a motifs file, or file handle.
134

135
        file_format
136
          File format. one of "jaspar", "meme", "TRANSFAC".
137

138
        pseudocounts
139
          Either a dict {"A": 0.01, "T": ...} or "jaspar" for automatic
140
          pseudocounts from the Biopython.motifs.jaspar module (recommended),
141
          or None for no pseudocounts at all (not recommended!).
142

143
        threshold
144
          locations of the sequence with a PSSM score above this value will be
145
          considered matches. For convenience, a relative_threshold can be
146
          given instead.
147

148
        relative_threshold
149
          Value between 0 and 1 from which the threshold will be auto-computed.
150
          0 means "match everything", 1 means "only match the one (or several)
151
          sequence(s) with the absolute highest possible score".
152
        """
153
        if isinstance(motifs_file, str):
1✔
154
            with open(motifs_file, "r") as f:
1✔
155
                motifs_list = motifs.parse(f, file_format)
1✔
156
        else:
157
            motifs_list = motifs.parse(motifs_file, file_format)
1✔
158
        if pseudocounts is not None:
1✔
159
            for motif in motifs_list:
1✔
160
                cls.apply_pseudocounts(motif, pseudocounts)
1✔
161

162
        return [
1✔
163
            MotifPssmPattern(
164
                pssm, threshold=threshold, relative_threshold=relative_threshold
165
            )
166
            for pssm in motifs_list
167
        ]
168

169
    def __str__(self):
1✔
170
        if self.relative_threshold is not None:
1✔
171
            threshold = "%d%%" % (100 * self.relative_threshold)
1✔
172
        else:
173
            threshold = "%.2f" % self.threshold
×
174
        return "%s-PSSM(%s+)" % (self.name, threshold)
1✔
175

176
    def __repr__(self):
1✔
177
        if self.relative_threshold is not None:
×
UNCOV
178
            threshold = "%d%%" % (100 * self.relative_threshold)
×
179
        else:
UNCOV
180
            threshold = "%.2f" % self.threshold
×
UNCOV
181
        return "%s-PSSM(%s+)" % (self.name, threshold)
×
182

183

184
def find_pssm_matches_with_numpy(pssm_matrix, sequence, threshold):
1✔
185
    """Return every index in the +1 strand wit a PSSM score above threshold.
186

187
    My numpy-based implementation is 10 times faster than Biopython for some
188
    reason. Weird. Can someone else check?
189

190
    Parameters:
191
    -----------
192

193
    pssm
194
      A matrix whose rows give the frequency motif of ATGC (in this order).
195

196
    sequence
197
      A string representing a DNA sequence.
198

199
    threshold
200
      Every index with a score above this threshold will be returned.
201
    """
202
    nucleotide_to_index = dict(zip("ATGC", range(4)))
1✔
203
    len_pattern = len(pssm_matrix[0])
1✔
204

205
    # If sequence is small, use normal python to avoid numpy overhead
206

207
    if len(sequence) < 60:
1✔
208
        nucl_indices = [nucleotide_to_index[n] for n in sequence]
1✔
209
        return [
1✔
210
            i
211
            for i in range(len(sequence) - len_pattern)
212
            if np.choose(nucl_indices[i : len_pattern + i], pssm_matrix).sum()
213
            >= threshold
214
        ]
215

216
    # If sequence is large, use Numpy for speed. tested experimentally
217

UNCOV
218
    nucl_indices = np.array([nucleotide_to_index[n] for n in sequence], dtype="uint8")
×
UNCOV
219
    len_pattern = len(pssm_matrix[0])
×
UNCOV
220
    scores = np.array(
×
221
        [
222
            np.choose(nucl_indices[k : len_pattern + k], pssm_matrix).sum()
223
            for k in range(len(sequence) - len_pattern)
224
        ]
225
    )
UNCOV
226
    return np.nonzero(scores >= threshold)[0]
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc