• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Edinburgh-Genome-Foundry / DnaChisel / 12994342560

27 Jan 2025 05:03PM UTC coverage: 90.054% (-0.1%) from 90.191%
12994342560

push

github

veghp
Bump to v3.2.13

1 of 1 new or added line in 1 file covered. (100.0%)

8 existing lines in 2 files now uncovered.

2979 of 3308 relevant lines covered (90.05%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.28
/dnachisel/builtin_specifications/UniquifyAllKmers.py
1
"""Implement UniquifyAllKmers(Specification)"""
2

3
from collections import defaultdict
1✔
4

5
from ..Specification import Specification
1✔
6

7
# from .VoidSpecification import VoidSpecification
8
from ..Specification.SpecEvaluation import SpecEvaluation
1✔
9
from ..biotools import reverse_complement
1✔
10
from ..Location import Location
1✔
11

12
from functools import lru_cache
1✔
13

14

15
def get_kmer_extractor(sequence, include_reverse_complement=True, k=1):
1✔
16
    """Return a function (i => standardized_kmer_string)."""
17
    if include_reverse_complement:
1✔
18
        rev_comp_sequence = reverse_complement(sequence)
1✔
19
        L = len(sequence)
1✔
20

21
        def extract_kmer(i):
1✔
22
            subsequence = sequence[i : i + k]
1✔
23
            rev_comp = rev_comp_sequence[L - i - k : L - i]
1✔
24
            return min(subsequence, rev_comp)
1✔
25

26
    else:
27

28
        def extract_kmer(i):
×
29
            return sequence[i : i + k]
×
30

31
    return extract_kmer
1✔
32

33

34
@lru_cache(maxsize=1)
1✔
35
def get_kmer_extractor_cached(sequence, include_reverse_complement=True, k=1):
1✔
36
    """Kmer extractor with memoization.
37

38
    This globally cached method enables much faster computations when
39
    several UniquifyAllKmers functions with equal k are used.
40
    """
41
    L = len(sequence)
1✔
42
    if include_reverse_complement:
1✔
43
        rev_comp_sequence = reverse_complement(sequence)
1✔
44

45
        @lru_cache(maxsize=L)
1✔
46
        def extract_kmer(i):
1✔
47
            subsequence = sequence[i : i + k]
1✔
48
            rev_comp = rev_comp_sequence[L - i - k : L - i]
1✔
49
            return min(subsequence, rev_comp)
1✔
50

51
    else:
52

53
        @lru_cache(maxsize=L)
1✔
54
        def extract_kmer(i):
1✔
55
            return sequence[i : i + k]
1✔
56

57
    return extract_kmer
1✔
58

59

60
class UniquifyAllKmers(Specification):
1✔
61
    """Avoid sub-sequence of length k with homologies elsewhere.
62

63
    NOTE: For sequences with subsequences appearing more than 2 times, the
64
    specification may not work as a problem constraint, but will work as a
65
    problem optimization objective.
66

67
    You can define a location L and an reference L* (by default they
68
    are both the full sequence)
69

70
    >>>          [=== L ===]
71
    >>>
72
    >>>       [=========== L* ==========]
73
    >>>
74
    >>> --------- Sequence --------------------------
75

76
    This Specification class specifies that "No sub-sequence in L of length
77
    above k has more than 1 occurence in L*".
78

79
    Some specific cases
80

81
    - L = L* = Sequence. In this case the full sequence will have only unique
82
      kmers above a certain size (no self-homology).
83
    - L < L*, L* = Sequence. The segment L will have no self-homology and no
84
      homology to the rest of the sequence above a certain size. But there
85
      can be self-homologies elsewhere in the sequence.
86
    - L = L*. segment L will have no self-homology.
87

88
    Parameters
89
    ----------
90
    k
91
      Minimal length of sequences to be considered repeats
92

93
    reference
94
      The default None indicates that the specification's location should have
95
      no homologies anywhere in the whole sequence. If reference="here", then
96
      the specification's location should have no homology inside that same
97
      location. Reference can also be any location of the sequence that the
98
      specification's location should have no homologies with.
99

100
    location
101
      Segment of the sequence in which to look for repeats. If None, repeats
102
      are searched in the full sequence.
103

104
    include_reverse_complement
105
      If True, the sequence repeats are also searched for in the reverse
106
      complement of the sequence (or sub sequence if `location` is not None).
107

108
    Examples
109
    --------
110

111
    >>> from dnachisel import *
112
    >>> sequence = random_dna_sequence(50000)
113
    >>> constraint= UniquifyAllKmers(10, include_reverse_complement=True)
114
    >>> problem = DnaOptimizationProblem(sequence, constraints= [constraint])
115
    >>> print (problem.constraints_summary())
116
    """
117

118
    best_possible_score = 0
1✔
119
    use_cache = True
1✔
120
    shorthand_name = "all_unique_kmers"
1✔
121

122
    def __init__(
1✔
123
        self,
124
        k,
125
        reference=None,
126
        location=None,
127
        include_reverse_complement=True,
128
        boost=1.0,
129
        localization_data=None,
130
    ):
131
        """Initialize."""
132
        self.k = k
1✔
133
        self.location = Location.from_data(location)
1✔
134
        if reference in ["here", "same"]:
1✔
135
            reference = location
×
136
        if isinstance(reference, tuple):
1✔
137
            reference = Location.from_tuple(reference)
×
138
        self.reference = reference
1✔
139
        self.include_reverse_complement = include_reverse_complement
1✔
140
        self.boost = boost
1✔
141
        self.localization_data = localization_data
1✔
142

143
    def initialized_on_problem(self, problem, role="constraint"):
1✔
144
        """Location is the full sequence by default."""
145

146
        def location_or_default(location):
1✔
147
            default = Location(0, len(problem.sequence), 1)
1✔
148
            return default if location is None else location
1✔
149

150
        location = location_or_default(self.location)
1✔
151
        reference = location_or_default(self.reference)
1✔
152
        return self.copy_with_changes(location=location, reference=reference)
1✔
153

154
    def evaluate(self, problem):
1✔
155
        """Return 0 if the sequence has no repeats, else -number_of_repeats."""
156
        if self.localization_data is not None:
1✔
157
            return self.local_evaluation(problem)
1✔
158
        else:
159
            return self.global_evaluation(problem)
1✔
160

161
    def local_evaluation(self, problem):
1✔
162
        extract_kmer = self.get_kmer_extractor(problem.sequence)
1✔
163
        variable_kmers = {}
1✔
164
        for label in ("location", "extended"):
1✔
165
            variable_kmers[label] = d = {}
1✔
166
            for i in self.localization_data[label]["changing_indices"]:
1✔
167
                kmer = extract_kmer(i)
1✔
168
                if kmer not in d:
1✔
169
                    d[kmer] = [i]
1✔
170
                else:
171
                    d[kmer].append(i)
1✔
172

173
        nonunique_locations = []
1✔
174
        for kmer, indices in variable_kmers["location"].items():
1✔
175
            if len(indices) > 1:
1✔
176
                nonunique_locations += indices
1✔
177
        location_variable_kmers = set(variable_kmers["location"].keys())
1✔
178
        extended_variable_kmers = set(variable_kmers["extended"].keys())
1✔
179
        fixed_location_kmers = self.localization_data["location"]["fixed_kmers"]
1✔
180
        extended_fixed_kmers = self.localization_data["extended"]["fixed_kmers"]
1✔
181

182
        for c in [
1✔
183
            extended_variable_kmers,
184
            fixed_location_kmers,
185
            extended_fixed_kmers,
186
        ]:
187
            nonunique_locations += [
1✔
188
                i
189
                for kmer in location_variable_kmers.intersection(c)
190
                for i in variable_kmers["location"][kmer]
191
            ]
192

193
        for c in [location_variable_kmers, fixed_location_kmers]:
1✔
194
            nonunique_locations += [
1✔
195
                i
196
                for kmer in extended_variable_kmers.intersection(c)
197
                for i in variable_kmers["extended"][kmer]
198
            ]
199
        nonunique_locations = [Location(i, i + self.k) for i in nonunique_locations]
1✔
200
        return SpecEvaluation(
1✔
201
            self,
202
            problem,
203
            score=-len(nonunique_locations),
204
            locations=nonunique_locations,
205
            message="Failed, the following positions are the first occurences"
206
            "of local non-unique segments %s" % nonunique_locations,
207
        )
208

209
    def get_kmer_extractor(self, sequence):
1✔
210
        if self.use_cache:
1✔
211
            getter = get_kmer_extractor_cached
1✔
212
        else:
213
            getter = get_kmer_extractor
1✔
214
        return getter(
1✔
215
            sequence,
216
            k=self.k,
217
            include_reverse_complement=self.include_reverse_complement,
218
        )
219

220
    def global_evaluation(self, problem):
1✔
221
        extract_kmer = self.get_kmer_extractor(problem.sequence)
1✔
222
        kmers_locations = defaultdict(lambda: [])
1✔
223
        start, end = self.reference.start, self.reference.end
1✔
224
        for i in range(start, end - self.k + 1):
1✔
225
            location = (i, i + self.k)
1✔
226
            kmer_sequence = extract_kmer(i)
1✔
227
            kmers_locations[kmer_sequence].append(location)
1✔
228

229
        locations = sorted(
1✔
230
            [
231
                Location(start_, end_)
232
                for locations_list in kmers_locations.values()
233
                for start_, end_ in locations_list
234
                if len(locations_list) > 1
235
                and (self.location.start <= start_ < end_ <= self.location.end)
236
            ],
237
            key=lambda l: l.start,
238
        )
239

240
        if locations == []:
1✔
241
            return SpecEvaluation(
1✔
242
                self,
243
                problem,
244
                score=0,
245
                locations=[],
246
                message="Passed: no nonunique %d-mer found." % self.k,
247
            )
248
        return SpecEvaluation(
1✔
249
            self,
250
            problem,
251
            score=-len(locations),
252
            locations=locations,
253
            message="Failed, the following positions are the first occurences "
254
            "of non-unique segments %s" % locations,
255
        )
256

257
    def localized(self, location, problem=None, with_righthand=True):
1✔
258
        """Localize the evaluation."""
259

260
        if location.overlap_region(self.reference) is None:
1✔
261
            return None
1✔
262
        if problem is None:
1✔
UNCOV
263
            return self
×
264
        extract_kmer = self.get_kmer_extractor(problem.sequence)
1✔
265
        k = self.k
1✔
266
        reference = location.extended(k - 1, right=with_righthand)
1✔
267
        changing_kmers_zone = reference.overlap_region(self.reference)
1✔
268
        changing_kmer_indices = set(changing_kmers_zone.indices[: -k + 1])
1✔
269
        localization_data = {}
1✔
270
        for loc, label in [
1✔
271
            (self.location, "location"),
272
            (self.reference, "extended"),
273
        ]:
274
            kmer_indices = set(loc.indices[: -self.k])
1✔
275
            fixed_kmer_indices = kmer_indices.difference(changing_kmer_indices)
1✔
276
            fixed_kmers = set([extract_kmer(i) for i in fixed_kmer_indices])
1✔
277
            changing_inds = kmer_indices.intersection(changing_kmer_indices)
1✔
278
            localization_data[label] = {
1✔
279
                "fixed_kmers": fixed_kmers,
280
                "changing_indices": changing_inds,
281
            }
282
        localization_data["extended"]["changing_indices"].difference_update(
1✔
283
            localization_data["location"]["changing_indices"]
284
        )
285
        return self.copy_with_changes(
1✔
286
            localization_data=localization_data, location=changing_kmers_zone
287
        )
288

289
    def shifted(self, shift):
1✔
290
        """Shift the location of the specification.
291
        This will also shift the reference.
292
        """
293
        new_location = None if self.location is None else self.location + shift
1✔
294
        reference = None if self.reference is None else self.reference + shift
1✔
295
        return self.copy_with_changes(
1✔
296
            location=new_location,
297
            reference=reference,
298
            derived_from=self,
299
        )
300

301
    def label_parameters(self):
1✔
302
        return [("k", str(self.k))]
1✔
303

304
    def short_label(self):
1✔
305
        return "All %dbp unique" % self.k
1✔
306

307
    def breach_label(self):
1✔
UNCOV
308
        return "%dbp homologies" % self.k
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc