• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / workflow_tuRNAble / 6351839910

29 Sep 2023 12:52PM UTC coverage: 51.02%. First build
6351839910

push

github

web-flow
Merge pull request #1 from jlab/export_python

adding action

66 of 66 new or added lines in 2 files covered. (100.0%)

175 of 343 relevant lines covered (51.02%)

0.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.53
/scripts/turnable.py
1
import pandas as pd
1✔
2
from tqdm import tqdm
1✔
3
import skbio
1✔
4

5

6
def read_mutation_candidates(fp_denovos: str, only_pointmutations=True):
1✔
7
    """
8
    Read trio-WES observed de-novo mutations from file.
9

10
    Parameters
11
    ----------
12
    fp_denovos : str
13
        Path to a tabular file with one (de-novo) mutation per row.
14
        Columns must be:
15
        1) 'chromosome' name
16
        2) 'start' position of the mutation (leftmost affected nucleotide)
17
        3) 'reference' sub-sequence or nucleotide starting from 2)
18
        4) 'mutation' mutated version of 'reference' sub-sequence or nucleotide
19
        5) 'qc' a quality control field. Must contain 'PASS' atm (2023-09-29)
20
        6) 'sample' free text to list the affected sample IDs. Is ignored.
21
    only_pointmutations : bool
22
        Listed mutations are typically replacements of individual bases.
23
        However, insertions or deletions or multiple subsequent nucloetides can
24
        also be handled. These larger mutations are excluded by default.
25
        Set only_pointmutations=False if you want to keep them for downstream
26
        analysis.
27

28
    Returns
29
    -------
30
    pd.DataFrame. Note that column 'qc' will be dropped. Chromosome names will
31
    be prefixed by 'chr' if missing in the file.
32

33
    Raises
34
    ------
35
    a. ValueError if column 'qc' is not PASS in each row
36
    b. ValueError if 'start' < 0
37
    c. ValueError if 'reference' or 'mutation' sequence is not of DNA/RNA
38
       alphabet or gap, see variable ALPHABET
39
    d. ValueError if multiple rows describe same genomic position
40
    """
41
    hits = pd.read_csv(
1✔
42
        fp_denovos, sep="\t", header=None, dtype=str,
43
        names=['chromosome', 'start', 'reference', 'mutation', 'qc', 'sample'])
44

45
    # compute length of reference and mutation region to discriminate between
46
    # point mutations, insertions, deletions
47
    for f in ['reference', 'mutation']:
1✔
48
        hits['%s_length' % f] = hits[f].apply(len)
1✔
49

50
    if hits['qc'].unique() != ['PASS']:
1✔
51
        raise ValueError(
×
52
            'Fifth column ("qc") of your file does not contain '
53
            '"PASS" in rows %s' % ', '.join(hits[hits['qc'] != 'PASS'].index))
54
    del hits['qc']
1✔
55

56
    # preprend "chr" to chromosome names if missing
57
    hits['chromosome'] = hits['chromosome'].apply(
1✔
58
        lambda x: x if x.startswith('chr') else 'chr%s' % x)
59

60
    # only focus on point mutations
61
    if only_pointmutations:
1✔
62
        hits = hits[(hits['reference_length'] == 1) &
1✔
63
                    (hits['mutation_length'] == 1)]
64

65
    # convert 'start' to integer
66
    hits['start'] = hits['start'].astype(int)
1✔
67
    if not all(hits['start'] >= 0):
1✔
68
        raise ValueError("Not all 'start' positions are non-negative!\n%s" %
1✔
69
                         hits[hits['start'] >= 0][['chromosome', 'start']])
70

71
    ALPHABET = ['A', 'C', 'G', 'T', 'U', '-', '_']
1✔
72
    for t in ['reference', 'mutation']:
1✔
73
        nonXNA = hits[~hits[t].apply(lambda s: all(map(
1✔
74
            lambda n: n in ALPHABET,
75
            s.upper())))]
76
        if nonXNA.shape[0] > 0:
1✔
77
            raise ValueError(
1✔
78
                "You have non-DNA/RNA nucleotides in your %s column!\n%s" % (
79
                    t,
80
                    nonXNA[['chromosome', 'start', t]]))
81

82
    # assert assumption that every reference position has only one mutated from
83
    # (not necessarily true as different patients might have different
84
    # mutations)
85
    mut_per_pos = hits.groupby(['chromosome', 'start']).size()
1✔
86
    if mut_per_pos.value_counts().shape[0] != 1:
1✔
87
        msg = '\n  '.join(mut_per_pos[mut_per_pos > 1].reset_index().apply(
1✔
88
            lambda row: '%ix %s:%i' % (
89
                row[0], row['chromosome'], row['start']), axis=1).values)
90
        raise ValueError("Some of your genomic positions have more then one"
1✔
91
                         " mutation listed:\n  %s" % msg)
92

93
    return hits
1✔
94

95

96
def overlap_mutations_annotations(mutations: pd.DataFrame,
1✔
97
                                  annotations: pd.DataFrame,
98
                                  verbose: bool = True):
99
    """Subsets mutations to those overlapping any annotation
100
       (at least with one nucleotide).
101

102
    Parameters
103
    ----------
104
    mutations: pd.DataFrame
105
        A table that hold information about (de-novo) mutations per row in the
106
        format of function 'read_mutation_candidates'.
107
    annotations: pd.DataFrame
108
        A table parsed from the tabular output of cmsearch, see pyhlofiller's
109
        easel_table2pd
110
    verbose: bool
111
        report progress
112

113
    Returns
114
    -------
115
    Slice of mutations input, with novel index as one annotation can be hit by
116
    multiple mutations.
117
    """
118
    overlaps = []
1✔
119
    for chrom, gd in tqdm(mutations.groupby('chromosome'),
1✔
120
                          desc='find intersection', disable=not verbose):
121
        chr_rfam = annotations[annotations['#target name'] == chrom]
1✔
122

123
        for idx_denovo, d in gd.iterrows():
1✔
124
            h = chr_rfam[((chr_rfam['seq from'] <= d['start']) &
1✔
125
                          (d['start'] <= chr_rfam['seq to'])) |
126
                         ((chr_rfam['seq from'] >= d['start']) &
127
                          (d['start'] >= chr_rfam['seq to'])) |
128
                         ((chr_rfam['seq from'] <=
129
                           d['start']+d['reference_length']) &
130
                          (d['start']+d['reference_length'] <=
131
                           chr_rfam['seq to'])) |
132
                         ((chr_rfam['seq from'] >=
133
                           d['start']+d['reference_length']) &
134
                          (d['start']+d['reference_length'] >=
135
                           chr_rfam['seq to']))].copy()
136
            if h.shape[0] > 0:
1✔
137
                for c in d.index:
1✔
138
                    h[c] = d[c]
1✔
139
                overlaps.append(h)
1✔
140
    overlaps = pd.concat(overlaps).reset_index()
1✔
141
    del overlaps['index']
1✔
142
    return overlaps
1✔
143

144

145
def extract_reference_subsequences(fp_reference: str, intervals: pd.DataFrame,
1✔
146
                                   verbose=True):
147
    """Extract sub-sequences for a table of genomic intervals.
148

149
    Parameters
150
    ----------
151
    fp_reference : str
152
        Path to a gzipped multiple fasta file, holding the full sequences, e.g.
153
        the human reference assembly hg38.
154
    intervals : pd.DataFrame
155
        Tabular information about genomic intervals to extract. Must have at
156
        least columns 'seq from', 'seq to', 'strand' and '#target name'.
157
        The latter values must correspond to fasta header in fp_reference.
158
    verbose: bool
159
        report progress
160

161
    Returns
162
    -------
163
    pd.Series of extracted sub-sequences with same index as 'intervals'.
164
    """
165
    res = pd.Series(index=intervals.index, dtype=str)
1✔
166

167
    # for some reason reading the sequences as DNA fails:
168
    # .DNA.read(, lowercase=True), i.e. seq lengths is 1
169
    for seq in tqdm(skbio.io.read(fp_reference, format='fasta'),
1✔
170
                    desc="extract reference sub-sequences",
171
                    disable=not verbose):
172
        # genomic positions between slices in skbio: seq[start:end] and tblout
173
        # from cmsearch have an offset of -1, i.e. we obtain the correct
174
        # sub-sequence via seq[start-1:end] ...
175

176
        # ... same with Layals de-novo positions: pos = seq[pos-1]
177
        for idx, hit in intervals[intervals['#target name'] ==
1✔
178
                                  seq.metadata['id']].iterrows():
179
            start, end = hit['seq from'], hit['seq to']
1✔
180
            if hit['strand'] == '-':
1✔
181
                # reverse complement
182
                start, end = end, start
1✔
183
            subseq = skbio.sequence.DNA(seq[start-1:end], lowercase=True)
1✔
184
            if hit['strand'] == '-':
1✔
185
                subseq = subseq.reverse_complement()
1✔
186
            res.loc[idx] = str(subseq)
1✔
187

188
    odd = intervals[res.apply(len) != intervals.apply(
1✔
189
        lambda x: abs(x['seq from'] - x['seq to']) + 1, axis=1)]
190
    if odd.shape[0] > 0:
1✔
191
        raise ValueError("Length of extracted sub-sequences do not match with"
×
192
                         " positions from cmsearch for: %s" % odd)
193

194
    return res
1✔
195

196

197
def mutate_sequence(reference: str, position: int, subseq: str, mutation: str):
1✔
198
    """
199
    Parameters
200
    ----------
201
    reference : str
202
        The original sequence.
203
    position : int
204
        The left position at which a mutation occures (zero based)
205
    subseq : str
206
        The sub-string of the reference affected by the mutation.
207
    mutation : str
208
        The mutation that shall be integrated at <position> into <reference>
209

210
    Returns
211
    -------
212
    Two alignment rows (i.e. can contain gap chars '-' in both rows) of
213
    a) the reference sequence and b) a mutated version.
214
    """
215
    if not isinstance(reference, str):
1✔
216
        raise ValueError("reference is not of type str")
×
217
    if position >= len(reference):
1✔
218
        raise ValueError("position is larger than your reference")
×
219
    if position <= 0:
1✔
220
        raise ValueError("position is < 0")
×
221
    if reference[position:position+len(subseq)] != subseq:
1✔
222
        print(reference)
×
223
        print(subseq)
×
224
        print(position)
×
225
        raise ValueError("sub-string '%s' starting at position %i does not "
×
226
                         "match your provided subseq '%s'" % (
227
                            reference[position-2:position+len(subseq)+2],
228
                            position,
229
                            subseq))
230

231
    prefix = reference[:position]
1✔
232
    suffix = reference[position+len(subseq):]
1✔
233

234
    aln_orig = prefix + subseq + \
1✔
235
        ('-' * ((len(mutation) - len(subseq)))) + \
236
        suffix
237
    aln_mutation = prefix + mutation + \
1✔
238
        ('-' * ((len(subseq) - len(mutation)))) + \
239
        suffix
240
    return (aln_orig, aln_mutation)
1✔
241

242

243
def create_mutated_sequence(candidates: pd.DataFrame, verbose=True):
1✔
244
    """Obtain aligned original sequence + mutated sequence genomic intervals.
245

246
    Parameters
247
    ----------
248
    candidates : pd.DataFrame
249
        List of genomic intervals. Must at least contain columns 'seq from',
250
        'seq to', 'strand', 'start', 'reference', 'mutation'. See functions
251
            read_mutation_candidates and pyhlofiller's easel_table2pd
252
    verbose: bool
253
        report progress
254

255
    Returns
256
    -------
257
    pd.DataFrame with same index as candidates with the two columns:
258
    'aln_reference' and 'aln_mutation'.
259
    """
260
    res = pd.DataFrame(index=candidates.index,
1✔
261
                       columns=['aln_reference', 'aln_mutation'],
262
                       dtype=str)
263
    for idx, row in tqdm(candidates.iterrows(),
1✔
264
                         desc="generate mutated sequences",
265
                         disable=not verbose):
266
        left = row['seq from']
1✔
267
        seq = skbio.sequence.DNA(row['reference_sequence'], lowercase=True)
1✔
268
        if row['strand'] == '-':
1✔
269
            left = row['seq to']
1✔
270
            seq = seq.reverse_complement()
1✔
271
        aln_seq, aln_mut = mutate_sequence(str(seq),
1✔
272
                                           row['start'] - left,
273
                                           row['reference'],
274
                                           row['mutation'])
275
        if row['strand'] == '-':
1✔
276
            aln_seq = str(skbio.sequence.DNA(aln_seq).reverse_complement())
1✔
277
            aln_mut = str(skbio.sequence.DNA(aln_mut).reverse_complement())
1✔
278

279
        res.loc[idx, 'aln_reference'] = aln_seq
1✔
280
        res.loc[idx, 'aln_mutation'] = aln_mut
1✔
281

282
    return res
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc