6351839910

Committed 29 Sep 2023 12:52PM UTC coverage: 51.02%. First build

Build # 6351839910

Build Type

push

github

Committed by

web-flow

Commit Message

Merge pull request #1 from jlab/export_python

adding action

Run Details

66 of 66 new or added lines in 2 files covered. (100.0%)

175 of 343 relevant lines covered (51.02%)

0.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.53

/scripts/turnable.py

import pandas as pd
from tqdm import tqdm
import skbio


def read_mutation_candidates(fp_denovos: str, only_pointmutations=True):
    """
    Read trio-WES observed de-novo mutations from file.

    Parameters
    ----------
    fp_denovos : str
        Path to a tabular file with one (de-novo) mutation per row.
        Columns must be:
        1) 'chromosome' name
        2) 'start' position of the mutation (leftmost affected nucleotide)
        3) 'reference' sub-sequence or nucleotide starting from 2)
        4) 'mutation' mutated version of 'reference' sub-sequence or nucleotide
        5) 'qc' a quality control field. Must contain 'PASS' atm (2023-09-29)
        6) 'sample' free text to list the affected sample IDs. Is ignored.
    only_pointmutations : bool
        Listed mutations are typically replacements of individual bases.
        However, insertions or deletions or multiple subsequent nucloetides can
        also be handled. These larger mutations are excluded by default.
        Set only_pointmutations=False if you want to keep them for downstream
        analysis.

    Returns
    -------
    pd.DataFrame. Note that column 'qc' will be dropped. Chromosome names will
    be prefixed by 'chr' if missing in the file.

    Raises
    ------
    a. ValueError if column 'qc' is not PASS in each row
    b. ValueError if 'start' < 0
    c. ValueError if 'reference' or 'mutation' sequence is not of DNA/RNA
       alphabet or gap, see variable ALPHABET
    d. ValueError if multiple rows describe same genomic position
    """
    hits = pd.read_csv(
        fp_denovos, sep="\t", header=None, dtype=str,
        names=['chromosome', 'start', 'reference', 'mutation', 'qc', 'sample'])

    # compute length of reference and mutation region to discriminate between
    # point mutations, insertions, deletions
    for f in ['reference', 'mutation']:
        hits['%s_length' % f] = hits[f].apply(len)

    if hits['qc'].unique() != ['PASS']:
        raise ValueError(
            'Fifth column ("qc") of your file does not contain '
            '"PASS" in rows %s' % ', '.join(hits[hits['qc'] != 'PASS'].index))
    del hits['qc']

    # preprend "chr" to chromosome names if missing
    hits['chromosome'] = hits['chromosome'].apply(
        lambda x: x if x.startswith('chr') else 'chr%s' % x)

    # only focus on point mutations
    if only_pointmutations:
        hits = hits[(hits['reference_length'] == 1) &
                    (hits['mutation_length'] == 1)]

    # convert 'start' to integer
    hits['start'] = hits['start'].astype(int)
    if not all(hits['start'] >= 0):
        raise ValueError("Not all 'start' positions are non-negative!\n%s" %
                         hits[hits['start'] >= 0][['chromosome', 'start']])

    ALPHABET = ['A', 'C', 'G', 'T', 'U', '-', '_']
    for t in ['reference', 'mutation']:
        nonXNA = hits[~hits[t].apply(lambda s: all(map(
            lambda n: n in ALPHABET,
            s.upper())))]
        if nonXNA.shape[0] > 0:
            raise ValueError(
                "You have non-DNA/RNA nucleotides in your %s column!\n%s" % (
                    t,
                    nonXNA[['chromosome', 'start', t]]))

    # assert assumption that every reference position has only one mutated from
    # (not necessarily true as different patients might have different
    # mutations)
    mut_per_pos = hits.groupby(['chromosome', 'start']).size()
    if mut_per_pos.value_counts().shape[0] != 1:
        msg = '\n  '.join(mut_per_pos[mut_per_pos > 1].reset_index().apply(
            lambda row: '%ix %s:%i' % (
                row[0], row['chromosome'], row['start']), axis=1).values)
        raise ValueError("Some of your genomic positions have more then one"
                         " mutation listed:\n  %s" % msg)

    return hits


def overlap_mutations_annotations(mutations: pd.DataFrame,
                                  annotations: pd.DataFrame,
                                  verbose: bool = True):
    """Subsets mutations to those overlapping any annotation
       (at least with one nucleotide).

    Parameters
    ----------
    mutations: pd.DataFrame
        A table that hold information about (de-novo) mutations per row in the
        format of function 'read_mutation_candidates'.
    annotations: pd.DataFrame
        A table parsed from the tabular output of cmsearch, see pyhlofiller's
        easel_table2pd
    verbose: bool
        report progress

    Returns
    -------
    Slice of mutations input, with novel index as one annotation can be hit by
    multiple mutations.
    """
    overlaps = []
    for chrom, gd in tqdm(mutations.groupby('chromosome'),
                          desc='find intersection', disable=not verbose):
        chr_rfam = annotations[annotations['#target name'] == chrom]

        for idx_denovo, d in gd.iterrows():
            h = chr_rfam[((chr_rfam['seq from'] <= d['start']) &
                          (d['start'] <= chr_rfam['seq to'])) |
                         ((chr_rfam['seq from'] >= d['start']) &
                          (d['start'] >= chr_rfam['seq to'])) |
                         ((chr_rfam['seq from'] <=
                           d['start']+d['reference_length']) &
                          (d['start']+d['reference_length'] <=
                           chr_rfam['seq to'])) |
                         ((chr_rfam['seq from'] >=
                           d['start']+d['reference_length']) &
                          (d['start']+d['reference_length'] >=
                           chr_rfam['seq to']))].copy()
            if h.shape[0] > 0:
                for c in d.index:
                    h[c] = d[c]
                overlaps.append(h)
    overlaps = pd.concat(overlaps).reset_index()
    del overlaps['index']
    return overlaps


def extract_reference_subsequences(fp_reference: str, intervals: pd.DataFrame,
                                   verbose=True):
    """Extract sub-sequences for a table of genomic intervals.

    Parameters
    ----------
    fp_reference : str
        Path to a gzipped multiple fasta file, holding the full sequences, e.g.
        the human reference assembly hg38.
    intervals : pd.DataFrame
        Tabular information about genomic intervals to extract. Must have at
        least columns 'seq from', 'seq to', 'strand' and '#target name'.
        The latter values must correspond to fasta header in fp_reference.
    verbose: bool
        report progress

    Returns
    -------
    pd.Series of extracted sub-sequences with same index as 'intervals'.
    """
    res = pd.Series(index=intervals.index, dtype=str)

    # for some reason reading the sequences as DNA fails:
    # .DNA.read(, lowercase=True), i.e. seq lengths is 1
    for seq in tqdm(skbio.io.read(fp_reference, format='fasta'),
                    desc="extract reference sub-sequences",
                    disable=not verbose):
        # genomic positions between slices in skbio: seq[start:end] and tblout
        # from cmsearch have an offset of -1, i.e. we obtain the correct
        # sub-sequence via seq[start-1:end] ...

        # ... same with Layals de-novo positions: pos = seq[pos-1]
        for idx, hit in intervals[intervals['#target name'] ==
                                  seq.metadata['id']].iterrows():
            start, end = hit['seq from'], hit['seq to']
            if hit['strand'] == '-':
                # reverse complement
                start, end = end, start
            subseq = skbio.sequence.DNA(seq[start-1:end], lowercase=True)
            if hit['strand'] == '-':
                subseq = subseq.reverse_complement()
            res.loc[idx] = str(subseq)

    odd = intervals[res.apply(len) != intervals.apply(
        lambda x: abs(x['seq from'] - x['seq to']) + 1, axis=1)]
    if odd.shape[0] > 0:
        raise ValueError("Length of extracted sub-sequences do not match with"
                         " positions from cmsearch for: %s" % odd)

    return res


def mutate_sequence(reference: str, position: int, subseq: str, mutation: str):
    """
    Parameters
    ----------
    reference : str
        The original sequence.
    position : int
        The left position at which a mutation occures (zero based)
    subseq : str
        The sub-string of the reference affected by the mutation.
    mutation : str
        The mutation that shall be integrated at <position> into <reference>

    Returns
    -------
    Two alignment rows (i.e. can contain gap chars '-' in both rows) of
    a) the reference sequence and b) a mutated version.
    """
    if not isinstance(reference, str):
        raise ValueError("reference is not of type str")
    if position >= len(reference):
        raise ValueError("position is larger than your reference")
    if position <= 0:
        raise ValueError("position is < 0")
    if reference[position:position+len(subseq)] != subseq:
        print(reference)
        print(subseq)
        print(position)
        raise ValueError("sub-string '%s' starting at position %i does not "
                         "match your provided subseq '%s'" % (
                            reference[position-2:position+len(subseq)+2],
                            position,
                            subseq))

    prefix = reference[:position]
    suffix = reference[position+len(subseq):]

    aln_orig = prefix + subseq + \
        ('-' * ((len(mutation) - len(subseq)))) + \
        suffix
    aln_mutation = prefix + mutation + \
        ('-' * ((len(subseq) - len(mutation)))) + \
        suffix
    return (aln_orig, aln_mutation)


def create_mutated_sequence(candidates: pd.DataFrame, verbose=True):
    """Obtain aligned original sequence + mutated sequence genomic intervals.

    Parameters
    ----------
    candidates : pd.DataFrame
        List of genomic intervals. Must at least contain columns 'seq from',
        'seq to', 'strand', 'start', 'reference', 'mutation'. See functions
            read_mutation_candidates and pyhlofiller's easel_table2pd
    verbose: bool
        report progress

    Returns
    -------
    pd.DataFrame with same index as candidates with the two columns:
    'aln_reference' and 'aln_mutation'.
    """
    res = pd.DataFrame(index=candidates.index,
                       columns=['aln_reference', 'aln_mutation'],
                       dtype=str)
    for idx, row in tqdm(candidates.iterrows(),
                         desc="generate mutated sequences",
                         disable=not verbose):
        left = row['seq from']
        seq = skbio.sequence.DNA(row['reference_sequence'], lowercase=True)
        if row['strand'] == '-':
            left = row['seq to']
            seq = seq.reverse_complement()
        aln_seq, aln_mut = mutate_sequence(str(seq),
                                           row['start'] - left,
                                           row['reference'],
                                           row['mutation'])
        if row['strand'] == '-':
            aln_seq = str(skbio.sequence.DNA(aln_seq).reverse_complement())
            aln_mut = str(skbio.sequence.DNA(aln_mut).reverse_complement())

        res.loc[idx, 'aln_reference'] = aln_seq
        res.loc[idx, 'aln_mutation'] = aln_mut

    return res

1	import pandas as pd	1✔
2	from tqdm import tqdm	1✔
3	import skbio	1✔
4
5
6	def read_mutation_candidates(fp_denovos: str, only_pointmutations=True):	1✔
7	"""
8	Read trio-WES observed de-novo mutations from file.
9
10	Parameters
11	----------
12	fp_denovos : str
13	Path to a tabular file with one (de-novo) mutation per row.
14	Columns must be:
15	1) 'chromosome' name
16	2) 'start' position of the mutation (leftmost affected nucleotide)
17	3) 'reference' sub-sequence or nucleotide starting from 2)
18	4) 'mutation' mutated version of 'reference' sub-sequence or nucleotide
19	5) 'qc' a quality control field. Must contain 'PASS' atm (2023-09-29)
20	6) 'sample' free text to list the affected sample IDs. Is ignored.
21	only_pointmutations : bool
22	Listed mutations are typically replacements of individual bases.
23	However, insertions or deletions or multiple subsequent nucloetides can
24	also be handled. These larger mutations are excluded by default.
25	Set only_pointmutations=False if you want to keep them for downstream
26	analysis.
27
28	Returns
29	-------
30	pd.DataFrame. Note that column 'qc' will be dropped. Chromosome names will
31	be prefixed by 'chr' if missing in the file.
32
33	Raises
34	------
35	a. ValueError if column 'qc' is not PASS in each row
36	b. ValueError if 'start' < 0
37	c. ValueError if 'reference' or 'mutation' sequence is not of DNA/RNA
38	alphabet or gap, see variable ALPHABET
39	d. ValueError if multiple rows describe same genomic position
40	"""
41	hits = pd.read_csv(	1✔
42	fp_denovos, sep="\t", header=None, dtype=str,
43	names=['chromosome', 'start', 'reference', 'mutation', 'qc', 'sample'])
44
45	# compute length of reference and mutation region to discriminate between
46	# point mutations, insertions, deletions
47	for f in ['reference', 'mutation']:	1✔
48	hits['%s_length' % f] = hits[f].apply(len)	1✔
49
50	if hits['qc'].unique() != ['PASS']:	1✔
51	raise ValueError(	×
52	'Fifth column ("qc") of your file does not contain '
53	'"PASS" in rows %s' % ', '.join(hits[hits['qc'] != 'PASS'].index))
54	del hits['qc']	1✔
55
56	# preprend "chr" to chromosome names if missing
57	hits['chromosome'] = hits['chromosome'].apply(	1✔
58	lambda x: x if x.startswith('chr') else 'chr%s' % x)
59
60	# only focus on point mutations
61	if only_pointmutations:	1✔
62	hits = hits[(hits['reference_length'] == 1) &	1✔
63	(hits['mutation_length'] == 1)]
64
65	# convert 'start' to integer
66	hits['start'] = hits['start'].astype(int)	1✔
67	if not all(hits['start'] >= 0):	1✔
68	raise ValueError("Not all 'start' positions are non-negative!\n%s" %	1✔
69	hits[hits['start'] >= 0][['chromosome', 'start']])
70
71	ALPHABET = ['A', 'C', 'G', 'T', 'U', '-', '_']	1✔
72	for t in ['reference', 'mutation']:	1✔
73	nonXNA = hits[~hits[t].apply(lambda s: all(map(	1✔
74	lambda n: n in ALPHABET,
75	s.upper())))]
76	if nonXNA.shape[0] > 0:	1✔
77	raise ValueError(	1✔
78	"You have non-DNA/RNA nucleotides in your %s column!\n%s" % (
79	t,
80	nonXNA[['chromosome', 'start', t]]))
81
82	# assert assumption that every reference position has only one mutated from
83	# (not necessarily true as different patients might have different
84	# mutations)
85	mut_per_pos = hits.groupby(['chromosome', 'start']).size()	1✔
86	if mut_per_pos.value_counts().shape[0] != 1:	1✔
87	msg = '\n '.join(mut_per_pos[mut_per_pos > 1].reset_index().apply(	1✔
88	lambda row: '%ix %s:%i' % (
89	row[0], row['chromosome'], row['start']), axis=1).values)
90	raise ValueError("Some of your genomic positions have more then one"	1✔
91	" mutation listed:\n %s" % msg)
92
93	return hits	1✔
94
95
96	def overlap_mutations_annotations(mutations: pd.DataFrame,	1✔
97	annotations: pd.DataFrame,
98	verbose: bool = True):
99	"""Subsets mutations to those overlapping any annotation
100	(at least with one nucleotide).
101
102	Parameters
103	----------
104	mutations: pd.DataFrame
105	A table that hold information about (de-novo) mutations per row in the
106	format of function 'read_mutation_candidates'.
107	annotations: pd.DataFrame
108	A table parsed from the tabular output of cmsearch, see pyhlofiller's
109	easel_table2pd
110	verbose: bool
111	report progress
112
113	Returns
114	-------
115	Slice of mutations input, with novel index as one annotation can be hit by
116	multiple mutations.
117	"""
118	overlaps = []	1✔
119	for chrom, gd in tqdm(mutations.groupby('chromosome'),	1✔
120	desc='find intersection', disable=not verbose):
121	chr_rfam = annotations[annotations['#target name'] == chrom]	1✔
122
123	for idx_denovo, d in gd.iterrows():	1✔
124	h = chr_rfam[((chr_rfam['seq from'] <= d['start']) &	1✔
125	(d['start'] <= chr_rfam['seq to'])) \|
126	((chr_rfam['seq from'] >= d['start']) &
127	(d['start'] >= chr_rfam['seq to'])) \|
128	((chr_rfam['seq from'] <=
129	d['start']+d['reference_length']) &
130	(d['start']+d['reference_length'] <=
131	chr_rfam['seq to'])) \|
132	((chr_rfam['seq from'] >=
133	d['start']+d['reference_length']) &
134	(d['start']+d['reference_length'] >=
135	chr_rfam['seq to']))].copy()
136	if h.shape[0] > 0:	1✔
137	for c in d.index:	1✔
138	h[c] = d[c]	1✔
139	overlaps.append(h)	1✔
140	overlaps = pd.concat(overlaps).reset_index()	1✔
141	del overlaps['index']	1✔
142	return overlaps	1✔
143
144
145	def extract_reference_subsequences(fp_reference: str, intervals: pd.DataFrame,	1✔
146	verbose=True):
147	"""Extract sub-sequences for a table of genomic intervals.
148
149	Parameters
150	----------
151	fp_reference : str
152	Path to a gzipped multiple fasta file, holding the full sequences, e.g.
153	the human reference assembly hg38.
154	intervals : pd.DataFrame
155	Tabular information about genomic intervals to extract. Must have at
156	least columns 'seq from', 'seq to', 'strand' and '#target name'.
157	The latter values must correspond to fasta header in fp_reference.
158	verbose: bool
159	report progress
160
161	Returns
162	-------
163	pd.Series of extracted sub-sequences with same index as 'intervals'.
164	"""
165	res = pd.Series(index=intervals.index, dtype=str)	1✔
166
167	# for some reason reading the sequences as DNA fails:
168	# .DNA.read(, lowercase=True), i.e. seq lengths is 1
169	for seq in tqdm(skbio.io.read(fp_reference, format='fasta'),	1✔
170	desc="extract reference sub-sequences",
171	disable=not verbose):
172	# genomic positions between slices in skbio: seq[start:end] and tblout
173	# from cmsearch have an offset of -1, i.e. we obtain the correct
174	# sub-sequence via seq[start-1:end] ...
175
176	# ... same with Layals de-novo positions: pos = seq[pos-1]
177	for idx, hit in intervals[intervals['#target name'] ==	1✔
178	seq.metadata['id']].iterrows():
179	start, end = hit['seq from'], hit['seq to']	1✔
180	if hit['strand'] == '-':	1✔
181	# reverse complement
182	start, end = end, start	1✔
183	subseq = skbio.sequence.DNA(seq[start-1:end], lowercase=True)	1✔
184	if hit['strand'] == '-':	1✔
185	subseq = subseq.reverse_complement()	1✔
186	res.loc[idx] = str(subseq)	1✔
187
188	odd = intervals[res.apply(len) != intervals.apply(	1✔
189	lambda x: abs(x['seq from'] - x['seq to']) + 1, axis=1)]
190	if odd.shape[0] > 0:	1✔
191	raise ValueError("Length of extracted sub-sequences do not match with"	×
192	" positions from cmsearch for: %s" % odd)
193
194	return res	1✔
195
196
197	def mutate_sequence(reference: str, position: int, subseq: str, mutation: str):	1✔
198	"""
199	Parameters
200	----------
201	reference : str
202	The original sequence.
203	position : int
204	The left position at which a mutation occures (zero based)
205	subseq : str
206	The sub-string of the reference affected by the mutation.
207	mutation : str
208	The mutation that shall be integrated at <position> into <reference>
209
210	Returns
211	-------
212	Two alignment rows (i.e. can contain gap chars '-' in both rows) of
213	a) the reference sequence and b) a mutated version.
214	"""
215	if not isinstance(reference, str):	1✔
216	raise ValueError("reference is not of type str")	×
217	if position >= len(reference):	1✔
218	raise ValueError("position is larger than your reference")	×
219	if position <= 0:	1✔
220	raise ValueError("position is < 0")	×
221	if reference[position:position+len(subseq)] != subseq:	1✔
222	print(reference)	×
223	print(subseq)	×
224	print(position)	×
225	raise ValueError("sub-string '%s' starting at position %i does not "	×
226	"match your provided subseq '%s'" % (
227	reference[position-2:position+len(subseq)+2],
228	position,
229	subseq))
230
231	prefix = reference[:position]	1✔
232	suffix = reference[position+len(subseq):]	1✔
233
234	aln_orig = prefix + subseq + \	1✔
235	('-' * ((len(mutation) - len(subseq)))) + \
236	suffix
237	aln_mutation = prefix + mutation + \	1✔
238	('-' * ((len(subseq) - len(mutation)))) + \
239	suffix
240	return (aln_orig, aln_mutation)	1✔
241
242
243	def create_mutated_sequence(candidates: pd.DataFrame, verbose=True):	1✔
244	"""Obtain aligned original sequence + mutated sequence genomic intervals.
245
246	Parameters
247	----------
248	candidates : pd.DataFrame
249	List of genomic intervals. Must at least contain columns 'seq from',
250	'seq to', 'strand', 'start', 'reference', 'mutation'. See functions
251	read_mutation_candidates and pyhlofiller's easel_table2pd
252	verbose: bool
253	report progress
254
255	Returns
256	-------
257	pd.DataFrame with same index as candidates with the two columns:
258	'aln_reference' and 'aln_mutation'.
259	"""
260	res = pd.DataFrame(index=candidates.index,	1✔
261	columns=['aln_reference', 'aln_mutation'],
262	dtype=str)
263	for idx, row in tqdm(candidates.iterrows(),	1✔
264	desc="generate mutated sequences",
265	disable=not verbose):
266	left = row['seq from']	1✔
267	seq = skbio.sequence.DNA(row['reference_sequence'], lowercase=True)	1✔
268	if row['strand'] == '-':	1✔
269	left = row['seq to']	1✔
270	seq = seq.reverse_complement()	1✔
271	aln_seq, aln_mut = mutate_sequence(str(seq),	1✔
272	row['start'] - left,
273	row['reference'],
274	row['mutation'])
275	if row['strand'] == '-':	1✔
276	aln_seq = str(skbio.sequence.DNA(aln_seq).reverse_complement())	1✔
277	aln_mut = str(skbio.sequence.DNA(aln_mut).reverse_complement())	1✔
278
279	res.loc[idx, 'aln_reference'] = aln_seq	1✔
280	res.loc[idx, 'aln_mutation'] = aln_mut	1✔
281
282	return res	1✔

jlab / workflow_tuRNAble / 6351839910

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous