7894856040

Committed 14 Feb 2024 12:52AM UTC coverage: 46.404% (-5.7%) from 52.085%

Build # 7894856040

Build Type

push

github

Committed by

web-flow

Commit Message

Merge pull request #38 from js51/SplitP-rewrite

Re-organise modules

Run Details

403 of 880 new or added lines in 12 files covered. (45.8%)

413 of 890 relevant lines covered (46.4%)

1.39 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.13

/splitp/constructions.py

import numpy as np
from splitp.enums import FlatFormat
from scipy.sparse import dok_matrix, coo_matrix
import splitp.constants as constants


def flattening(
    split,
    pattern_probabilities,
    flattening_format=FlatFormat.sparse
):
    """
    Compute the flattening of a split given a pattern probability dictionary.

    Args:
        split (str or list): The split to compute the flattening of. A string
            is divided on "|"; otherwise the first two elements are used as
            the two sides of the split.
        pattern_probabilities (dict): A dictionary of pattern probabilities.
        flattening_format (FlatFormat): The format to return the flattening in.

    Returns:
        The flattening of the split in the specified format.

    Raises:
        ValueError: If ``flattening_format`` is neither ``FlatFormat.sparse``
            nor ``FlatFormat.reduced``.
    """
    if isinstance(split, str):
        split = split.split("|")
    # The taxa are the sorted union of the labels found on both sides.
    taxa = sorted(set(split[0]) | set(split[1]))
    if flattening_format is FlatFormat.sparse:
        return __sparse_flattening(split, pattern_probabilities, taxa)
    if flattening_format is FlatFormat.reduced:
        return __reduced_flattening(split, pattern_probabilities, taxa)
    # Fail loudly rather than fall through and hand the caller ``None``.
    raise ValueError(
        f"Unsupported flattening format: {flattening_format!r}"
    )


def __reduced_flattening(split, pattern_probabilities, taxa):
    """
    Build a dense flattening that keeps only the rows and columns in use.

    Every pattern in ``pattern_probabilities`` is mapped to a (row, column)
    pair via ``__index_of``. Only the row and column indices that actually
    occur are kept, and both axes are laid out in increasing index order.

    Args:
        split (str or list): The split; a string is divided on "|".
        pattern_probabilities (dict): Maps each pattern to its probability.
        taxa (list): Taxon labels; a label's position in this list is the
            position read from each pattern.

    Returns:
        numpy.ndarray: Array with one row per distinct row index and one
        column per distinct column index; cells never assigned stay 0.
    """
    if isinstance(split, str):
        split = split.split("|")
    position_of = {taxon: i for i, taxon in enumerate(taxa)}

    cells_by_row = {}
    seen_cols = set()
    for pattern, probability in pattern_probabilities.items():
        row = __index_of("".join([str(pattern[position_of[s]]) for s in split[0]]))
        col = __index_of("".join([str(pattern[position_of[s]]) for s in split[1]]))
        seen_cols.add(col)
        cells_by_row.setdefault(row, {})[col] = probability

    # Rank of each used column index among all used column indices.
    col_rank = {col: rank for rank, col in enumerate(sorted(seen_cols))}

    dense = np.zeros((len(cells_by_row), len(seen_cols)))
    for row_rank, row in enumerate(sorted(cells_by_row)):
        for col, probability in cells_by_row[row].items():
            dense[row_rank, col_rank[col]] = probability
    return dense


def __sparse_flattening(split, pattern_probabilities, taxa, matrix_format="dok"):
    """
    Build the flattening of a split as a SciPy sparse matrix.

    Args:
        split (str or list): The split; a string is divided on "|".
        pattern_probabilities (dict): Maps each pattern to its probability.
        taxa (list): Taxon labels; a label's position in this list is the
            position read from each pattern.
        matrix_format (str): Either "dok" (the default, matching the
            previously hard-coded choice) or "coo".

    Returns:
        A ``dok_matrix`` or ``coo_matrix`` of shape
        ``(4 ** len(split[0]), 4 ** len(split[1]))``.

    Raises:
        ValueError: If ``matrix_format`` is not "dok" or "coo".
    """
    if matrix_format not in ("dok", "coo"):
        raise ValueError(f"Unsupported sparse matrix format: {matrix_format!r}")
    if isinstance(split, str):
        split = split.split("|")
    taxa_indexer = {taxon: i for i, taxon in enumerate(taxa)}
    shape = (4 ** len(split[0]), 4 ** len(split[1]))

    def cell_of(pattern):
        # Row comes from the states of the first side, column from the second.
        row = __index_of("".join([str(pattern[taxa_indexer[s]]) for s in split[0]]))
        col = __index_of("".join([str(pattern[taxa_indexer[s]]) for s in split[1]]))
        return row, col

    if matrix_format == "coo":
        rows = []
        cols = []
        data = []
        for pattern, probability in pattern_probabilities.items():
            # Zero entries are skipped in the coordinate lists.
            if probability != 0:
                row, col = cell_of(pattern)
                rows.append(row)
                cols.append(col)
                data.append(probability)
        return coo_matrix((data, (rows, cols)), shape=shape)

    flattening = dok_matrix(shape)
    for pattern, probability in pattern_probabilities.items():
        row, col = cell_of(pattern)
        flattening[row, col] = probability
    return flattening


def subflattening(split, pattern_probabilities, data=None):
    """
    Compute the signed sum subflattening of a split.

    For every (row label, column label) pair, the full pattern is rebuilt and
    the entry is the sum over ``pattern_probabilities`` of ``sign * value``.
    The sign starts at +1 and is flipped once for each position whose
    (label state, table state) pair is not in the ``banned`` set below.

    Args:
        split (str or list): The split; a string is divided on "|".
        pattern_probabilities (dict): Maps each pattern to its probability.
        data (dict, optional): A bundle of re-usable information. Its
            "coeffs" entry caches signs keyed by (pattern, table_pattern) and
            its "labels" entry caches label lists keyed by side length.
            Missing entries are created in place, so passing the same dict to
            several calls re-uses the work. A fresh dict is used when omitted.

    Returns:
        numpy.ndarray: Array built from a
        ``(3 * len(split[0]) + 1) x (3 * len(split[1]) + 1)`` table.
    """
    state_space = constants.DNA_state_space
    if data is None:
        data = {}
    # Create each cache independently so that a bundle holding only one of the
    # two keys keeps what it already has instead of being reset.
    coeffs = data.setdefault("coeffs", {})
    labels = data.setdefault("labels", {})

    if isinstance(split, str):
        split = split.split("|")
    sp1, sp2 = len(split[0]), len(split[1])
    matrix = [[0 for _ in range(3 * sp2 + 1)] for _ in range(3 * sp1 + 1)]

    # Label lists depend only on the side length, so they are cached by length.
    if sp1 not in labels:
        labels[sp1] = list(__subflattening_labels_generator(sp1))
    row_labels = labels[sp1]
    if sp2 not in labels:
        labels[sp2] = list(__subflattening_labels_generator(sp2))
    col_labels = labels[sp2]

    # State pairs that leave the sign unchanged.
    banned = (
        {("C", "C"), ("G", "G"), ("A", "T")}
        | {(x, "A") for x in state_space}
        | {("T", x) for x in state_space}
    )
    for r, row in enumerate(row_labels):
        for c, col in enumerate(col_labels):
            pattern = __reconstruct_pattern(split, row, col)
            signed_sum = 0
            for table_pattern, value in pattern_probabilities.items():
                key = (pattern, table_pattern)
                try:
                    product = coeffs[key]
                except KeyError:
                    product = 1
                    for t in zip(pattern, table_pattern):
                        if t not in banned:
                            product *= -1
                    coeffs[key] = product
                signed_sum += product * value
            matrix[r][c] = signed_sum
    return np.array(matrix)


def __index_of(string):
    """
    Return the base-4 positional value of a string of states.

    Each character is looked up in ``constants.DNA_state_space_dict`` and
    weighted by a power of 4, with the last character weighted by 4 ** 0.
    """
    digits = constants.DNA_state_space_dict
    return sum(
        (4**power) * digits[state]
        for power, state in enumerate(reversed(string))
    )


def __subflattening_labels_generator(length):
    """
    Yield the row/column labels used by the subflattening.

    For each position in turn, and for each state except the last entry of
    ``constants.DNA_state_space``, yields a label of the given length with
    that state at the position and "T" everywhere else. The final label
    repeats the last entry of the state space ``length`` times.
    """
    alphabet = constants.DNA_state_space
    varying_states = alphabet[0:-1]
    closing_state = alphabet[-1]
    # NOTE(review): padding uses the literal "T" while the closing label uses
    # the last entry of the state space; these agree only if that entry is
    # "T" -- confirm against constants.
    for position in range(length):
        prefix = "T" * position
        suffix = "T" * (length - position - 1)
        for state in varying_states:
            yield f"{prefix}{state}{suffix}"
    yield "".join([closing_state] * length)


def __reconstruct_pattern(split, row_label, col_label):
    """
    Merge a row label and a column label into one full pattern string.

    Each element of ``split[0]`` names the slot that receives the matching
    character of ``row_label``; each element of ``split[1]`` does the same for
    ``col_label``. The slots are then read back in order 0 .. n - 1, where n
    is the combined size of both sides.
    """
    total = len(split[0]) + len(split[1])

    def slot_for(loc):
        # One-character names are parsed as a digit in base ``total``; longer
        # names drop their first character and parse the rest as decimal.
        text = str(loc)
        if len(text) == 1:
            return int(text, total)
        return int(text[1:])

    slots = {}
    for side, label in ((split[0], row_label), (split[1], col_label)):
        for offset, loc in enumerate(side):
            slots[slot_for(loc)] = label[offset]
    return "".join(slots[i] for i in range(total))

1	import numpy as np	3✔
2	from splitp.enums import FlatFormat	3✔
3	from scipy.sparse import dok_matrix, coo_matrix	3✔
4	import splitp.constants as constants	3✔
5
6
7	def flattening(	3✔
8	split,
9	pattern_probabilities,
10	flattening_format=FlatFormat.sparse
11	):
12	"""
13	Compute the flattening of a split given a pattern probability dictionary.
14
15	Args:
16	split (str or list): The split to compute the flattening of.
17	pattern_probabilities (dict): A dictionary of pattern probabilities.
18	flattening_format (FlatFormat): The format to return the flattening in.
19
20	Returns:
21	The flattening of the split in the specified format.
22	"""
23	if isinstance(split, str):	3✔
NEW 24	split = split.split("|")	×
25	taxa = sorted(set(split[0]) | set(split[1]))	3✔
26	if flattening_format is FlatFormat.sparse:	3✔
27	return __sparse_flattening(split, pattern_probabilities, taxa)	3✔
28	if flattening_format is FlatFormat.reduced:	3✔
29	return __reduced_flattening(split, pattern_probabilities, taxa)	3✔
30
31
32	def __reduced_flattening(split, pattern_probabilities, taxa):	3✔
33	if isinstance(split, str):	3✔
NEW 34	split = split.split("|")	×
35	flattening_data = {}	3✔
36	used_cols = set()	3✔
37	taxa_indexer = {taxon: i for i, taxon in enumerate(taxa)}	3✔
38	for r in pattern_probabilities.items():	3✔
39	pattern = r[0]	3✔
40	row = __index_of("".join([str(pattern[taxa_indexer[s]]) for s in split[0]]))	3✔
41	col = __index_of("".join([str(pattern[taxa_indexer[s]]) for s in split[1]]))	3✔
42	used_cols.add(col)	3✔
43	try:	3✔
44	flattening_data[row][col] = r[1]	3✔
45	except KeyError:	3✔
46	flattening_data[row] = {col: r[1]}	3✔
47	column_sort_order = {}	3✔
48
49	for i, used_col in enumerate(sorted(used_cols)):	3✔
50	column_sort_order[used_col] = i	3✔
51
52	flattening = np.zeros((len(flattening_data), len(used_cols)))	3✔
53	for i, (row_index, column_data) in enumerate(sorted(flattening_data.items())):	3✔
54	for col_index, prob in column_data.items():	3✔
55	flattening[i, column_sort_order[col_index]] = prob	3✔
56	return flattening	3✔
57
58
59	def __sparse_flattening(split, pattern_probabilities, taxa):	3✔
60	format = "dok" # Temporary hard-coded choice	3✔
61	if isinstance(split, str):	3✔
NEW 62	split = split.split("|")	×
63	taxa_indexer = {taxon: i for i, taxon in enumerate(taxa)}	3✔
64	if format == "coo":	3✔
NEW 65	rows = []	×
NEW 66	cols = []	×
NEW 67	data = []	×
NEW 68	for r in pattern_probabilities.items():	×
NEW 69	if r[1] != 0:	×
NEW 70	pattern = r[0]	×
NEW 71	row = __index_of(	×
72	"".join([str(pattern[taxa_indexer[s]]) for s in split[0]])
73	)
NEW 74	col = __index_of(	×
75	"".join([str(pattern[taxa_indexer[s]]) for s in split[1]])
76	)
NEW 77	rows.append(row)	×
NEW 78	cols.append(col)	×
NEW 79	data.append(r[1])	×
NEW 80	return coo_matrix(	×
81	(data, (rows, cols)), shape=(4 ** len(split[0]), 4 ** len(split[1]))
82	)
83	elif format == "dok":	3✔
84	flattening = dok_matrix((4 ** len(split[0]), 4 ** len(split[1])))	3✔
85	for r in pattern_probabilities.items():	3✔
86	pattern = r[0]	3✔
87	row = __index_of("".join([str(pattern[taxa_indexer[s]]) for s in split[0]]))	3✔
88	col = __index_of("".join([str(pattern[taxa_indexer[s]]) for s in split[1]]))	3✔
89	flattening[row, col] = r[1]	3✔
90	return flattening	3✔
91
92
93	def subflattening(split, pattern_probabilities, data=None):	3✔
94	"""
95	A faster version of signed sum subflattening. Requires a data dictionary and can be supplied with a bundle of
96	re-usable information to reduce the number of calls to the multiplications function.
97	"""
98	state_space = constants.DNA_state_space	3✔
99	if data is None:	3✔
100	data = {}	3✔
101	try:	3✔
102	coeffs = data["coeffs"]	3✔
NEW 103	labels = data["labels"]	×
104	except KeyError:	3✔
105	data["coeffs"] = coeffs = {}	3✔
106	data["labels"] = labels = {}	3✔
107
108	if isinstance(split, str):	3✔
NEW 109	split = split.split("|")	×
110	sp1, sp2 = len(split[0]), len(split[1])	3✔
111	subflattening = [[0 for _ in range(3 * sp2 + 1)] for _ in range(3 * sp1 + 1)]	3✔
112	try:	3✔
113	row_labels = labels[sp1]	3✔
114	except KeyError:	3✔
115	row_labels = list(__subflattening_labels_generator(sp1))	3✔
116	labels[sp1] = row_labels	3✔
117	try:	3✔
118	col_labels = labels[sp2]	3✔
NEW 119	except KeyError:	×
NEW 120	col_labels = list(__subflattening_labels_generator(sp2))	×
NEW 121	labels[sp2] = col_labels	×
122	banned = (	3✔
123	{("C", "C"), ("G", "G"), ("A", "T")}
124	| {(x, "A") for x in state_space}
125	| {("T", x) for x in state_space}
126	)
127	for r, row in enumerate(row_labels):	3✔
128	for c, col in enumerate(col_labels):	3✔
129	pattern = __reconstruct_pattern(split, row, col)	3✔
130	signed_sum = 0	3✔
131	for table_pattern, value in pattern_probabilities.items():	3✔
132	try:	3✔
133	product = coeffs[(pattern, table_pattern)]	3✔
134	except KeyError:	3✔
135	product = 1	3✔
136	for t in zip(pattern, table_pattern):	3✔
137	if t not in banned:	3✔
138	product *= -1	3✔
139	coeffs[(pattern, table_pattern)] = product	3✔
140	signed_sum += product * value	3✔
141	subflattening[r][c] = signed_sum	3✔
142	return np.array(subflattening)	3✔
143
144
145	def __index_of(string):	3✔
146	string = reversed(string)	3✔
147	index = 0	3✔
148	for o, s in enumerate(string):	3✔
149	index += (4**o) * constants.DNA_state_space_dict[s]	3✔
150	return index	3✔
151
152
153	def __subflattening_labels_generator(length):	3✔
154	n = length	3✔
155	state_space = constants.DNA_state_space	3✔
156	other_states = state_space[0:-1]	3✔
157	special_state = state_space[-1]	3✔
158	templates = (	3✔
159	(
160	"".join("T" for _ in range(i)),
161	"".join("T" for _ in range(n - i - 1)),
162	)
163	for i in range(n)
164	)
165	for template in templates:	3✔
166	for c in other_states:	3✔
167	yield f"{template[0]}{c}{template[1]}"	3✔
168	yield "".join(special_state for _ in range(n))	3✔
169
170
171	def __reconstruct_pattern(split, row_label, col_label):	3✔
172	n = len(split[0]) + len(split[1])	3✔
173	pattern = {}	3✔
174	for splindex, loc in enumerate(split[0]):	3✔
175	pattern[int(str(loc), n) if len(str(loc)) == 1 else int(str(loc)[1:])] = row_label[splindex]	3✔
176	for splindex, loc in enumerate(split[1]):	3✔
177	pattern[int(str(loc), n) if len(str(loc)) == 1 else int(str(loc)[1:])] = col_label[splindex]	3✔
178	return "".join(pattern[i] for i in range(n))	3✔

js51 / SplitP / 7894856040

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous