7342943552

Committed 28 Dec 2023 12:13AM UTC coverage: 83.549% (-14.0%) from 97.551%

Build # 7342943552

Build Type

push

github

Committed by

web-flow

Commit Message

Fix warning due to styling dtypes, and fix column dtype test failure (#238)


Fixes #225

Run Details

6 of 6 new or added lines in 2 files covered. (100.0%)

312 existing lines in 7 files now uncovered.

1681 of 2012 relevant lines covered (83.55%)

1.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

52.7

/upsetplot/reformat.py

from __future__ import print_function, division, absolute_import

try:
    import typing
except ImportError:
    import collections as typing

import numpy as np
import pandas as pd


def _aggregate_data(df, subset_size, sum_over):
    """
    Returns
    -------
    df : DataFrame
        full data frame
    aggregated : Series
        aggregates
    """
    _SUBSET_SIZE_VALUES = ['auto', 'count', 'sum']
    if subset_size not in _SUBSET_SIZE_VALUES:
        raise ValueError('subset_size should be one of %s. Got %r'
                         % (_SUBSET_SIZE_VALUES, subset_size))
    if df.ndim == 1:
        # Series
        input_name = df.name
        df = pd.DataFrame({'_value': df})

        if subset_size == 'auto' and not df.index.is_unique:
            raise ValueError('subset_size="auto" cannot be used for a '
                             'Series with non-unique groups.')
        if sum_over is not None:
            raise ValueError('sum_over is not applicable when the input is a '
                             'Series')
        if subset_size == 'count':
            sum_over = False
        else:
            sum_over = '_value'
    else:
        # DataFrame
        if sum_over is False:
            raise ValueError('Unsupported value for sum_over: False')
        elif subset_size == 'auto' and sum_over is None:
            sum_over = False
        elif subset_size == 'count':
            if sum_over is not None:
                raise ValueError('sum_over cannot be set if subset_size=%r' %
                                 subset_size)
            sum_over = False
        elif subset_size == 'sum':
            if sum_over is None:
                raise ValueError('sum_over should be a field name if '
                                 'subset_size="sum" and a DataFrame is '
                                 'provided.')

    gb = df.groupby(level=list(range(df.index.nlevels)), sort=False)
    if sum_over is False:
        aggregated = gb.size()
        aggregated.name = 'size'
    elif hasattr(sum_over, 'lower'):
        aggregated = gb[sum_over].sum()
    else:
        raise ValueError('Unsupported value for sum_over: %r' % sum_over)

    if aggregated.name == '_value':
        aggregated.name = input_name

    return df, aggregated


def _check_index(df):
    # check all indices are boolean
    if not all(set([True, False]) >= set(level)
               for level in df.index.levels):
        raise ValueError('The DataFrame has values in its index that are not '
                         'boolean')
    df = df.copy(deep=False)
    # XXX: this may break if input is not MultiIndex
    kw = {'levels': [x.astype(bool) for x in df.index.levels],
          'names': df.index.names,
          }
    if hasattr(df.index, 'codes'):
        # compat for pandas <= 0.20
        kw['codes'] = df.index.codes
    else:
        kw['labels'] = df.index.labels
    df.index = pd.MultiIndex(**kw)
    return df


def _scalar_to_list(val):
    if not isinstance(val, (typing.Sequence, set)) or isinstance(val, str):
        val = [val]
    return val


def _get_subset_mask(agg, min_subset_size, max_subset_size,
                     min_degree, max_degree,
                     present, absent):
    """Get a mask over subsets based on size, degree or category presence"""
    subset_mask = True
    if min_subset_size is not None:
        subset_mask = np.logical_and(subset_mask, agg >= min_subset_size)
    if max_subset_size is not None:
        subset_mask = np.logical_and(subset_mask, agg <= max_subset_size)
    if (min_degree is not None and min_degree >= 0) or max_degree is not None:
        degree = agg.index.to_frame().sum(axis=1)
        if min_degree is not None:
            subset_mask = np.logical_and(subset_mask, degree >= min_degree)
        if max_degree is not None:
            subset_mask = np.logical_and(subset_mask, degree <= max_degree)
    if present is not None:
        for col in _scalar_to_list(present):
            subset_mask = np.logical_and(
                subset_mask,
                agg.index.get_level_values(col).values)
    if absent is not None:
        for col in _scalar_to_list(absent):
            exclude_mask = np.logical_not(
                agg.index.get_level_values(col).values)
            subset_mask = np.logical_and(subset_mask, exclude_mask)
    return subset_mask


def _filter_subsets(df, agg,
                    min_subset_size, max_subset_size,
                    min_degree, max_degree,
                    present, absent):
    """Restrict ``agg`` and ``df`` to the subsets matching the criteria.

    Returns the pair ``(df, agg)``. Both are returned untouched when the
    computed mask is the literal ``True`` (no criterion was applied).
    """
    mask = _get_subset_mask(agg,
                            min_subset_size=min_subset_size,
                            max_subset_size=max_subset_size,
                            min_degree=min_degree,
                            max_degree=max_degree,
                            present=present, absent=absent)

    if mask is not True:
        agg = agg[mask]
        # keep only the rows whose index entry survives in the aggregate
        keep_rows = df.index.isin(agg.index)
        df = df[keep_rows]
    return df, agg


class QueryResult:
    """Container for reformatted data and aggregates

    Attributes
    ----------
    data : DataFrame
        Selected samples. The index is a MultiIndex with one boolean level for
        each category.
    subsets : dict[frozenset, DataFrame]
        Dataframes for each intersection of categories.
    subset_sizes : Series
        Total size of each selected subset as a series. The index is as
        for `data`.
    category_totals : Series
        Total size of each category, regardless of selection.
    """
    def __init__(self, data, subset_sizes, category_totals):
        self.data = data
        self.subset_sizes = subset_sizes
        self.category_totals = category_totals

    def __repr__(self):
        return ("QueryResult(data={data}, subset_sizes={subset_sizes}, "
                "category_totals={category_totals})".format(**vars(self)))

    @property
    def subsets(self):
        """Map each subset's set of member categories to its rows of `data`.

        Keys are frozensets holding the names of the index levels that are
        True for the group; values are the corresponding sub-frames.
        """
        categories = np.asarray(self.data.index.names)
        n_levels = len(categories)
        out = {}
        for mask, subset_data in self.data.groupby(
                level=list(range(n_levels)), sort=False):
            # ``mask`` holds one boolean per category. It must be applied as
            # a boolean selector: ``ndarray.take`` would read True/False as
            # the integer positions 1/0 and pick the wrong names.
            members = np.atleast_1d(np.asarray(mask, dtype=bool))
            out[frozenset(categories[members])] = subset_data
        return out


def query(data, present=None, absent=None,
          min_subset_size=None, max_subset_size=None,
          min_degree=None, max_degree=None,
          sort_by='degree', sort_categories_by='cardinality',
          subset_size='auto', sum_over=None, include_empty_subsets=False):
    """Transform and filter a categorised dataset

    Retrieve the set of items and totals corresponding to subsets of interest.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Elements associated with categories (a DataFrame), or the size of each
        subset of categories (a Series).
        Should have MultiIndex where each level is binary,
        corresponding to category membership.
        If a DataFrame, `sum_over` must be a string or None.
    present : str or list of str, optional
        Category or categories that must be present in subsets to be
        reported.
    absent : str or list of str, optional
        Category or categories that must not be present in subsets to be
        reported.
    min_subset_size : int, optional
        Minimum size of a subset to be reported. All subsets with
        a size smaller than this threshold will be omitted from
        subset_sizes and data.
        Size may be a sum of values, see `subset_size`.
    max_subset_size : int, optional
        Maximum size of a subset to be reported.
    min_degree : int, optional
        Minimum degree of a subset to be reported.
    max_degree : int, optional
        Maximum degree of a subset to be reported.
    sort_by : {'cardinality', 'degree', '-cardinality', '-degree',
               'input', '-input'}
        If 'cardinality', subsets are listed from largest to smallest.
        If 'degree', they are listed in order of the number of categories
        intersected. If 'input', the order they appear in the data input is
        used.
        Prefix with '-' to reverse the ordering.

        Note this affects ``subset_sizes`` but not ``data``.
    sort_categories_by : {'cardinality', '-cardinality', 'input', '-input'}
        Whether to sort the categories by total cardinality, or leave them
        in the input data's provided order (order of index levels).
        Prefix with '-' to reverse the ordering.
    subset_size : {'auto', 'count', 'sum'}
        Configures how to calculate the size of a subset. Choices are:

        'auto' (default)
            If `data` is a DataFrame, count the number of rows in each group,
            unless `sum_over` is specified.
            If `data` is a Series with at most one row for each group, use
            the value of the Series. If `data` is a Series with more than one
            row per group, raise a ValueError.
        'count'
            Count the number of rows in each group.
        'sum'
            Sum the value of the `data` Series, or the DataFrame field
            specified by `sum_over`.
    sum_over : str or None
        If `subset_size='sum'` or `'auto'`, then the intersection size is the
        sum of the specified field in the `data` DataFrame. If a Series, only
        None is supported and its value is summed.
    include_empty_subsets : bool (default=False)
        If True, all possible category combinations will be returned in
        subset_sizes, even when some are not present in data.

    Returns
    -------
    QueryResult
        Including filtered ``data``, filtered and sorted ``subset_sizes`` and
        overall ``category_totals``.

    Raises
    ------
    ValueError
        If ``sort_by`` or ``sort_categories_by`` is not a recognised value,
        or if ``include_empty_subsets`` is requested with more than 10
        categories.

    Examples
    --------
    >>> from upsetplot import query, generate_samples
    >>> data = generate_samples(n_samples=20)
    >>> result = query(data, present="cat1", max_subset_size=4)
    >>> result.category_totals
    cat1    14
    cat2     4
    cat0     0
    dtype: int64
    >>> result.subset_sizes
    cat1  cat2  cat0
    True  True  False    3
    Name: size, dtype: int64
    >>> result.data
                     index     value
    cat1 cat2 cat0
    True True False      0  2.04...
              False      2  2.05...
              False     10  2.55...
    >>>
    >>> # Sorting:
    >>> query(data, min_degree=1, sort_by="degree").subset_sizes
    cat1   cat2   cat0
    True   False  False    11
    False  True   False     1
    True   True   False     3
    Name: size, dtype: int64
    >>> query(data, min_degree=1, sort_by="cardinality").subset_sizes
    cat1   cat2   cat0
    True   False  False    11
           True   False     3
    False  True   False     1
    Name: size, dtype: int64
    >>>
    >>> # Getting each subset's data
    >>> result = query(data)
    >>> result.subsets[frozenset({"cat1", "cat2"})]
                     index     value
    cat1 cat2 cat0
    True True False      0  2.04...
              False      2  2.05...
              False     10  2.55...
    >>> len(result.subsets[frozenset({"cat1"})])
    11
    """

    data, agg = _aggregate_data(data, subset_size, sum_over)
    data = _check_index(data)
    # Category totals are taken before any filtering, so they describe the
    # whole input rather than the selected subsets.
    totals = [agg[agg.index.get_level_values(name).values.astype(bool)].sum()
              for name in agg.index.names]
    totals = pd.Series(totals, index=agg.index.names)

    if include_empty_subsets:
        nlevels = len(agg.index.levels)
        # the full product has 2 ** nlevels entries, hence the cap
        if nlevels > 10:
            raise ValueError(
                "include_empty_subsets is supported for at most 10 categories")
        new_agg = pd.Series(0,
                            index=pd.MultiIndex.from_product(
                                [[False, True]] * nlevels,
                                names=agg.index.names),
                            dtype=agg.dtype,
                            name=agg.name)
        new_agg.update(agg)
        agg = new_agg

    data, agg = _filter_subsets(data, agg,
                                min_subset_size=min_subset_size,
                                max_subset_size=max_subset_size,
                                min_degree=min_degree,
                                max_degree=max_degree,
                                present=present, absent=absent)

    # sort categories; a leading '-' flips the default (descending) order
    if sort_categories_by in ('cardinality', '-cardinality'):
        totals = totals.sort_values(ascending=sort_categories_by[:1] == '-')
    elif sort_categories_by == '-input':
        totals = totals[::-1]
    elif sort_categories_by in (None, 'input'):
        pass
    else:
        raise ValueError('Unknown sort_categories_by: %r' % sort_categories_by)
    data = data.reorder_levels(totals.index.values)
    agg = agg.reorder_levels(totals.index.values)

    # sort subsets
    if sort_by in ('cardinality', '-cardinality'):
        agg = agg.sort_values(ascending=sort_by[:1] == '-')
    elif sort_by in ('degree', '-degree'):
        # order by number of member categories, ties broken on the
        # reversed membership tuple
        index_tuples = sorted(agg.index,
                              key=lambda x: (sum(x),) + tuple(reversed(x)),
                              reverse=sort_by[:1] == '-')
        agg = agg.reindex(pd.MultiIndex.from_tuples(index_tuples,
                                                    names=agg.index.names))
    elif sort_by == '-input':
        agg = agg[::-1]
    elif sort_by in (None, 'input'):
        pass
    else:
        raise ValueError('Unknown sort_by: %r' % sort_by)

    return QueryResult(data=data, subset_sizes=agg, category_totals=totals)

1	from __future__ import print_function, division, absolute_import	2✔
2
3	try:	2✔
4	import typing	2✔
5	except ImportError:	×
6	import collections as typing	×
7
8	import numpy as np	2✔
9	import pandas as pd	2✔
10
11
12	def _aggregate_data(df, subset_size, sum_over):	2✔
13	"""
14	Returns
15	-------
16	df : DataFrame
17	full data frame
18	aggregated : Series
19	aggregates
20	"""
21	_SUBSET_SIZE_VALUES = ['auto', 'count', 'sum']	2✔
22	if subset_size not in _SUBSET_SIZE_VALUES:	2✔
23	raise ValueError('subset_size should be one of %s. Got %r'	×
UNCOV 24	% (_SUBSET_SIZE_VALUES, subset_size))	×
25	if df.ndim == 1:	2✔
26	# Series
27	input_name = df.name	2✔
28	df = pd.DataFrame({'_value': df})	2✔
29
30	if subset_size == 'auto' and not df.index.is_unique:	2✔
31	raise ValueError('subset_size="auto" cannot be used for a '	2✔
UNCOV 32	'Series with non-unique groups.')	×
33	if sum_over is not None:	2✔
34	raise ValueError('sum_over is not applicable when the input is a '	2✔
UNCOV 35	'Series')	×
36	if subset_size == 'count':	2✔
37	sum_over = False	2✔
UNCOV 38	else:	×
39	sum_over = '_value'	2✔
UNCOV 40	else:	×
41	# DataFrame
42	if sum_over is False:	2✔
43	raise ValueError('Unsupported value for sum_over: False')	2✔
44	elif subset_size == 'auto' and sum_over is None:	2✔
45	sum_over = False	2✔
46	elif subset_size == 'count':	2✔
47	if sum_over is not None:	2✔
48	raise ValueError('sum_over cannot be set if subset_size=%r' %	2✔
49	subset_size)	2✔
50	sum_over = False	2✔
51	elif subset_size == 'sum':	2✔
52	if sum_over is None:	2✔
53	raise ValueError('sum_over should be a field name if '	2✔
UNCOV 54	'subset_size="sum" and a DataFrame is '	×
UNCOV 55	'provided.')	×
56
57	gb = df.groupby(level=list(range(df.index.nlevels)), sort=False)	2✔
58	if sum_over is False:	2✔
59	aggregated = gb.size()	2✔
60	aggregated.name = 'size'	2✔
61	elif hasattr(sum_over, 'lower'):	2✔
62	aggregated = gb[sum_over].sum()	2✔
UNCOV 63	else:	×
64	raise ValueError('Unsupported value for sum_over: %r' % sum_over)	×
65
66	if aggregated.name == '_value':	2✔
67	aggregated.name = input_name	2✔
68
69	return df, aggregated	2✔
70
71
72	def _check_index(df):	2✔
73	# check all indices are boolean
74	if not all(set([True, False]) >= set(level)	2✔
75	for level in df.index.levels):	2✔
76	raise ValueError('The DataFrame has values in its index that are not '	2✔
UNCOV 77	'boolean')	×
78	df = df.copy(deep=False)	2✔
79	# XXX: this may break if input is not MultiIndex
80	kw = {'levels': [x.astype(bool) for x in df.index.levels],	2✔
81	'names': df.index.names,	2✔
UNCOV 82	}	×
83	if hasattr(df.index, 'codes'):	2✔
84	# compat for pandas <= 0.20
85	kw['codes'] = df.index.codes	2✔
UNCOV 86	else:	×
87	kw['labels'] = df.index.labels	×
88	df.index = pd.MultiIndex(**kw)	2✔
89	return df	2✔
90
91
92	def _scalar_to_list(val):	2✔
93	if not isinstance(val, (typing.Sequence, set)) or isinstance(val, str):	2✔
94	val = [val]	2✔
95	return val	2✔
96
97
98	def _get_subset_mask(agg, min_subset_size, max_subset_size,	2✔
UNCOV 99	min_degree, max_degree,	×
UNCOV 100	present, absent):	×
UNCOV 101	"""Get a mask over subsets based on size, degree or category presence"""	×
102	subset_mask = True	2✔
103	if min_subset_size is not None:	2✔
104	subset_mask = np.logical_and(subset_mask, agg >= min_subset_size)	2✔
105	if max_subset_size is not None:	2✔
106	subset_mask = np.logical_and(subset_mask, agg <= max_subset_size)	2✔
107	if (min_degree is not None and min_degree >= 0) or max_degree is not None:	2✔
108	degree = agg.index.to_frame().sum(axis=1)	2✔
109	if min_degree is not None:	2✔
110	subset_mask = np.logical_and(subset_mask, degree >= min_degree)	2✔
111	if max_degree is not None:	2✔
112	subset_mask = np.logical_and(subset_mask, degree <= max_degree)	2✔
113	if present is not None:	2✔
114	for col in _scalar_to_list(present):	2✔
115	subset_mask = np.logical_and(	2✔
116	subset_mask,	2✔
117	agg.index.get_level_values(col).values)	2✔
118	if absent is not None:	2✔
119	for col in _scalar_to_list(absent):	2✔
120	exclude_mask = np.logical_not(	2✔
121	agg.index.get_level_values(col).values)	2✔
122	subset_mask = np.logical_and(subset_mask, exclude_mask)	2✔
123	return subset_mask	2✔
124
125
126	def _filter_subsets(df, agg,	2✔
UNCOV 127	min_subset_size, max_subset_size,	×
UNCOV 128	min_degree, max_degree,	×
UNCOV 129	present, absent):	×
130	subset_mask = _get_subset_mask(agg,	2✔
131	min_subset_size=min_subset_size,	2✔
132	max_subset_size=max_subset_size,	2✔
133	min_degree=min_degree,	2✔
134	max_degree=max_degree,	2✔
135	present=present, absent=absent)	2✔
136
137	if subset_mask is True:	2✔
138	return df, agg	2✔
139
140	agg = agg[subset_mask]	2✔
141	df = df[df.index.isin(agg.index)]	2✔
142	return df, agg	2✔
143
144
145	class QueryResult:	2✔
146	"""Container for reformatted data and aggregates
147
148	Attributes
149	----------
150	data : DataFrame
151	Selected samples. The index is a MultiIndex with one boolean level for
152	each category.
153	subsets : dict[frozenset, DataFrame]
154	Dataframes for each intersection of categories.
155	subset_sizes : Series
156	Total size of each selected subset as a series. The index is as
157	for `data`.
158	category_totals : Series
159	Total size of each category, regardless of selection.
160	"""
161	def __init__(self, data, subset_sizes, category_totals):	2✔
162	self.data = data	2✔
163	self.subset_sizes = subset_sizes	2✔
164	self.category_totals = category_totals	2✔
165
166	def __repr__(self):	2✔
167	return ("QueryResult(data={data}, subset_sizes={subset_sizes}, "	×
UNCOV 168	"category_totals={category_totals}".format(**vars(self)))	×
169
170	@property	2✔
171	def subsets(self):	1✔
172	categories = np.asarray(self.data.index.names)	2✔
173	return {	2✔
174	frozenset(categories.take(mask)): subset_data	1✔
175	for mask, subset_data	2✔
176	in self.data.groupby(level=list(range(len(categories))),	2✔
177	sort=False)	2✔
UNCOV 178	}	×
179
180
181	def query(data, present=None, absent=None,	1✔
182	min_subset_size=None, max_subset_size=None,	1✔
183	min_degree=None, max_degree=None,	1✔
184	sort_by='degree', sort_categories_by='cardinality',	1✔
185	subset_size='auto', sum_over=None, include_empty_subsets=False):	2✔
UNCOV 186	"""Transform and filter a categorised dataset	×
187
UNCOV 188	Retrieve the set of items and totals corresponding to subsets of interest.	×
189
UNCOV 190	Parameters	×
UNCOV 191	----------	×
UNCOV 192	data : pandas.Series or pandas.DataFrame	×
UNCOV 193	Elements associated with categories (a DataFrame), or the size of each	×
UNCOV 194	subset of categories (a Series).	×
UNCOV 195	Should have MultiIndex where each level is binary,	×
UNCOV 196	corresponding to category membership.	×
UNCOV 197	If a DataFrame, `sum_over` must be a string or False.	×
UNCOV 198	present : str or list of str, optional	×
UNCOV 199	Category or categories that must be present in subsets for styling.	×
UNCOV 200	absent : str or list of str, optional	×
UNCOV 201	Category or categories that must not be present in subsets for	×
UNCOV 202	styling.	×
UNCOV 203	min_subset_size : int, optional	×
UNCOV 204	Minimum size of a subset to be reported. All subsets with	×
UNCOV 205	a size smaller than this threshold will be omitted from	×
UNCOV 206	category_totals and data.	×
UNCOV 207	Size may be a sum of values, see `subset_size`.	×
UNCOV 208	max_subset_size : int, optional	×
UNCOV 209	Maximum size of a subset to be reported.	×
UNCOV 210	min_degree : int, optional	×
UNCOV 211	Minimum degree of a subset to be reported.	×
UNCOV 212	max_degree : int, optional	×
UNCOV 213	Maximum degree of a subset to be reported.	×
UNCOV 214	sort_by : {'cardinality', 'degree', '-cardinality', '-degree',	×
UNCOV 215	'input', '-input'}	×
UNCOV 216	If 'cardinality', subset are listed from largest to smallest.	×
UNCOV 217	If 'degree', they are listed in order of the number of categories	×
UNCOV 218	intersected. If 'input', the order they appear in the data input is	×
UNCOV 219	used.	×
UNCOV 220	Prefix with '-' to reverse the ordering.	×
221
UNCOV 222	Note this affects ``subset_sizes`` but not ``data``.	×
UNCOV 223	sort_categories_by : {'cardinality', '-cardinality', 'input', '-input'}	×
UNCOV 224	Whether to sort the categories by total cardinality, or leave them	×
UNCOV 225	in the input data's provided order (order of index levels).	×
UNCOV 226	Prefix with '-' to reverse the ordering.	×
UNCOV 227	subset_size : {'auto', 'count', 'sum'}	×
UNCOV 228	Configures how to calculate the size of a subset. Choices are:	×
229
UNCOV 230	'auto' (default)	×
UNCOV 231	If `data` is a DataFrame, count the number of rows in each group,	×
UNCOV 232	unless `sum_over` is specified.	×
UNCOV 233	If `data` is a Series with at most one row for each group, use	×
UNCOV 234	the value of the Series. If `data` is a Series with more than one	×
UNCOV 235	row per group, raise a ValueError.	×
UNCOV 236	'count'	×
UNCOV 237	Count the number of rows in each group.	×
UNCOV 238	'sum'	×
UNCOV 239	Sum the value of the `data` Series, or the DataFrame field	×
UNCOV 240	specified by `sum_over`.	×
UNCOV 241	sum_over : str or None	×
UNCOV 242	If `subset_size='sum'` or `'auto'`, then the intersection size is the	×
UNCOV 243	sum of the specified field in the `data` DataFrame. If a Series, only	×
UNCOV 244	None is supported and its value is summed.	×
UNCOV 245	include_empty_subsets : bool (default=False)	×
UNCOV 246	If True, all possible category combinations will be returned in	×
UNCOV 247	subset_sizes, even when some are not present in data.	×
248
UNCOV 249	Returns	×
UNCOV 250	-------	×
UNCOV 251	QueryResult	×
UNCOV 252	Including filtered ``data``, filtered and sorted ``subset_sizes`` and	×
UNCOV 253	overall ``category_totals``.	×
254
UNCOV 255	Examples	×
UNCOV 256	--------	×
UNCOV 257	>>> from upsetplot import query, generate_samples	×
UNCOV 258	>>> data = generate_samples(n_samples=20)	×
UNCOV 259	>>> result = query(data, present="cat1", max_subset_size=4)	×
UNCOV 260	>>> result.category_totals	×
UNCOV 261	cat1 14	×
UNCOV 262	cat2 4	×
UNCOV 263	cat0 0	×
UNCOV 264	dtype: int64	×
UNCOV 265	>>> result.subset_sizes	×
UNCOV 266	cat1 cat2 cat0	×
UNCOV 267	True True False 3	×
UNCOV 268	Name: size, dtype: int64	×
UNCOV 269	>>> result.data	×
UNCOV 270	index value	×
UNCOV 271	cat1 cat2 cat0	×
UNCOV 272	True True False 0 2.04...	×
UNCOV 273	False 2 2.05...	×
UNCOV 274	False 10 2.55...	×
UNCOV 275	>>>	×
UNCOV 276	>>> # Sorting:	×
UNCOV 277	>>> query(data, min_degree=1, sort_by="degree").subset_sizes	×
UNCOV 278	cat1 cat2 cat0	×
UNCOV 279	True False False 11	×
UNCOV 280	False True False 1	×
UNCOV 281	True True False 3	×
UNCOV 282	Name: size, dtype: int64	×
UNCOV 283	>>> query(data, min_degree=1, sort_by="cardinality").subset_sizes	×
UNCOV 284	cat1 cat2 cat0	×
UNCOV 285	True False False 11	×
UNCOV 286	True False 3	×
UNCOV 287	False True False 1	×
UNCOV 288	Name: size, dtype: int64	×
UNCOV 289	>>>	×
UNCOV 290	>>> # Getting each subset's data	×
UNCOV 291	>>> result = query(data)	×
UNCOV 292	>>> result.subsets[frozenset({"cat1", "cat2"})]	×
UNCOV 293	index value	×
UNCOV 294	cat1 cat2 cat0	×
UNCOV 295	False True False 3 1.333795	×
UNCOV 296	>>> result.subsets[frozenset({"cat1"})]	×
UNCOV 297	index value	×
UNCOV 298	cat1 cat2 cat0	×
UNCOV 299	False False False 5 0.918174	×
UNCOV 300	False 8 1.948521	×
UNCOV 301	False 9 1.086599	×
UNCOV 302	False 13 1.105696	×
UNCOV 303	False 19 1.339895	×
UNCOV 304	"""	×
305
306	data, agg = _aggregate_data(data, subset_size, sum_over)	2✔
307	data = _check_index(data)	2✔
308	totals = [agg[agg.index.get_level_values(name).values.astype(bool)].sum()	2✔
309	for name in agg.index.names]	2✔
310	totals = pd.Series(totals, index=agg.index.names)	2✔
311
312	if include_empty_subsets:	2✔
313	nlevels = len(agg.index.levels)	2✔
314	if nlevels > 10:	2✔
315	raise ValueError(
316	"include_empty_subsets is supported for at most 10 categories")
317	new_agg = pd.Series(0,	2✔
318	index=pd.MultiIndex.from_product(	2✔
319	[[False, True]] * nlevels,	2✔
320	names=agg.index.names),	2✔
321	dtype=agg.dtype,	2✔
322	name=agg.name)	2✔
323	new_agg.update(agg)	2✔
324	agg = new_agg	2✔
325
326	data, agg = _filter_subsets(data, agg,	2✔
327	min_subset_size=min_subset_size,	2✔
328	max_subset_size=max_subset_size,	2✔
329	min_degree=min_degree,	2✔
330	max_degree=max_degree,	2✔
331	present=present, absent=absent)	2✔
332
333	# sort:
334	if sort_categories_by in ('cardinality', '-cardinality'):	2✔
335	totals.sort_values(ascending=sort_categories_by[:1] == '-',	2✔
336	inplace=True)	2✔
337	elif sort_categories_by == '-input':	2✔
338	totals = totals[::-1]	2✔
339	elif sort_categories_by in (None, 'input'):	2✔
340	pass	2✔
UNCOV 341	else:	×
342	raise ValueError('Unknown sort_categories_by: %r' % sort_categories_by)	2✔
343	data = data.reorder_levels(totals.index.values)	2✔
344	agg = agg.reorder_levels(totals.index.values)	2✔
345
346	if sort_by in ('cardinality', '-cardinality'):	2✔
347	agg = agg.sort_values(ascending=sort_by[:1] == '-')	2✔
348	elif sort_by in ('degree', '-degree'):	2✔
349	index_tuples = sorted(agg.index,	2✔
350	key=lambda x: (sum(x),) + tuple(reversed(x)),	2✔
351	reverse=sort_by[:1] == '-')	2✔
352	agg = agg.reindex(pd.MultiIndex.from_tuples(index_tuples,	2✔
353	names=agg.index.names))	2✔
354	elif sort_by == '-input':	2✔
355	print("<", agg)	2✔
356	agg = agg[::-1]	2✔
357	print(">", agg)	2✔
358	elif sort_by in (None, 'input'):	2✔
359	pass	2✔
UNCOV 360	else:	×
361	raise ValueError('Unknown sort_by: %r' % sort_by)	2✔
362
363	return QueryResult(data=data, subset_sizes=agg, category_totals=totals)	2✔

jnothman / UpSetPlot / 7342943552

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous