• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jnothman / UpSetPlot / 7342943552

28 Dec 2023 12:13AM UTC coverage: 83.549% (-14.0%) from 97.551%
7342943552

push

github

web-flow
Fix warning due to styling dtypes, and fix column dtype test failure (#238)


Fixes #225

6 of 6 new or added lines in 2 files covered. (100.0%)

312 existing lines in 7 files now uncovered.

1681 of 2012 relevant lines covered (83.55%)

1.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

52.7
/upsetplot/reformat.py
1
from __future__ import print_function, division, absolute_import
2✔
2

3
try:
2✔
4
    import typing
2✔
5
except ImportError:
×
6
    import collections as typing
×
7

8
import numpy as np
2✔
9
import pandas as pd
2✔
10

11

12
def _aggregate_data(df, subset_size, sum_over):
    """Aggregate the input over its index levels to size each subset.

    Parameters
    ----------
    df : Series or DataFrame
        Input data. A Series is wrapped in a one-column DataFrame whose
        column is named ``'_value'``.
    subset_size : {'auto', 'count', 'sum'}
        How the size of each subset is determined.
    sum_over : str or None
        Name of the DataFrame column to sum. Must be None when `df` is a
        Series.

    Returns
    -------
    df : DataFrame
        full data frame
    aggregated : Series
        aggregates

    Raises
    ------
    ValueError
        If `subset_size` is not a recognised value, or if the combination
        of input type, `subset_size` and `sum_over` is unsupported.
    """
    _SUBSET_SIZE_VALUES = ['auto', 'count', 'sum']
    if subset_size not in _SUBSET_SIZE_VALUES:
        raise ValueError('subset_size should be one of %s. Got %r'
                         % (_SUBSET_SIZE_VALUES, subset_size))

    is_series = df.ndim == 1
    if is_series:
        # Series
        input_name = df.name
        df = pd.DataFrame({'_value': df})

        if subset_size == 'auto' and not df.index.is_unique:
            raise ValueError('subset_size="auto" cannot be used for a '
                             'Series with non-unique groups.')
        if sum_over is not None:
            raise ValueError('sum_over is not applicable when the input is a '
                             'Series')
        if subset_size == 'count':
            sum_over = False
        else:
            sum_over = '_value'
    else:
        # DataFrame
        if sum_over is False:
            raise ValueError('Unsupported value for sum_over: False')
        elif subset_size == 'auto' and sum_over is None:
            sum_over = False
        elif subset_size == 'count':
            if sum_over is not None:
                raise ValueError('sum_over cannot be set if subset_size=%r' %
                                 subset_size)
            sum_over = False
        elif subset_size == 'sum':
            if sum_over is None:
                raise ValueError('sum_over should be a field name if '
                                 'subset_size="sum" and a DataFrame is '
                                 'provided.')

    # From here on, sum_over is either False (count rows) or a column name.
    gb = df.groupby(level=list(range(df.index.nlevels)), sort=False)
    if sum_over is False:
        aggregated = gb.size()
        aggregated.name = 'size'
    elif hasattr(sum_over, 'lower'):
        aggregated = gb[sum_over].sum()
    else:
        raise ValueError('Unsupported value for sum_over: %r' % sum_over)

    # Restore the caller's Series name in place of the temporary '_value'.
    # Only applies to Series input: a DataFrame may legitimately own a
    # column called '_value', and input_name is not defined on that path.
    if is_series and aggregated.name == '_value':
        aggregated.name = input_name

    return df, aggregated
2✔
70

71

72
def _check_index(df):
    """Return a shallow copy of `df` whose index levels are cast to bool.

    Raises ValueError if any index level holds a value other than
    True/False (or something equal to them, such as 0/1).

    Note: reads ``df.index.levels``, so the input must have a MultiIndex.
    """
    allowed = {True, False}
    for level in df.index.levels:
        if not allowed >= set(level):
            raise ValueError('The DataFrame has values in its index that are '
                             'not boolean')

    result = df.copy(deep=False)
    old_index = result.index
    index_kwargs = dict(
        levels=[level.astype(bool) for level in old_index.levels],
        names=old_index.names,
    )
    # Newer pandas exposes the integer positions as ``codes``; older
    # releases called the same attribute ``labels``.
    if hasattr(old_index, 'codes'):
        index_kwargs['codes'] = old_index.codes
    else:
        index_kwargs['labels'] = old_index.labels
    result.index = pd.MultiIndex(**index_kwargs)
    return result
2✔
90

91

92
def _scalar_to_list(val):
    """Wrap a scalar in a one-element list; pass collections through.

    Strings count as scalars. Any other sequence, and any ``set``, is
    returned unchanged.
    """
    if isinstance(val, str):
        return [val]
    if isinstance(val, (typing.Sequence, set)):
        return val
    return [val]
2✔
96

97

98
def _get_subset_mask(agg, min_subset_size, max_subset_size,
                     min_degree, max_degree,
                     present, absent):
    """Get a mask over subsets based on size, degree or category presence

    Returns the literal ``True`` when no criterion is active; otherwise the
    element-wise conjunction of every active criterion over `agg`.
    """
    criteria = []

    # Size bounds compare against the aggregated values themselves.
    if min_subset_size is not None:
        criteria.append(agg >= min_subset_size)
    if max_subset_size is not None:
        criteria.append(agg <= max_subset_size)

    # Degree is the number of True levels in each index entry.
    min_degree_active = min_degree is not None and min_degree >= 0
    if min_degree_active or max_degree is not None:
        degree = agg.index.to_frame().sum(axis=1)
        if min_degree is not None:
            criteria.append(degree >= min_degree)
        if max_degree is not None:
            criteria.append(degree <= max_degree)

    # Categories that must be members of the subset.
    if present is not None:
        for col in _scalar_to_list(present):
            criteria.append(agg.index.get_level_values(col).values)

    # Categories that must not be members of the subset.
    if absent is not None:
        for col in _scalar_to_list(absent):
            criteria.append(
                np.logical_not(agg.index.get_level_values(col).values))

    subset_mask = True
    for criterion in criteria:
        subset_mask = np.logical_and(subset_mask, criterion)
    return subset_mask
2✔
124

125

126
def _filter_subsets(df, agg,
                    min_subset_size, max_subset_size,
                    min_degree, max_degree,
                    present, absent):
    """Drop the subsets (and their rows) that fail the given criteria.

    Returns the ``(df, agg)`` pair, untouched when no criterion is active.
    """
    keep = _get_subset_mask(
        agg,
        min_subset_size=min_subset_size,
        max_subset_size=max_subset_size,
        min_degree=min_degree,
        max_degree=max_degree,
        present=present,
        absent=absent,
    )

    # A literal True means nothing was filtered.
    if keep is not True:
        agg = agg[keep]
        # Keep only the rows whose subset survived the filter.
        rows_kept = df.index.isin(agg.index)
        df = df[rows_kept]
    return df, agg
2✔
143

144

145
class QueryResult:
    """Container for reformatted data and aggregates

    Attributes
    ----------
    data : DataFrame
        Selected samples. The index is a MultiIndex with one boolean level for
        each category.
    subsets : dict[frozenset, DataFrame]
        Dataframes for each intersection of categories. Each key is the
        frozenset of category names whose index level is True for that
        group of rows.
    subset_sizes : Series
        Total size of each selected subset as a series. The index is as
        for `data`.
    category_totals : Series
        Total size of each category, regardless of selection.
    """
    def __init__(self, data, subset_sizes, category_totals):
        self.data = data
        self.subset_sizes = subset_sizes
        self.category_totals = category_totals

    def __repr__(self):
        return ("QueryResult(data={data}, subset_sizes={subset_sizes}, "
                "category_totals={category_totals})".format(**vars(self)))

    @property
    def subsets(self):
        """Map each combination of present categories to its rows of `data`."""
        categories = list(self.data.index.names)
        grouped = self.data.groupby(level=list(range(len(categories))),
                                    sort=False)
        result = {}
        for key, subset_data in grouped:
            # The group key holds one boolean per category. Use it as a
            # membership mask; treating it as positions would index 0/1.
            # atleast_1d covers a scalar key for a single-level group.
            membership = np.atleast_1d(np.asarray(key, dtype=bool))
            present = frozenset(
                name for name, is_member in zip(categories, membership)
                if is_member)
            result[present] = subset_data
        return result
×
179

180

181
def query(data, present=None, absent=None,
          min_subset_size=None, max_subset_size=None,
          min_degree=None, max_degree=None,
          sort_by='degree', sort_categories_by='cardinality',
          subset_size='auto', sum_over=None, include_empty_subsets=False):
    """Transform and filter a categorised dataset

    Retrieve the set of items and totals corresponding to subsets of interest.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Elements associated with categories (a DataFrame), or the size of each
        subset of categories (a Series).
        Should have MultiIndex where each level is binary,
        corresponding to category membership.
        If a DataFrame, `sum_over` must be a string or None.
    present : str or list of str, optional
        Category or categories that must be present in subsets to be
        reported.
    absent : str or list of str, optional
        Category or categories that must not be present in subsets to be
        reported.
    min_subset_size : int, optional
        Minimum size of a subset to be reported. All subsets with
        a size smaller than this threshold will be omitted from
        subset_sizes and data.
        Size may be a sum of values, see `subset_size`.
    max_subset_size : int, optional
        Maximum size of a subset to be reported.
    min_degree : int, optional
        Minimum degree of a subset to be reported.
    max_degree : int, optional
        Maximum degree of a subset to be reported.
    sort_by : {'cardinality', 'degree', '-cardinality', '-degree',
               'input', '-input'}
        If 'cardinality', subsets are listed from largest to smallest.
        If 'degree', they are listed in order of the number of categories
        intersected. If 'input', the order they appear in the data input is
        used.
        Prefix with '-' to reverse the ordering.

        Note this affects ``subset_sizes`` but not ``data``.
    sort_categories_by : {'cardinality', '-cardinality', 'input', '-input'}
        Whether to sort the categories by total cardinality, or leave them
        in the input data's provided order (order of index levels).
        Prefix with '-' to reverse the ordering.
    subset_size : {'auto', 'count', 'sum'}
        Configures how to calculate the size of a subset. Choices are:

        'auto' (default)
            If `data` is a DataFrame, count the number of rows in each group,
            unless `sum_over` is specified.
            If `data` is a Series with at most one row for each group, use
            the value of the Series. If `data` is a Series with more than one
            row per group, raise a ValueError.
        'count'
            Count the number of rows in each group.
        'sum'
            Sum the value of the `data` Series, or the DataFrame field
            specified by `sum_over`.
    sum_over : str or None
        If `subset_size='sum'` or `'auto'`, then the intersection size is the
        sum of the specified field in the `data` DataFrame. If a Series, only
        None is supported and its value is summed.
    include_empty_subsets : bool (default=False)
        If True, all possible category combinations will be returned in
        subset_sizes, even when some are not present in data.

    Returns
    -------
    QueryResult
        Including filtered ``data``, filtered and sorted ``subset_sizes`` and
        overall ``category_totals``.

    Raises
    ------
    ValueError
        If `sort_by` or `sort_categories_by` is not a recognised value, or
        if `include_empty_subsets` is requested for more than 10 categories.

    Examples
    --------
    >>> from upsetplot import query, generate_samples
    >>> data = generate_samples(n_samples=20)
    >>> result = query(data, present="cat1", max_subset_size=4)
    >>> result.category_totals
    cat1    14
    cat2     4
    cat0     0
    dtype: int64
    >>> result.subset_sizes
    cat1  cat2  cat0
    True  True  False    3
    Name: size, dtype: int64
    >>> result.data
                     index     value
    cat1 cat2 cat0
    True True False      0  2.04...
              False      2  2.05...
              False     10  2.55...
    >>>
    >>> # Sorting:
    >>> query(data, min_degree=1, sort_by="degree").subset_sizes
    cat1   cat2   cat0
    True   False  False    11
    False  True   False     1
    True   True   False     3
    Name: size, dtype: int64
    >>> query(data, min_degree=1, sort_by="cardinality").subset_sizes
    cat1   cat2   cat0
    True   False  False    11
           True   False     3
    False  True   False     1
    Name: size, dtype: int64
    >>>
    >>> # Getting each subset's data
    >>> result = query(data)
    >>> result.subsets[frozenset({"cat1", "cat2"})]
                index     value
    cat1  cat2 cat0
    False True False      3  1.333795
    >>> result.subsets[frozenset({"cat1"})]
                        index     value
    cat1  cat2  cat0
    False False False      5  0.918174
                    False      8  1.948521
                    False      9  1.086599
                    False     13  1.105696
                    False     19  1.339895
    """

    data, agg = _aggregate_data(data, subset_size, sum_over)
    data = _check_index(data)
    # Category totals are taken before any filtering, so they describe the
    # whole input rather than the selected subsets.
    totals = [agg[agg.index.get_level_values(name).values.astype(bool)].sum()
              for name in agg.index.names]
    totals = pd.Series(totals, index=agg.index.names)

    if include_empty_subsets:
        nlevels = len(agg.index.levels)
        # The full product has 2 ** nlevels entries; cap it to stay small.
        if nlevels > 10:
            raise ValueError(
                "include_empty_subsets is supported for at most 10 categories")
        new_agg = pd.Series(0,
                            index=pd.MultiIndex.from_product(
                                [[False, True]] * nlevels,
                                names=agg.index.names),
                            dtype=agg.dtype,
                            name=agg.name)
        new_agg.update(agg)
        agg = new_agg

    data, agg = _filter_subsets(data, agg,
                                min_subset_size=min_subset_size,
                                max_subset_size=max_subset_size,
                                min_degree=min_degree,
                                max_degree=max_degree,
                                present=present, absent=absent)

    # sort:
    if sort_categories_by in ('cardinality', '-cardinality'):
        totals.sort_values(ascending=sort_categories_by[:1] == '-',
                           inplace=True)
    elif sort_categories_by == '-input':
        totals = totals[::-1]
    elif sort_categories_by in (None, 'input'):
        pass
    else:
        raise ValueError('Unknown sort_categories_by: %r' % sort_categories_by)
    data = data.reorder_levels(totals.index.values)
    agg = agg.reorder_levels(totals.index.values)

    if sort_by in ('cardinality', '-cardinality'):
        agg = agg.sort_values(ascending=sort_by[:1] == '-')
    elif sort_by in ('degree', '-degree'):
        # Order by number of member categories, then by reversed membership.
        index_tuples = sorted(agg.index,
                              key=lambda x: (sum(x),) + tuple(reversed(x)),
                              reverse=sort_by[:1] == '-')
        agg = agg.reindex(pd.MultiIndex.from_tuples(index_tuples,
                                                    names=agg.index.names))
    elif sort_by == '-input':
        agg = agg[::-1]
    elif sort_by in (None, 'input'):
        pass
    else:
        raise ValueError('Unknown sort_by: %r' % sort_by)

    return QueryResult(data=data, subset_sizes=agg, category_totals=totals)
2✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc