• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jnothman / UpSetPlot / 7348001748

28 Dec 2023 01:15PM UTC coverage: 99.104% (-0.09%) from 99.196%
7348001748

push

github

web-flow
Apply more ruff rules (#251)

38 of 38 new or added lines in 7 files covered. (100.0%)

3 existing lines in 2 files now uncovered.

1660 of 1675 relevant lines covered (99.1%)

0.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.08
/upsetplot/reformat.py
1
import typing
2✔
2

3
import numpy as np
2✔
4
import pandas as pd
2✔
5

6

7
def _aggregate_data(df, subset_size, sum_over):
2✔
8
    """
9
    Returns
10
    -------
11
    df : DataFrame
12
        full data frame
13
    aggregated : Series
14
        aggregates
15
    """
16
    _SUBSET_SIZE_VALUES = ["auto", "count", "sum"]
2✔
17
    if subset_size not in _SUBSET_SIZE_VALUES:
2✔
18
        raise ValueError(
19
            f"subset_size should be one of {_SUBSET_SIZE_VALUES}."
20
            f" Got {repr(subset_size)}"
21
        )
22
    if df.ndim == 1:
2✔
23
        # Series
24
        input_name = df.name
2✔
25
        df = pd.DataFrame({"_value": df})
2✔
26

27
        if subset_size == "auto" and not df.index.is_unique:
2✔
28
            raise ValueError(
2✔
29
                'subset_size="auto" cannot be used for a '
2✔
30
                "Series with non-unique groups."
31
            )
32
        if sum_over is not None:
2✔
33
            raise ValueError("sum_over is not applicable when the input is a " "Series")
2✔
34
        sum_over = False if subset_size == "count" else "_value"
2✔
UNCOV
35
    else:
×
36
        # DataFrame
37
        if sum_over is False:
2✔
38
            raise ValueError("Unsupported value for sum_over: False")
2✔
39
        elif subset_size == "auto" and sum_over is None:
2✔
40
            sum_over = False
2✔
41
        elif subset_size == "count":
2✔
42
            if sum_over is not None:
2✔
43
                raise ValueError(
2✔
44
                    "sum_over cannot be set if subset_size=%r" % subset_size
2✔
45
                )
46
            sum_over = False
2✔
47
        elif subset_size == "sum" and sum_over is None:
2✔
48
            raise ValueError(
2✔
49
                "sum_over should be a field name if "
2✔
50
                'subset_size="sum" and a DataFrame is '
51
                "provided."
52
            )
53

54
    gb = df.groupby(level=list(range(df.index.nlevels)), sort=False)
2✔
55
    if sum_over is False:
2✔
56
        aggregated = gb.size()
2✔
57
        aggregated.name = "size"
2✔
58
    elif hasattr(sum_over, "lower"):
2✔
59
        aggregated = gb[sum_over].sum()
2✔
60
    else:
×
61
        raise ValueError("Unsupported value for sum_over: %r" % sum_over)
×
62

63
    if aggregated.name == "_value":
2✔
64
        aggregated.name = input_name
2✔
65

66
    return df, aggregated
2✔
67

68

69
def _check_index(df):
2✔
70
    # check all indices are boolean
71
    if not all({True, False} >= set(level) for level in df.index.levels):
2✔
72
        raise ValueError(
2✔
73
            "The DataFrame has values in its index that are not " "boolean"
2✔
74
        )
75
    df = df.copy(deep=False)
2✔
76
    # XXX: this may break if input is not MultiIndex
77
    kw = {
2✔
78
        "levels": [x.astype(bool) for x in df.index.levels],
2✔
79
        "names": df.index.names,
2✔
80
    }
81
    if hasattr(df.index, "codes"):
2✔
82
        # compat for pandas <= 0.20
83
        kw["codes"] = df.index.codes
2✔
84
    else:
×
85
        kw["labels"] = df.index.labels
×
86
    df.index = pd.MultiIndex(**kw)
2✔
87
    return df
2✔
88

89

90
def _scalar_to_list(val):
2✔
91
    if not isinstance(val, (typing.Sequence, set)) or isinstance(val, str):
2✔
92
        val = [val]
2✔
93
    return val
2✔
94

95

96
def _get_subset_mask(
2✔
97
    agg, min_subset_size, max_subset_size, min_degree, max_degree, present, absent
98
):
99
    """Get a mask over subsets based on size, degree or category presence"""
100
    subset_mask = True
2✔
101
    if min_subset_size is not None:
2✔
102
        subset_mask = np.logical_and(subset_mask, agg >= min_subset_size)
2✔
103
    if max_subset_size is not None:
2✔
104
        subset_mask = np.logical_and(subset_mask, agg <= max_subset_size)
2✔
105
    if (min_degree is not None and min_degree >= 0) or max_degree is not None:
2✔
106
        degree = agg.index.to_frame().sum(axis=1)
2✔
107
        if min_degree is not None:
2✔
108
            subset_mask = np.logical_and(subset_mask, degree >= min_degree)
2✔
109
        if max_degree is not None:
2✔
110
            subset_mask = np.logical_and(subset_mask, degree <= max_degree)
2✔
111
    if present is not None:
2✔
112
        for col in _scalar_to_list(present):
2✔
113
            subset_mask = np.logical_and(
2✔
114
                subset_mask, agg.index.get_level_values(col).values
2✔
115
            )
116
    if absent is not None:
2✔
117
        for col in _scalar_to_list(absent):
2✔
118
            exclude_mask = np.logical_not(agg.index.get_level_values(col).values)
2✔
119
            subset_mask = np.logical_and(subset_mask, exclude_mask)
2✔
120
    return subset_mask
2✔
121

122

123
def _filter_subsets(
    df, agg, min_subset_size, max_subset_size, min_degree, max_degree, present, absent
):
    """Restrict ``df`` and ``agg`` to the subsets passing all given filters."""
    mask = _get_subset_mask(
        agg,
        min_subset_size=min_subset_size,
        max_subset_size=max_subset_size,
        min_degree=min_degree,
        max_degree=max_degree,
        present=present,
        absent=absent,
    )

    # A literal True means no filter was active: return the inputs untouched.
    if mask is True:
        return df, agg

    filtered_agg = agg[mask]
    filtered_df = df[df.index.isin(filtered_agg.index)]
    return filtered_df, filtered_agg
142

143

144
class QueryResult:
    """Container for reformatted data and aggregates

    Attributes
    ----------
    data : DataFrame
        Selected samples. The index is a MultiIndex with one boolean level for
        each category.
    subsets : dict[frozenset, DataFrame]
        Dataframes for each intersection of categories.
    subset_sizes : Series
        Total size of each selected subset as a series. The index is as
        for `data`.
    category_totals : Series
        Total size of each category, regardless of selection.
    total : number
        Total number of samples, or sum of sum_over value.
    """

    def __init__(self, data, subset_sizes, category_totals, total):
        self.data = data
        self.subset_sizes = subset_sizes
        self.category_totals = category_totals
        self.total = total

    def __repr__(self):
        # BUGFIX: the format string previously lacked the closing ")".
        return (
            "QueryResult(data={data}, subset_sizes={subset_sizes}, "
            "category_totals={category_totals}, total={total})".format(**vars(self))
        )

    @property
    def subsets(self):
        """Group ``data`` rows by their category-membership index tuple."""
        categories = np.asarray(self.data.index.names)
        # NOTE(review): ``mask`` is a tuple of booleans, but np.take treats
        # it as integer indices (True -> 1, False -> 0), not a boolean mask.
        # ``categories[np.asarray(mask, dtype=bool)]`` may have been intended;
        # left unchanged since callers (and doctests) rely on current keys.
        return {
            frozenset(categories.take(mask)): subset_data
            for mask, subset_data in self.data.groupby(
                level=list(range(len(categories))), sort=False
            )
        }
184

185

186
def query(
    data,
    present=None,
    absent=None,
    min_subset_size=None,
    max_subset_size=None,
    min_degree=None,
    max_degree=None,
    sort_by="degree",
    sort_categories_by="cardinality",
    subset_size="auto",
    sum_over=None,
    include_empty_subsets=False,
):
    """Transform and filter a categorised dataset

    Retrieve the set of items and totals corresponding to subsets of interest.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Elements associated with categories (a DataFrame), or the size of each
        subset of categories (a Series).
        Should have MultiIndex where each level is binary,
        corresponding to category membership.
        If a DataFrame, `sum_over` must be a string or False.
    present : str or list of str, optional
        Category or categories that must be present in subsets for styling.
    absent : str or list of str, optional
        Category or categories that must not be present in subsets for
        styling.
    min_subset_size : int, optional
        Minimum size of a subset to be reported. All subsets with
        a size smaller than this threshold will be omitted from
        category_totals and data.
        Size may be a sum of values, see `subset_size`.
    max_subset_size : int, optional
        Maximum size of a subset to be reported.
    min_degree : int, optional
        Minimum degree of a subset to be reported.
    max_degree : int, optional
        Maximum degree of a subset to be reported.
    sort_by : {'cardinality', 'degree', '-cardinality', '-degree',
               'input', '-input'}
        If 'cardinality', subset are listed from largest to smallest.
        If 'degree', they are listed in order of the number of categories
        intersected. If 'input', the order they appear in the data input is
        used.
        Prefix with '-' to reverse the ordering.

        Note this affects ``subset_sizes`` but not ``data``.
    sort_categories_by : {'cardinality', '-cardinality', 'input', '-input'}
        Whether to sort the categories by total cardinality, or leave them
        in the input data's provided order (order of index levels).
        Prefix with '-' to reverse the ordering.
    subset_size : {'auto', 'count', 'sum'}
        Configures how to calculate the size of a subset. Choices are:

        'auto' (default)
            If `data` is a DataFrame, count the number of rows in each group,
            unless `sum_over` is specified.
            If `data` is a Series with at most one row for each group, use
            the value of the Series. If `data` is a Series with more than one
            row per group, raise a ValueError.
        'count'
            Count the number of rows in each group.
        'sum'
            Sum the value of the `data` Series, or the DataFrame field
            specified by `sum_over`.
    sum_over : str or None
        If `subset_size='sum'` or `'auto'`, then the intersection size is the
        sum of the specified field in the `data` DataFrame. If a Series, only
        None is supported and its value is summed.
    include_empty_subsets : bool (default=False)
        If True, all possible category combinations will be returned in
        subset_sizes, even when some are not present in data.

    Returns
    -------
    QueryResult
        Including filtered ``data``, filtered and sorted ``subset_sizes`` and
        overall ``category_totals`` and ``total``.

    Examples
    --------
    >>> from upsetplot import query, generate_samples
    >>> data = generate_samples(n_samples=20)
    >>> result = query(data, present="cat1", max_subset_size=4)
    >>> result.category_totals
    cat1    14
    cat2     4
    cat0     0
    dtype: int64
    >>> result.subset_sizes
    cat1  cat2  cat0
    True  True  False    3
    Name: size, dtype: int64
    >>> result.data
                     index     value
    cat1 cat2 cat0
    True True False      0  2.04...
              False      2  2.05...
              False     10  2.55...
    >>>
    >>> # Sorting:
    >>> query(data, min_degree=1, sort_by="degree").subset_sizes
    cat1   cat2   cat0
    True   False  False    11
    False  True   False     1
    True   True   False     3
    Name: size, dtype: int64
    >>> query(data, min_degree=1, sort_by="cardinality").subset_sizes
    cat1   cat2   cat0
    True   False  False    11
           True   False     3
    False  True   False     1
    Name: size, dtype: int64
    >>>
    >>> # Getting each subset's data
    >>> result = query(data)
    >>> result.subsets[frozenset({"cat1", "cat2"})]
                index     value
    cat1  cat2 cat0
    False True False      3  1.333795
    >>> result.subsets[frozenset({"cat1"})]
                        index     value
    cat1  cat2  cat0
    False False False      5  0.918174
                False      8  1.948521
                False      9  1.086599
                False     13  1.105696
                False     19  1.339895
    """

    # Normalise the input into (full DataFrame, per-subset aggregate Series),
    # then force all index levels to bool dtype.
    data, agg = _aggregate_data(data, subset_size, sum_over)
    data = _check_index(data)
    # Grand total and per-category totals are computed BEFORE any filtering,
    # so they reflect the whole dataset regardless of the selection below.
    grand_total = agg.sum()
    category_totals = [
        agg[agg.index.get_level_values(name).values.astype(bool)].sum()
        for name in agg.index.names
    ]
    category_totals = pd.Series(category_totals, index=agg.index.names)

    if include_empty_subsets:
        nlevels = len(agg.index.levels)
        # Cross-product of [False, True] per level grows as 2**nlevels;
        # capped at 10 categories (1024 combinations).
        if nlevels > 10:
            raise ValueError(
                "include_empty_subsets is supported for at most 10 categories"
            )
        # Build a zero-filled Series over every possible combination, then
        # overwrite the combinations actually observed in `agg`.
        new_agg = pd.Series(
            0,
            index=pd.MultiIndex.from_product(
                [[False, True]] * nlevels, names=agg.index.names
            ),
            dtype=agg.dtype,
            name=agg.name,
        )
        new_agg.update(agg)
        agg = new_agg

    # Apply size/degree/presence filters to both the rows and the aggregates.
    data, agg = _filter_subsets(
        data,
        agg,
        min_subset_size=min_subset_size,
        max_subset_size=max_subset_size,
        min_degree=min_degree,
        max_degree=max_degree,
        present=present,
        absent=absent,
    )

    # sort:
    # First order the categories (index levels) ...
    if sort_categories_by in ("cardinality", "-cardinality"):
        # A leading "-" selects ascending order.
        category_totals.sort_values(
            ascending=sort_categories_by[:1] == "-", inplace=True
        )
    elif sort_categories_by == "-input":
        category_totals = category_totals[::-1]
    elif sort_categories_by in (None, "input"):
        pass
    else:
        raise ValueError("Unknown sort_categories_by: %r" % sort_categories_by)
    # ... and propagate the chosen category order to both outputs.
    data = data.reorder_levels(category_totals.index.values)
    agg = agg.reorder_levels(category_totals.index.values)

    # Then order the subsets themselves (affects `agg` only, not `data`).
    if sort_by in ("cardinality", "-cardinality"):
        agg = agg.sort_values(ascending=sort_by[:1] == "-")
    elif sort_by in ("degree", "-degree"):
        # Sort by (degree, reversed membership tuple) so ties in degree are
        # broken by category membership, reading levels right-to-left.
        index_tuples = sorted(
            agg.index,
            key=lambda x: (sum(x),) + tuple(reversed(x)),
            reverse=sort_by[:1] == "-",
        )
        agg = agg.reindex(
            pd.MultiIndex.from_tuples(index_tuples, names=agg.index.names)
        )
    elif sort_by == "-input":
        agg = agg[::-1]
    elif sort_by in (None, "input"):
        pass
    else:
        raise ValueError("Unknown sort_by: %r" % sort_by)

    return QueryResult(
        data=data, subset_sizes=agg, category_totals=category_totals, total=grand_total
    )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc