• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jnothman / UpSetPlot / 7344254321

28 Dec 2023 03:58AM UTC coverage: 98.586% (+15.0%) from 83.549%
7344254321

push

github

web-flow
Format with black/ruff (#240)

844 of 848 new or added lines in 8 files covered. (99.53%)

4 existing lines in 3 files now uncovered.

1534 of 1556 relevant lines covered (98.59%)

0.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.67
/upsetplot/data.py
1
from __future__ import print_function, division, absolute_import
2✔
2
from numbers import Number
2✔
3
import functools
2✔
4
from distutils.version import LooseVersion
2✔
5
import warnings
2✔
6

2✔
7
import pandas as pd
2✔
8
import numpy as np
2✔
9

10

11
def generate_samples(seed=0, n_samples=10000, n_categories=3):
    """Create a synthetic table of samples spread over set intersections

    Parameters
    ----------
    seed : int
        Seed passed to the random number generator
    n_samples : int
        How many samples (rows) to produce
    n_categories : int
        How many categories to produce; they are named "cat0", "cat1", ...

    Returns
    -------
    DataFrame
        Column 'value' holds a weight or score for each element.
        Column 'index' holds a unique id for each element.
        The index consists of one boolean membership level per category.

        Note: Further fields may be added in future versions.

    See Also
    --------
    generate_counts : Generates the counts for each subset of categories
        corresponding to these samples.
    """
    random_state = np.random.RandomState(seed)
    category_names = ["cat%d" % k for k in range(n_categories)]

    samples = pd.DataFrame({"value": np.zeros(n_samples)})
    for name in category_names:
        # Draw order matters for reproducibility: per-sample draws first,
        # then the single threshold for this category.
        draws = random_state.rand(n_samples)
        threshold = random_state.rand()
        samples[name] = draws > threshold
        samples["value"] = samples["value"] + draws

    # Expose the original row number as the 'index' column before the
    # category masks take over as the index.
    samples = samples.reset_index()
    return samples.set_index(category_names)
2✔
47

48

49
def generate_counts(seed=0, n_samples=10000, n_categories=3):
    """Create synthetic counts for each intersection of categories

    Parameters
    ----------
    seed : int
        Seed passed to the random number generator
    n_samples : int
        Number of samples the counts are computed over
    n_categories : int
        How many categories to produce; they are named "cat0", "cat1", ...

    Returns
    -------
    Series
        Number of samples per combination of boolean category indicators,
        indexed by those indicators.

    See Also
    --------
    generate_samples : Generates a DataFrame of samples that these counts are
        derived from.
    """
    samples = generate_samples(
        seed=seed, n_samples=n_samples, n_categories=n_categories
    )
    # Group on every index level, i.e. on the full membership pattern.
    all_levels = list(range(n_categories))
    return samples["value"].groupby(level=all_levels).count()
2✔
73

74

75
def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False):
    """Deprecated alias for `generate_counts` / `generate_samples`

    Always emits a ``DeprecationWarning``.

    Parameters
    ----------
    seed : int
        A seed for randomisation
    n_samples : int
        Number of samples to generate
    n_sets : int
        Number of categories to generate (passed on as ``n_categories``)
    aggregated : bool
        If true, return the result of `generate_counts`; otherwise return
        the 'value' column of the result of `generate_samples`.

    Returns
    -------
    Series
    """
    warnings.warn(
        "generate_data was replaced by generate_counts in version "
        "0.3 and will be removed in version 0.4.",
        DeprecationWarning,
        # Attribute the warning to the caller so that default warning
        # filters surface it where the deprecated call is made.
        stacklevel=2,
    )
    if aggregated:
        return generate_counts(seed=seed, n_samples=n_samples, n_categories=n_sets)
    samples = generate_samples(seed=seed, n_samples=n_samples, n_categories=n_sets)
    return samples["value"]
87

88

89
def from_indicators(indicators, data=None):
    """Load category membership indicated by a boolean indicator matrix

    This loader also supports the case where the indicator columns can be
    derived from `data`.

    .. versionadded:: 0.6

    Parameters
    ----------
    indicators : DataFrame-like of booleans, Sequence of str, or callable
        Specifies the category indicators (boolean mask arrays) within
        ``data``, i.e. which records in ``data`` belong to which categories.

        If a list of strings, these should be column names found in ``data``
        whose values are boolean mask arrays.

        If a DataFrame, its columns should correspond to categories, and its
        index should be a subset of those in ``data``, values should be True
        where a data record is in that category, and False or NA otherwise.

        If callable, it will be applied to ``data`` after the latter is
        converted to a Series or DataFrame.

    data : Series-like or DataFrame-like, optional
        If given, the index of category membership is attached to this data.
        It must have the same length as `indicators`.
        If not given, the series will contain the value 1.

    Returns
    -------
    DataFrame or Series
        `data` is returned with its index indicating category membership.
        It will be a Series if `data` is a Series or 1d numeric array or None.

    Raises
    ------
    ValueError
        If ``data`` is required but missing, if ``indicators`` is a tuple of
        column names, if any indicator is not boolean, if no category has
        any member, or if an explicit ``indicators`` index contains values
        not found in ``data.index``.

    Notes
    -----
    Categories with indicators that are all False will be removed.

    Examples
    --------
    >>> import pandas as pd
    >>> from upsetplot import from_indicators
    >>>
    >>> # Just indicators:
    >>> indicators = {"cat1": [True, False, True, False],
    ...               "cat2": [False, True, False, False],
    ...               "cat3": [True, True, False, False]}
    >>> from_indicators(indicators)
    cat1   cat2   cat3
    True   False  True     1.0
    False  True   True     1.0
    True   False  False    1.0
    False  False  False    1.0
    Name: ones, dtype: float64
    >>>
    >>> # Where indicators are included within data, specifying
    >>> # columns by name:
    >>> data = pd.DataFrame({"value": [5, 4, 6, 4], **indicators})
    >>> from_indicators(["cat1", "cat3"], data=data)
                 value   cat1   cat2   cat3
    cat1  cat3
    True  True       5   True  False   True
    False True       4  False   True   True
    True  False      6   True  False  False
    False False      4  False  False  False
    >>>
    >>> # Making indicators out of all boolean columns:
    >>> from_indicators(lambda data: data.select_dtypes(bool), data=data)
                       value   cat1   cat2   cat3
    cat1  cat2  cat3
    True  False True       5   True  False   True
    False True  True       4  False   True   True
    True  False False      6   True  False  False
    False False False      4  False  False  False
    >>>
    >>> # Using a dataset with missing data, we can use missingness as
    >>> # an indicator:
    >>> data = pd.DataFrame({"val1": [pd.NA, .7, pd.NA, .9],
    ...                      "val2": ["male", pd.NA, "female", "female"],
    ...                      "val3": [pd.NA, pd.NA, 23000, 78000]})
    >>> from_indicators(pd.isna, data=data)
                       val1    val2   val3
    val1  val2  val3
    True  False True   <NA>    male   <NA>
    False True  True    0.7    <NA>   <NA>
    True  False False  <NA>  female  23000
    False False False   0.9  female  78000
    """
    if data is not None:
        data = _convert_to_pandas(data)

    if callable(indicators):
        if data is None:
            raise ValueError("data must be provided when indicators is callable")
        indicators = indicators(data)

    # Probe the first element to detect a sequence of column names. Many
    # inputs (dicts, DataFrames without a column 0, empty sequences, ...)
    # legitimately fail this probe, so any failure just means "not a list
    # of column names".
    try:
        first = indicators[0]
    except Exception:
        first = None
    if isinstance(first, (str, int)):
        if data is None:
            raise ValueError(
                "data must be provided when indicators are "
                "specified as a list of columns"
            )
        if isinstance(indicators, tuple):
            raise ValueError("indicators as tuple is not supported")
        # column array
        indicators = data[indicators]

    indicators = pd.DataFrame(indicators).fillna(False).infer_objects()
    # drop all-False (should we be dropping all-True also? making an option?)
    indicators = indicators.loc[:, indicators.any(axis=0)]

    if not all(dtype.kind == "b" for dtype in indicators.dtypes):
        raise ValueError("The indicators must all be boolean")

    if indicators.shape[1] == 0:
        # Fail with a clear message instead of an opaque pandas error when
        # building an index from zero columns.
        raise ValueError("Require at least one category. None were found.")

    if data is None:
        data = pd.Series(np.ones(len(indicators)), name="ones")
    else:
        # A default index means rows correspond to ``data`` positionally.
        # Comparing against the full expected range also handles empty
        # frames and stepped ranges that merely start at 0.
        has_default_index = isinstance(
            indicators.index, pd.RangeIndex
        ) and indicators.index.equals(pd.RangeIndex(len(data)))
        if not has_default_index:
            # index is specified on indicators. Need to align it to data
            if not indicators.index.isin(data.index).all():
                raise ValueError(
                    "If indicators.index is not the default, "
                    "all its values must be present in "
                    "data.index"
                )
            indicators = indicators.reindex(index=data.index, fill_value=False)

    # Turn the indicator columns into the index and attach it to the data.
    data.index = indicators.set_index(list(indicators.columns)).index

    return data
2✔
230

231

232
def _convert_to_pandas(data, copy=True):
    """Coerce ``data`` to a pandas Series or DataFrame

    Objects that expose a ``loc`` attribute are treated as pandas-like: they
    are shallow-copied when ``copy`` is true and become a Series when
    one-dimensional. Other non-empty inputs become a Series when their
    element at key/position 0 is a number. Everything else becomes a
    DataFrame.
    """
    if hasattr(data, "loc"):
        if copy:
            data = data.copy(deep=False)
        one_dimensional = data.ndim == 1
    elif len(data):
        try:
            one_dimensional = isinstance(data[0], Number)
        except KeyError:
            # e.g. a mapping without key 0
            one_dimensional = False
    else:
        one_dimensional = False

    return pd.Series(data) if one_dimensional else pd.DataFrame(data)
2✔
248

249

250
def from_memberships(memberships, data=None):
    """Build category-indexed data from per-sample lists of category names

    The result is suitable for passing to `UpSet` or `plot`.

    Parameters
    ----------
    memberships : sequence of collections of strings
        One entry per data point, listing the categories (named by strings)
        that the data point belongs to.
    data : Series-like or DataFrame-like, optional
        If given, the index of category memberships is attached to this data.
        It must have the same length as `memberships`.
        If not given, the series will contain the value 1.

    Returns
    -------
    DataFrame or Series
        `data` is returned with its index indicating category membership.
        It will be a Series if `data` is a Series or 1d numeric array.
        The index will have levels ordered by category names.

    Examples
    --------
    >>> from upsetplot import from_memberships
    >>> from_memberships([
    ...     ['cat1', 'cat3'],
    ...     ['cat2', 'cat3'],
    ...     ['cat1'],
    ...     []
    ... ])
    cat1   cat2   cat3
    True   False  True     1
    False  True   True     1
    True   False  False    1
    False  False  False    1
    Name: ones, dtype: ...
    >>> # now with data:
    >>> import numpy as np
    >>> from_memberships([
    ...     ['cat1', 'cat3'],
    ...     ['cat2', 'cat3'],
    ...     ['cat1'],
    ...     []
    ... ], data=np.arange(12).reshape(4, 3))
                       0   1   2
    cat1  cat2  cat3
    True  False True   0   1   2
    False True  True   3   4   5
    True  False False  6   7   8
    False False False  9  10  11
    """
    # One row per sample; a cell is True where the sample names the category
    # and missing otherwise.
    membership_rows = [dict.fromkeys(names, True) for names in memberships]
    df = pd.DataFrame(membership_rows)

    if any(not hasattr(category, "lower") for category in df.columns):
        raise ValueError("Category names should be strings")
    if df.shape[1] == 0:
        raise ValueError("Require at least one category. None were found.")

    # Order categories by name, then turn missing cells into False.
    df = df.sort_index(axis=1).fillna(False).astype(bool)
    df = df.set_index(list(df.columns))

    if data is None:
        return df.assign(ones=1)["ones"]

    data = _convert_to_pandas(data)
    if len(data) != len(df):
        raise ValueError(
            "memberships and data must have the same length. "
            "Got len(memberships) == %d, len(data) == %d"
            % (len(memberships), len(data))
        )
    data.index = df.index
    return data
2✔
324

325

326
def from_contents(contents, data=None, id_column="id"):
    """Build data from category listings

    Parameters
    ----------
    contents : Mapping (or iterable over pairs) of strings to sets
        Keys are category names, values are sets of identifiers (int or
        string).
    data : DataFrame, optional
        If provided, this should be indexed by the identifiers used in
        `contents`.
    id_column : str, default='id'
        The column name to use for the identifiers in the output.

    Returns
    -------
    DataFrame
        `data` is returned with its index indicating category membership,
        including a column named according to id_column.
        If data is not given, the order of rows is not assured.

    Raises
    ------
    ValueError
        If a category lists the same identifier twice, if a category or a
        column of ``data`` is named ``id_column``, if category names overlap
        with the columns of ``data``, or if ``contents`` refers to
        identifiers that are not in ``data.index``.

    Notes
    -----
    The order of categories in the output DataFrame is determined from
    `contents`, which may have non-deterministic iteration order.

    Examples
    --------
    >>> from upsetplot import from_contents
    >>> contents = {'cat1': ['a', 'b', 'c'],
    ...             'cat2': ['b', 'd'],
    ...             'cat3': ['e']}
    >>> from_contents(contents)
                      id
    cat1  cat2  cat3
    True  False False  a
          True  False  b
          False False  c
    False True  False  d
          False True   e
    >>> import pandas as pd
    >>> contents = {'cat1': [0, 1, 2],
    ...             'cat2': [1, 3],
    ...             'cat3': [4]}
    >>> data = pd.DataFrame({'favourite': ['green', 'red', 'red',
    ...                                    'yellow', 'blue']})
    >>> from_contents(contents, data=data)
                       id favourite
    cat1  cat2  cat3
    True  False False   0     green
          True  False   1       red
          False False   2       red
    False True  False   3    yellow
          False True    4      blue
    """
    # Accept both mappings and plain iterables of (name, elements) pairs.
    pairs = contents.items() if hasattr(contents, "items") else contents
    cat_series = [
        pd.Series(True, index=list(elements), name=name) for name, elements in pairs
    ]
    if not all(s.index.is_unique for s in cat_series):
        raise ValueError("Got duplicate ids in a category")

    # sort=False keeps identifiers in order of first appearance.
    df = pd.concat(cat_series, axis=1, sort=False)
    if id_column in df.columns:
        raise ValueError("A category cannot be named %r" % id_column)
    # Every present cell is True and every absent cell is missing, so
    # "not missing" is exactly membership, with a guaranteed bool dtype.
    df = df.notna()
    cat_names = list(df.columns)

    if data is not None:
        if set(df.columns).intersection(data.columns):
            raise ValueError("Data columns overlap with category names")
        if id_column in data.columns:
            raise ValueError("data cannot contain a column named %r" % id_column)
        not_in_data = df.drop(data.index, axis=0, errors="ignore")
        if len(not_in_data):
            raise ValueError(
                "Found identifiers in contents that are not in "
                "data: %r" % not_in_data.index.values
            )
        # Rows of data that appear in no category belong to none of them.
        df = df.reindex(index=data.index, fill_value=False)
        df = pd.concat([data, df], axis=1, sort=False)
    df.index.name = id_column
    return df.reset_index().set_index(cat_names)
2✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc