• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jnothman / UpSetPlot / 7347756297

28 Dec 2023 12:48PM UTC coverage: 99.196% (-0.003%) from 99.199%
7347756297

Pull #250

github

web-flow
Merge branch 'master' into looseversion
Pull Request #250: Remove use of LooseVersion and hide get_renderer warning

1727 of 1741 relevant lines covered (99.2%)

1.95 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.58
/upsetplot/data.py
1
from __future__ import print_function, division, absolute_import
2✔
2
from numbers import Number
2✔
3
import warnings
2✔
4

2✔
5
import pandas as pd
2✔
6
import numpy as np
2✔
7

8

9
def generate_samples(seed=0, n_samples=10000, n_categories=3):
    """Create a synthetic table of samples spread over set intersections

    Parameters
    ----------
    seed : int
        Seed passed to the random number generator
    n_samples : int
        How many samples (rows) to produce
    n_categories : int
        How many categories (named "cat0", "cat1", ...) to produce

    Returns
    -------
    DataFrame
        Field 'value' is a weight or score for each element.
        Field 'index' is a unique id for each element.
        Index includes a boolean indicator mask for each category.

        Note: Further fields may be added in future versions.

    See Also
    --------
    generate_counts : Generates the counts for each subset of categories
        corresponding to these samples.
    """
    random_state = np.random.RandomState(seed)
    category_names = ["cat%d" % k for k in range(n_categories)]

    samples = pd.DataFrame({"value": np.zeros(n_samples)})
    for column in category_names:
        # One uniform draw per sample, then a single uniform threshold:
        # membership is "draw above threshold", and the draw also
        # accumulates into the sample's value.
        draws = random_state.rand(n_samples)
        threshold = random_state.rand()
        samples[column] = draws > threshold
        samples["value"] = samples["value"] + draws

    # Expose the original row number as an 'index' column before the
    # category indicators take over as the (multi-level) index.
    samples = samples.reset_index()
    return samples.set_index(category_names)
2✔
45

46

47
def generate_counts(seed=0, n_samples=10000, n_categories=3):
    """Create synthetic counts for each set intersection

    Parameters
    ----------
    seed : int
        Seed passed to the random number generator
    n_samples : int
        How many samples the counts are computed over
    n_categories : int
        How many categories (named "cat0", "cat1", ...) to produce

    Returns
    -------
    Series
        Counts indexed by boolean indicator mask for each category.

    See Also
    --------
    generate_samples : Generates a DataFrame of samples that these counts are
        derived from.
    """
    samples = generate_samples(
        seed=seed, n_samples=n_samples, n_categories=n_categories
    )
    # Group on every index level (one per category) and count the samples
    # falling into each combination of memberships.
    every_level = list(range(n_categories))
    return samples["value"].groupby(level=every_level).count()
2✔
71

72

73
def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False):
    """Deprecated entry point kept for backward compatibility

    Emits a ``DeprecationWarning`` and then delegates to
    `generate_counts` or `generate_samples`.

    Parameters
    ----------
    seed : int
        A seed for randomisation
    n_samples : int
        Number of samples to generate
    n_sets : int
        Number of categories, forwarded as ``n_categories``
    aggregated : bool
        If true, return the result of `generate_counts`; otherwise return
        the "value" column of the result of `generate_samples`.

    Returns
    -------
    Series
        Indexed by boolean indicator mask for each category.
    """
    warnings.warn(
        "generate_data was replaced by generate_counts in version "
        "0.3 and will be removed in version 0.4.",
        DeprecationWarning,
        # Attribute the warning to the caller's line rather than this
        # module, so it points at the code that needs updating.
        stacklevel=2,
    )
    if aggregated:
        return generate_counts(seed=seed, n_samples=n_samples, n_categories=n_sets)
    return generate_samples(seed=seed, n_samples=n_samples, n_categories=n_sets)[
        "value"
    ]
85

86

87
def from_indicators(indicators, data=None):
    """Load category membership indicated by a boolean indicator matrix

    The indicator columns may be given directly, named as columns of
    `data`, or computed from `data` by a callable.

    .. versionadded:: 0.6

    Parameters
    ----------
    indicators : DataFrame-like of booleans, Sequence of str, or callable
        Specifies the category indicators (boolean mask arrays) within
        ``data``, i.e. which records in ``data`` belong to which categories.

        If a list of strings, these should be column names found in ``data``
        whose values are boolean mask arrays.

        If a DataFrame, its columns should correspond to categories, and its
        index should be a subset of those in ``data``, values should be True
        where a data record is in that category, and False or NA otherwise.

        If callable, it will be applied to ``data`` after the latter is
        converted to a Series or DataFrame.

    data : Series-like or DataFrame-like, optional
        If given, the index of category membership is attached to this data.
        It must have the same length as `indicators`.
        If not given, the series will contain the value 1.

    Returns
    -------
    DataFrame or Series
        `data` is returned with its index indicating category membership.
        It will be a Series if `data` is a Series or 1d numeric array or None.

    Notes
    -----
    Categories with indicators that are all False will be removed.

    Examples
    --------
    >>> import pandas as pd
    >>> from upsetplot import from_indicators
    >>>
    >>> # Just indicators:
    >>> indicators = {"cat1": [True, False, True, False],
    ...               "cat2": [False, True, False, False],
    ...               "cat3": [True, True, False, False]}
    >>> from_indicators(indicators)
    cat1   cat2   cat3
    True   False  True     1.0
    False  True   True     1.0
    True   False  False    1.0
    False  False  False    1.0
    Name: ones, dtype: float64
    >>>
    >>> # Where indicators are included within data, specifying
    >>> # columns by name:
    >>> data = pd.DataFrame({"value": [5, 4, 6, 4], **indicators})
    >>> from_indicators(["cat1", "cat3"], data=data)
                 value   cat1   cat2   cat3
    cat1  cat3
    True  True       5   True  False   True
    False True       4  False   True   True
    True  False      6   True  False  False
    False False      4  False  False  False
    >>>
    >>> # Making indicators out of all boolean columns:
    >>> from_indicators(lambda data: data.select_dtypes(bool), data=data)
                       value   cat1   cat2   cat3
    cat1  cat2  cat3
    True  False True       5   True  False   True
    False True  True       4  False   True   True
    True  False False      6   True  False  False
    False False False      4  False  False  False
    >>>
    >>> # Using a dataset with missing data, we can use missingness as
    >>> # an indicator:
    >>> data = pd.DataFrame({"val1": [pd.NA, .7, pd.NA, .9],
    ...                      "val2": ["male", pd.NA, "female", "female"],
    ...                      "val3": [pd.NA, pd.NA, 23000, 78000]})
    >>> from_indicators(pd.isna, data=data)
                       val1    val2   val3
    val1  val2  val3
    True  False True   <NA>    male   <NA>
    False True  True    0.7    <NA>   <NA>
    True  False False  <NA>  female  23000
    False False False   0.9  female  78000
    """
    payload = _convert_to_pandas(data) if data is not None else None

    if callable(indicators):
        if payload is None:
            raise ValueError("data must be provided when indicators is callable")
        indicators = indicators(payload)

    # Probe the first element: a str/int there means `indicators` is a list
    # of column names. Anything that cannot be probed (e.g. a dict without
    # key 0) is treated as indicator values.
    try:
        first_item = indicators[0]
    except Exception:
        names_column_list = False
    else:
        names_column_list = isinstance(first_item, (str, int))

    if names_column_list:
        if payload is None:
            raise ValueError(
                "data must be provided when indicators are "
                "specified as a list of columns"
            )
        if isinstance(indicators, tuple):
            raise ValueError("indicators as tuple is not supported")
        # column array
        indicators = payload[indicators]

    mask = pd.DataFrame(indicators).fillna(False).infer_objects()
    # drop all-False (should we be dropping all-True also? making an option?)
    keep_columns = mask.any(axis=0)
    mask = mask.loc[:, keep_columns]

    if any(dtype.kind != "b" for dtype in mask.dtypes):
        raise ValueError("The indicators must all be boolean")

    if payload is None:
        payload = pd.Series(np.ones(len(mask)), name="ones")
    else:
        mask_index = mask.index
        spans_data_positionally = (
            isinstance(mask_index, pd.RangeIndex)
            and mask_index[0] == 0
            and mask_index[-1] == len(payload) - 1
        )
        if not spans_data_positionally:
            # index is specified on indicators. Need to align it to data
            if not mask_index.isin(payload.index).all():
                raise ValueError(
                    "If indicators.index is not the default, "
                    "all its values must be present in "
                    "data.index"
                )
            mask = mask.reindex(index=payload.index, fill_value=False)

    # Turn every indicator column into an index level and attach that
    # index to the data being returned.
    payload.index = mask.set_index(list(mask.columns)).index
    return payload
2✔
228

229

230
def _convert_to_pandas(data, copy=True):
2✔
231
    is_series = False
2✔
232
    if hasattr(data, "loc"):
2✔
233
        if copy:
2✔
234
            data = data.copy(deep=False)
2✔
235
        is_series = data.ndim == 1
2✔
236
    elif len(data):
2✔
237
        try:
2✔
238
            is_series = isinstance(data[0], Number)
2✔
239
        except KeyError:
2✔
240
            is_series = False
2✔
241
    if is_series:
2✔
242
        data = pd.Series(data)
2✔
243
    else:
244
        data = pd.DataFrame(data)
2✔
245
    return data
2✔
246

247

248
def from_memberships(memberships, data=None):
    """Load data where each sample has a collection of category names

    The output should be suitable for passing to `UpSet` or `plot`.

    Parameters
    ----------
    memberships : sequence of collections of strings
        Each element corresponds to a data point, indicating the sets it is a
        member of.  Each category is named by a string.
    data : Series-like or DataFrame-like, optional
        If given, the index of category memberships is attached to this data.
        It must have the same length as `memberships`.
        If not given, the series will contain the value 1.

    Returns
    -------
    DataFrame or Series
        `data` is returned with its index indicating category membership.
        It will be a Series if `data` is a Series or 1d numeric array.
        The index will have levels ordered by category names.

    Raises
    ------
    ValueError
        If a category name is not a string, if no categories are found, or
        if `data` and `memberships` differ in length.

    Examples
    --------
    >>> from upsetplot import from_memberships
    >>> from_memberships([
    ...     ['cat1', 'cat3'],
    ...     ['cat2', 'cat3'],
    ...     ['cat1'],
    ...     []
    ... ])
    cat1   cat2   cat3
    True   False  True     1
    False  True   True     1
    True   False  False    1
    False  False  False    1
    Name: ones, dtype: ...
    >>> # now with data:
    >>> import numpy as np
    >>> from_memberships([
    ...     ['cat1', 'cat3'],
    ...     ['cat2', 'cat3'],
    ...     ['cat1'],
    ...     []
    ... ], data=np.arange(12).reshape(4, 3))
                       0   1   2
    cat1  cat2  cat3
    True  False True   0   1   2
    False True  True   3   4   5
    True  False False  6   7   8
    False False False  9  10  11
    """
    # One row per sample; a cell is True where the sample lists the category
    # and missing otherwise.
    df = pd.DataFrame([{name: True for name in names} for names in memberships])
    for set_name in df.columns:
        if not hasattr(set_name, "lower"):
            raise ValueError("Category names should be strings")
    if df.shape[1] == 0:
        raise ValueError("Require at least one category. None were found.")
    # "Present" is exactly "not missing", so notna() yields the boolean
    # membership matrix directly. This avoids fillna(False) on object-dtype
    # columns, which depends on the silent downcasting deprecated in
    # pandas 2.2.
    df = df.sort_index(axis=1).notna()
    df.set_index(list(df.columns), inplace=True)
    if data is None:
        return df.assign(ones=1)["ones"]

    data = _convert_to_pandas(data)
    if len(data) != len(df):
        # df has one row per element of memberships; report len(df) so the
        # message also works when memberships is an iterable without len().
        raise ValueError(
            "memberships and data must have the same length. "
            "Got len(memberships) == %d, len(data) == %d" % (len(df), len(data))
        )
    data.index = df.index
    return data
2✔
322

323

324
def from_contents(contents, data=None, id_column="id"):
    """Build data from category listings

    Parameters
    ----------
    contents : Mapping (or iterable over pairs) of strings to sets
        Keys are category names, values are sets of identifiers (int or
        string).
    data : DataFrame, optional
        If provided, this should be indexed by the identifiers used in
        `contents`.
    id_column : str, default='id'
        The column name to use for the identifiers in the output.

    Returns
    -------
    DataFrame
        `data` is returned with its index indicating category membership,
        including a column named according to id_column.
        If data is not given, the order of rows is not assured.

    Raises
    ------
    ValueError
        If a category lists the same identifier twice, if `id_column`
        clashes with a category name or a column of `data`, if `data`
        columns overlap with category names, or if `contents` mentions
        identifiers that are absent from ``data.index``.

    Notes
    -----
    The order of categories in the output DataFrame is determined from
    `contents`, which may have non-deterministic iteration order.

    Examples
    --------
    >>> from upsetplot import from_contents
    >>> contents = {'cat1': ['a', 'b', 'c'],
    ...             'cat2': ['b', 'd'],
    ...             'cat3': ['e']}
    >>> from_contents(contents)
                      id
    cat1  cat2  cat3
    True  False False  a
          True  False  b
          False False  c
    False True  False  d
          False True   e
    >>> import pandas as pd
    >>> contents = {'cat1': [0, 1, 2],
    ...             'cat2': [1, 3],
    ...             'cat3': [4]}
    >>> data = pd.DataFrame({'favourite': ['green', 'red', 'red',
    ...                                    'yellow', 'blue']})
    >>> from_contents(contents, data=data)
                       id favourite
    cat1  cat2  cat3
    True  False False   0     green
          True  False   1       red
          False False   2       red
    False True  False   3    yellow
          False True    4      blue
    """
    # Accept both a mapping and a plain iterable of (name, elements) pairs,
    # as the parameter documentation promises.
    pairs = contents.items() if hasattr(contents, "items") else contents
    cat_series = [
        pd.Series(True, index=list(elements), name=name) for name, elements in pairs
    ]
    if not all(s.index.is_unique for s in cat_series):
        raise ValueError("Got duplicate ids in a category")

    # Outer-join the categories on identifier: a cell is True where the
    # identifier belongs to the category and missing otherwise.
    df = pd.concat(cat_series, axis=1, sort=False)
    if id_column in df.columns:
        raise ValueError("A category cannot be named %r" % id_column)
    # "Member" is exactly "not missing"; notna() gives a boolean frame
    # without relying on fillna's object-dtype downcasting (deprecated in
    # pandas 2.2).
    df = df.notna()
    cat_names = list(df.columns)

    if data is not None:
        if set(df.columns).intersection(data.columns):
            raise ValueError("Data columns overlap with category names")
        if id_column in data.columns:
            raise ValueError("data cannot contain a column named %r" % id_column)
        not_in_data = df.drop(data.index, axis=0, errors="ignore")
        if len(not_in_data):
            raise ValueError(
                "Found identifiers in contents that are not in "
                "data: %r" % not_in_data.index.values
            )
        # Rows of data that appear in no category belong to none of them.
        df = df.reindex(index=data.index, fill_value=False)
        df = pd.concat([data, df], axis=1, sort=False)
    df.index.name = id_column
    return df.reset_index().set_index(cat_names)
2✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc