• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jnothman / UpSetPlot / 7342943552

28 Dec 2023 12:13AM UTC coverage: 83.549% (-14.0%) from 97.551%
7342943552

push

github

web-flow
Fix warning due to styling dtypes, and fix column dtype test failure (#238)


Fixes #225

6 of 6 new or added lines in 2 files covered. (100.0%)

312 existing lines in 7 files now uncovered.

1681 of 2012 relevant lines covered (83.55%)

1.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.04
/upsetplot/data.py
1
from __future__ import print_function, division, absolute_import
2✔
2
from numbers import Number
2✔
3
import functools
2✔
4
from distutils.version import LooseVersion
2✔
5
import warnings
2✔
6

2✔
7
import pandas as pd
2✔
8
import numpy as np
2✔
9

10

11
def generate_samples(seed=0, n_samples=10000, n_categories=3):
    """Create a synthetic table of samples with random category membership

    Parameters
    ----------
    seed : int
        Seed passed to the random number generator
    n_samples : int
        How many samples (rows) to produce
    n_categories : int
        How many categories (named "cat0", "cat1", ...) to produce

    Returns
    -------
    DataFrame
        Column 'value' holds a weight or score for each element.
        Column 'index' holds a unique id for each element.
        The index carries one boolean indicator level per category.

        Note: Further fields may be added in future versions.

    See Also
    --------
    generate_counts : Generates the counts for each subset of categories
        corresponding to these samples.
    """
    rng = np.random.RandomState(seed)
    category_names = ['cat%d' % k for k in range(n_categories)]

    frame = pd.DataFrame({'value': np.zeros(n_samples)})
    for name in category_names:
        # One uniform draw per sample, then a single uniform threshold;
        # the order of these two calls determines the generated data.
        draws = rng.rand(n_samples)
        threshold = rng.rand()
        frame[name] = draws > threshold
        frame['value'] += draws

    # Expose the positional id as an 'index' column, then move the
    # category indicators into the (multi-)index.
    frame = frame.reset_index()
    return frame.set_index(category_names)
2✔
47

48

49
def generate_counts(seed=0, n_samples=10000, n_categories=3):
    """Create synthetic per-intersection counts

    Parameters
    ----------
    seed : int
        Seed passed to the random number generator
    n_samples : int
        How many samples the statistics are computed over
    n_categories : int
        How many categories (named "cat0", "cat1", ...) to produce

    Returns
    -------
    Series
        Counts indexed by boolean indicator mask for each category.

    See Also
    --------
    generate_samples : Generates a DataFrame of samples that these counts are
        derived from.
    """
    samples = generate_samples(seed=seed,
                               n_samples=n_samples,
                               n_categories=n_categories)
    # Group on every index level, i.e. on each distinct combination of
    # category indicators, and count the samples in each group.
    all_levels = list(range(n_categories))
    grouped = samples['value'].groupby(level=all_levels)
    return grouped.count()
2✔
74

75

76
def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False):
    """Deprecated alias for `generate_counts` / `generate_samples`

    Always emits a ``DeprecationWarning``.

    Parameters
    ----------
    seed : int
        A seed for randomisation
    n_samples : int
        Number of samples to generate
    n_sets : int
        Number of categories to generate (passed on as ``n_categories``)
    aggregated : bool
        If True, return the result of `generate_counts`; otherwise return
        the 'value' column of `generate_samples`.

    Returns
    -------
    Series
        Either the counts per category combination (``aggregated=True``)
        or the per-sample values, indexed by category membership.
    """
    # stacklevel=2 attributes the warning to the caller, so that it is
    # reported against user code rather than against this module (where
    # default warning filters would hide it).
    warnings.warn('generate_data was replaced by generate_counts in version '
                  '0.3 and will be removed in version 0.4.',
                  DeprecationWarning, stacklevel=2)
    if aggregated:
        return generate_counts(seed=seed, n_samples=n_samples,
                               n_categories=n_sets)
    return generate_samples(seed=seed, n_samples=n_samples,
                            n_categories=n_sets)['value']
2✔
86

87

88
def from_indicators(indicators, data=None):
    """Load category membership indicated by a boolean indicator matrix

    This loader also supports the case where the indicator columns can be
    derived from `data`.

    .. versionadded:: 0.6

    Parameters
    ----------
    indicators : DataFrame-like of booleans, Sequence of str, or callable
        Specifies the category indicators (boolean mask arrays) within
        ``data``, i.e. which records in ``data`` belong to which categories.

        If a list of strings, these should be column names found in ``data``
        whose values are boolean mask arrays.

        If a DataFrame, its columns should correspond to categories, and its
        index should be a subset of those in ``data``, values should be True
        where a data record is in that category, and False or NA otherwise.

        If callable, it will be applied to ``data`` after the latter is
        converted to a Series or DataFrame.

    data : Series-like or DataFrame-like, optional
        If given, the index of category membership is attached to this data.
        It must have the same length as `indicators`.
        If not given, the series will contain the value 1.

    Returns
    -------
    DataFrame or Series
        `data` is returned with its index indicating category membership.
        It will be a Series if `data` is a Series or 1d numeric array or None.

    Raises
    ------
    ValueError
        If ``indicators`` is callable or a list of column names and ``data``
        is not given; if ``indicators`` is a tuple of column names; if no
        category remains after all-False indicators are removed; if any
        remaining indicator is not boolean; or if ``indicators`` has a
        non-default index containing values absent from ``data.index``.

    Notes
    -----
    Categories with indicators that are all False will be removed.

    Examples
    --------
    >>> import pandas as pd
    >>> from upsetplot import from_indicators
    >>>
    >>> # Just indicators:
    >>> indicators = {"cat1": [True, False, True, False],
    ...               "cat2": [False, True, False, False],
    ...               "cat3": [True, True, False, False]}
    >>> from_indicators(indicators)
    cat1   cat2   cat3
    True   False  True     1.0
    False  True   True     1.0
    True   False  False    1.0
    False  False  False    1.0
    Name: ones, dtype: float64
    >>>
    >>> # Where indicators are included within data, specifying
    >>> # columns by name:
    >>> data = pd.DataFrame({"value": [5, 4, 6, 4], **indicators})
    >>> from_indicators(["cat1", "cat3"], data=data)
                 value   cat1   cat2   cat3
    cat1  cat3
    True  True       5   True  False   True
    False True       4  False   True   True
    True  False      6   True  False  False
    False False      4  False  False  False
    >>>
    >>> # Making indicators out of all boolean columns:
    >>> from_indicators(lambda data: data.select_dtypes(bool), data=data)
                       value   cat1   cat2   cat3
    cat1  cat2  cat3
    True  False True       5   True  False   True
    False True  True       4  False   True   True
    True  False False      6   True  False  False
    False False False      4  False  False  False
    >>>
    >>> # Using a dataset with missing data, we can use missingness as
    >>> # an indicator:
    >>> data = pd.DataFrame({"val1": [pd.NA, .7, pd.NA, .9],
    ...                      "val2": ["male", pd.NA, "female", "female"],
    ...                      "val3": [pd.NA, pd.NA, 23000, 78000]})
    >>> from_indicators(pd.isna, data=data)
                       val1    val2   val3
    val1  val2  val3
    True  False True   <NA>    male   <NA>
    False True  True    0.7    <NA>   <NA>
    True  False False  <NA>  female  23000
    False False False   0.9  female  78000
    """
    if data is not None:
        data = _convert_to_pandas(data)

    if callable(indicators):
        if data is None:
            raise ValueError("data must be provided when indicators is "
                             "callable")
        indicators = indicators(data)

    # Probe the first element to detect "a list of column names".  The probe
    # is deliberately best-effort: mappings, frames without a column ``0``,
    # empty sequences and non-indexable inputs all fail here in different
    # ways, and every such failure simply means "not a list of names".
    try:
        first = indicators[0]
    except Exception:
        pass
    else:
        if isinstance(first, (str, int)):
            if data is None:
                raise ValueError("data must be provided when indicators are "
                                 "specified as a list of columns")
            if isinstance(indicators, tuple):
                raise ValueError("indicators as tuple is not supported")
            # column array
            indicators = data[indicators]

    indicators = pd.DataFrame(indicators).fillna(False).infer_objects()
    # drop all-False (should we be dropping all-True also? making an option?)
    indicators = indicators.loc[:, indicators.any(axis=0)]

    if indicators.shape[1] == 0:
        # Without this check the failure surfaces later as an obscure pandas
        # error while building an index from zero columns.
        raise ValueError('Require at least one category. None were found.')

    if not all(dtype.kind == 'b' for dtype in indicators.dtypes):
        raise ValueError('The indicators must all be boolean')

    if data is not None:
        if not (isinstance(indicators.index, pd.RangeIndex)
                and indicators.index[0] == 0
                and indicators.index[-1] == len(data) - 1):
            # index is specified on indicators. Need to align it to data
            if not indicators.index.isin(data.index).all():
                raise ValueError("If indicators.index is not the default, "
                                 "all its values must be present in "
                                 "data.index")
            indicators = indicators.reindex(index=data.index, fill_value=False)
    else:
        data = pd.Series(np.ones(len(indicators)), name="ones")

    # Move every indicator column into the index, then reuse that index as
    # the membership index of the returned data.
    indicators.set_index(list(indicators.columns), inplace=True)
    data.index = indicators.index

    return data
2✔
224

225

226
def _convert_to_pandas(data, copy=True):
    """Coerce ``data`` to a pandas Series or DataFrame

    Objects exposing ``.loc`` are treated as pandas objects: they are
    shallow-copied when ``copy`` is true and become a Series exactly when
    they are one-dimensional.  Any other non-empty input becomes a Series
    when its element ``0`` is a number; everything else (including empty
    input and inputs for which ``data[0]`` raises ``KeyError``) becomes a
    DataFrame.
    """
    if hasattr(data, 'loc'):
        if copy:
            data = data.copy(deep=False)
        as_series = data.ndim == 1
    elif len(data):
        try:
            as_series = isinstance(data[0], Number)
        except KeyError:
            # e.g. a mapping without a key ``0``
            as_series = False
    else:
        as_series = False

    constructor = pd.Series if as_series else pd.DataFrame
    return constructor(data)
2✔
242

243

244
def from_memberships(memberships, data=None):
    """Load data where each sample has a collection of category names

    The output should be suitable for passing to `UpSet` or `plot`.

    Parameters
    ----------
    memberships : sequence of collections of strings
        Each element corresponds to a data point, indicating the sets it is a
        member of.  Each category is named by a string.
    data : Series-like or DataFrame-like, optional
        If given, the index of category memberships is attached to this data.
        It must have the same length as `memberships`.
        If not given, the series will contain the value 1.

    Returns
    -------
    DataFrame or Series
        `data` is returned with its index indicating category membership.
        It will be a Series if `data` is a Series or 1d numeric array.
        The index will have levels ordered by category names.

    Raises
    ------
    ValueError
        If a category name is not a string, if no category is found at all,
        or if `data` and `memberships` differ in length.

    Examples
    --------
    >>> from upsetplot import from_memberships
    >>> from_memberships([
    ...     ['cat1', 'cat3'],
    ...     ['cat2', 'cat3'],
    ...     ['cat1'],
    ...     []
    ... ])
    cat1   cat2   cat3
    True   False  True     1
    False  True   True     1
    True   False  False    1
    False  False  False    1
    Name: ones, dtype: ...
    >>> # now with data:
    >>> import numpy as np
    >>> from_memberships([
    ...     ['cat1', 'cat3'],
    ...     ['cat2', 'cat3'],
    ...     ['cat1'],
    ...     []
    ... ], data=np.arange(12).reshape(4, 3))
                       0   1   2
    cat1  cat2  cat3
    True  False True   0   1   2
    False True  True   3   4   5
    True  False False  6   7   8
    False False False  9  10  11
    """
    # One row per sample; a cell is True where the sample names the category
    # and missing otherwise.
    df = pd.DataFrame([{name: True for name in names}
                       for names in memberships])
    for set_name in df.columns:
        if not hasattr(set_name, 'lower'):
            raise ValueError('Category names should be strings')
    if df.shape[1] == 0:
        raise ValueError('Require at least one category. None were found.')
    df.sort_index(axis=1, inplace=True)
    # Cells are either True or missing, so "not missing" is exactly the
    # membership mask.  This yields boolean columns directly instead of
    # relying on fillna() to downcast object columns.
    df = df.notna()
    df.set_index(list(df.columns), inplace=True)
    if data is None:
        return df.assign(ones=1)['ones']

    data = _convert_to_pandas(data)
    if len(data) != len(df):
        # len(df) is the number of membership entries; unlike
        # len(memberships) it is also defined for iterator input.
        raise ValueError('memberships and data must have the same length. '
                         'Got len(memberships) == %d, len(data) == %d'
                         % (len(df), len(data)))
    data.index = df.index
    return data
2✔
317

318

319
def _concat_columns(objs):
    """Concatenate pandas objects side by side without sorting the rows."""
    try:
        return pd.concat(objs, axis=1, sort=False)
    except TypeError:
        # Very old pandas (< 0.23) has no ``sort`` keyword.
        return pd.concat(objs, axis=1)


def from_contents(contents, data=None, id_column='id'):
    """Build data from category listings

    Parameters
    ----------
    contents : Mapping (or iterable over pairs) of strings to sets
        Keys are category names, values are sets of identifiers (int or
        string).
    data : DataFrame, optional
        If provided, this should be indexed by the identifiers used in
        `contents`.
    id_column : str, default='id'
        The column name to use for the identifiers in the output.

    Returns
    -------
    DataFrame
        `data` is returned with its index indicating category membership,
        including a column named according to id_column.
        If data is not given, the order of rows is not assured.

    Raises
    ------
    ValueError
        If a category lists the same identifier twice, if a category name
        is repeated, if a category or a column of `data` is named
        `id_column`, if `data` columns overlap with category names, or if
        `contents` mentions identifiers that are not in ``data.index``.

    Notes
    -----
    The order of categories in the output DataFrame is determined from
    `contents`, which may have non-deterministic iteration order.

    Examples
    --------
    >>> from upsetplot import from_contents
    >>> contents = {'cat1': ['a', 'b', 'c'],
    ...             'cat2': ['b', 'd'],
    ...             'cat3': ['e']}
    >>> from_contents(contents)
                      id
    cat1  cat2  cat3
    True  False False  a
          True  False  b
          False False  c
    False True  False  d
          False True   e
    >>> import pandas as pd
    >>> contents = {'cat1': [0, 1, 2],
    ...             'cat2': [1, 3],
    ...             'cat3': [4]}
    >>> data = pd.DataFrame({'favourite': ['green', 'red', 'red',
    ...                                    'yellow', 'blue']})
    >>> from_contents(contents, data=data)
                       id favourite
    cat1  cat2  cat3
    True  False False   0     green
          True  False   1       red
          False False   2       red
    False True  False   3    yellow
          False True    4      blue
    """
    # Accept both a mapping and a plain iterable of (name, elements) pairs,
    # as the parameter documentation promises.
    pairs = contents.items() if hasattr(contents, 'items') else contents
    cat_series = [pd.Series(True, index=list(elements), name=name)
                  for name, elements in pairs]
    if not all(s.index.is_unique for s in cat_series):
        raise ValueError('Got duplicate ids in a category')

    df = _concat_columns(cat_series)
    cat_names = list(df.columns)
    if len(set(cat_names)) != len(cat_names):
        # Only reachable with an iterable of pairs; a mapping has unique keys.
        raise ValueError('Got duplicate category names')
    if id_column in df.columns:
        raise ValueError('A category cannot be named %r' % id_column)
    # Every value present after concatenation is True, so "not missing" is
    # exactly the membership mask.  This yields boolean columns directly
    # instead of relying on fillna() to downcast object columns.
    df = df.notna()

    if data is not None:
        if set(df.columns).intersection(data.columns):
            raise ValueError('Data columns overlap with category names')
        if id_column in data.columns:
            raise ValueError('data cannot contain a column named %r' %
                             id_column)
        not_in_data = df.drop(data.index, axis=0, errors='ignore')
        if len(not_in_data):
            raise ValueError('Found identifiers in contents that are not in '
                             'data: %r' % not_in_data.index.values)
        # Rows of data that belong to no category get False everywhere;
        # fill_value keeps the columns boolean.
        df = df.reindex(index=data.index, fill_value=False)
        df = _concat_columns([data, df])
    df.index.name = id_column
    return df.reset_index().set_index(cat_names)
2✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc