7342943552

Committed 28 Dec 2023 12:13AM UTC coverage: 83.549% (-14.0%) from 97.551%

Build # 7342943552

Build Type

push

github

Committed by

web-flow

Commit Message

Fix warning due to styling dtyles, and fix column dtype test failure (#238)


Fixes #225

Run Details

6 of 6 new or added lines in 2 files covered. (100.0%)

312 existing lines in 7 files now uncovered.

1681 of 2012 relevant lines covered (83.55%)

1.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.04

/upsetplot/data.py

from __future__ import print_function, division, absolute_import
from numbers import Number
import functools
from distutils.version import LooseVersion
import warnings

import pandas as pd
import numpy as np


def generate_samples(seed=0, n_samples=10000, n_categories=3):
    """Create a synthetic table of samples with random category membership

    For each category a uniform random score is drawn per sample, and the
    sample belongs to the category when its score exceeds a single random
    threshold drawn for that category. The scores are accumulated into the
    'value' column.

    Parameters
    ----------
    seed : int
        Seed passed to the random number generator
    n_samples : int
        How many samples (rows) to produce
    n_categories : int
        How many categories (named "cat0", "cat1", ...) to produce

    Returns
    -------
    DataFrame
        Column 'value' holds a weight or score for each element.
        Column 'index' holds a unique id for each element.
        The index has one boolean level per category.

        Note: Further fields may be added in future versions.

    See Also
    --------
    generate_counts : Generates the counts for each subset of categories
        corresponding to these samples.
    """
    random_state = np.random.RandomState(seed)
    category_names = ['cat%d' % k for k in range(n_categories)]

    samples = pd.DataFrame({'value': np.zeros(n_samples)})
    for category in category_names:
        # Draw order matters for reproducibility: per-sample scores first,
        # then the scalar membership threshold.
        scores = random_state.rand(n_samples)
        threshold = random_state.rand()
        samples[category] = scores > threshold
        samples['value'] = samples['value'] + scores

    # Keep the original row number as an 'index' column before the
    # category indicators take over as the index.
    samples = samples.reset_index()
    return samples.set_index(category_names)


def generate_counts(seed=0, n_samples=10000, n_categories=3):
    """Count synthetic samples falling in each combination of categories

    Parameters
    ----------
    seed : int
        Seed passed to the random number generator
    n_samples : int
        How many samples the counts are computed over
    n_categories : int
        How many categories (named "cat0", "cat1", ...) to produce

    Returns
    -------
    Series
        Counts indexed by boolean indicator mask for each category.

    See Also
    --------
    generate_samples : Generates a DataFrame of samples that these counts are
        derived from.
    """
    samples = generate_samples(
        seed=seed,
        n_samples=n_samples,
        n_categories=n_categories,
    )
    # Group on every index level, i.e. on the full membership pattern.
    all_levels = list(range(n_categories))
    grouped = samples['value'].groupby(level=all_levels)
    return grouped.count()


def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False):
    """Deprecated wrapper around generate_counts and generate_samples

    Always emits a DeprecationWarning, then delegates.

    Parameters
    ----------
    seed : int
        A seed for randomisation
    n_samples : int
        Number of samples to generate
    n_sets : int
        Number of categories; forwarded as ``n_categories``
    aggregated : bool
        If true, return the result of `generate_counts`. Otherwise return
        the 'value' column of the result of `generate_samples`.

    Returns
    -------
    Series
        Either counts per category combination, or per-sample values.
    """
    # stacklevel=2 attributes the warning to the caller's line, so users see
    # where their own code still uses the deprecated function.
    warnings.warn('generate_data was replaced by generate_counts in version '
                  '0.3 and will be removed in version 0.4.',
                  DeprecationWarning, stacklevel=2)
    if aggregated:
        return generate_counts(seed=seed, n_samples=n_samples,
                               n_categories=n_sets)
    else:
        return generate_samples(seed=seed, n_samples=n_samples,
                                n_categories=n_sets)['value']


def from_indicators(indicators, data=None):
    """Index data by category membership given as boolean indicator columns

    The indicator columns may be passed directly, or derived from `data`
    by naming columns or by supplying a callable.

    .. versionadded:: 0.6

    Parameters
    ----------
    indicators : DataFrame-like of booleans, Sequence of str, or callable
        Specifies the category indicators (boolean mask arrays) within
        ``data``, i.e. which records in ``data`` belong to which categories.

        If a list of strings, these should be column names found in ``data``
        whose values are boolean mask arrays.

        If a DataFrame, its columns should correspond to categories, and its
        index should be a subset of those in ``data``, values should be True
        where a data record is in that category, and False or NA otherwise.

        If callable, it will be applied to ``data`` after the latter is
        converted to a Series or DataFrame.

    data : Series-like or DataFrame-like, optional
        If given, the index of category membership is attached to this data.
        It must have the same length as `indicators`.
        If not given, the series will contain the value 1.

    Returns
    -------
    DataFrame or Series
        `data` is returned with its index indicating category membership.
        It will be a Series if `data` is a Series or 1d numeric array or None.

    Raises
    ------
    ValueError
        If `data` is missing while `indicators` is a callable or a list of
        column names, if `indicators` is a tuple of column names, if any
        retained indicator column is not boolean, or if a non-default
        indicator index has values absent from ``data.index``.

    Notes
    -----
    Categories with indicators that are all False will be removed.

    Examples
    --------
    >>> import pandas as pd
    >>> from upsetplot import from_indicators
    >>>
    >>> # Just indicators:
    >>> indicators = {"cat1": [True, False, True, False],
    ...               "cat2": [False, True, False, False],
    ...               "cat3": [True, True, False, False]}
    >>> from_indicators(indicators)
    cat1   cat2   cat3
    True   False  True     1.0
    False  True   True     1.0
    True   False  False    1.0
    False  False  False    1.0
    Name: ones, dtype: float64
    >>>
    >>> # Where indicators are included within data, specifying
    >>> # columns by name:
    >>> data = pd.DataFrame({"value": [5, 4, 6, 4], **indicators})
    >>> from_indicators(["cat1", "cat3"], data=data)
                 value   cat1   cat2   cat3
    cat1  cat3
    True  True       5   True  False   True
    False True       4  False   True   True
    True  False      6   True  False  False
    False False      4  False  False  False
    >>>
    >>> # Making indicators out of all boolean columns:
    >>> from_indicators(lambda data: data.select_dtypes(bool), data=data)
                       value   cat1   cat2   cat3
    cat1  cat2  cat3
    True  False True       5   True  False   True
    False True  True       4  False   True   True
    True  False False      6   True  False  False
    False False False      4  False  False  False
    >>>
    >>> # Using a dataset with missing data, we can use missingness as
    >>> # an indicator:
    >>> data = pd.DataFrame({"val1": [pd.NA, .7, pd.NA, .9],
    ...                      "val2": ["male", pd.NA, "female", "female"],
    ...                      "val3": [pd.NA, pd.NA, 23000, 78000]})
    >>> from_indicators(pd.isna, data=data)
                       val1    val2   val3
    val1  val2  val3
    True  False True   <NA>    male   <NA>
    False True  True    0.7    <NA>   <NA>
    True  False False  <NA>  female  23000
    False False False   0.9  female  78000
    """
    have_data = data is not None
    if have_data:
        data = _convert_to_pandas(data)

    if callable(indicators):
        if not have_data:
            raise ValueError("data must be provided when indicators is "
                             "callable")
        indicators = indicators(data)

    # Probe the first element to tell a list of column names apart from an
    # indicator matrix. Anything that cannot be indexed by 0 (a dict or a
    # DataFrame without such a key, an empty list, ...) is taken as a matrix.
    try:
        first = indicators[0]
    except Exception:
        names_columns = False
    else:
        names_columns = isinstance(first, (str, int))

    if names_columns:
        if not have_data:
            raise ValueError("data must be provided when indicators are "
                             "specified as a list of columns")
        if isinstance(indicators, tuple):
            raise ValueError("indicators as tuple is not supported")
        # Pull the named columns out of the data.
        indicators = data[indicators]

    mask = pd.DataFrame(indicators).fillna(False).infer_objects()
    # Discard categories with no members (all-True ones are kept).
    mask = mask.loc[:, mask.any(axis=0)]

    if any(dtype.kind != 'b' for dtype in mask.dtypes):
        raise ValueError('The indicators must all be boolean')

    if not have_data:
        data = pd.Series(np.ones(len(mask)), name="ones")
    else:
        row_labels = mask.index
        positional = (isinstance(row_labels, pd.RangeIndex)
                      and row_labels[0] == 0
                      and row_labels[-1] == len(data) - 1)
        if not positional:
            # The indicators carry their own labels: align them to data,
            # treating rows absent from the indicators as non-members.
            if not row_labels.isin(data.index).all():
                raise ValueError("If indicators.index is not the default, "
                                 "all its values must be present in "
                                 "data.index")
            mask = mask.reindex(index=data.index, fill_value=False)

    # Turn every indicator column into an index level and attach it.
    mask = mask.set_index(list(mask.columns))
    data.index = mask.index

    return data


def _convert_to_pandas(data, copy=True):
    """Coerce `data` to a pandas Series or DataFrame

    Objects exposing ``.loc`` are shallow-copied when `copy` is true and
    become a Series when one-dimensional. Other non-empty inputs become a
    Series when their element 0 is a number. Everything else, including
    empty input and mappings without a key ``0``, becomes a DataFrame.
    """
    if hasattr(data, 'loc'):
        # Already pandas-like: copy so callers can safely replace the index.
        if copy:
            data = data.copy(deep=False)
        one_dimensional = data.ndim == 1
    elif len(data):
        try:
            one_dimensional = isinstance(data[0], Number)
        except KeyError:
            # e.g. a dict of columns that has no key 0
            one_dimensional = False
    else:
        one_dimensional = False

    return pd.Series(data) if one_dimensional else pd.DataFrame(data)


def from_memberships(memberships, data=None):
    """Load data where each sample has a collection of category names

    The output should be suitable for passing to `UpSet` or `plot`.

    Parameters
    ----------
    memberships : sequence of collections of strings
        Each element corresponds to a data point, indicating the sets it is a
        member of.  Each category is named by a string.
    data : Series-like or DataFrame-like, optional
        If given, the index of category memberships is attached to this data.
        It must have the same length as `memberships`.
        If not given, the series will contain the value 1.

    Returns
    -------
    DataFrame or Series
        `data` is returned with its index indicating category membership.
        It will be a Series if `data` is a Series or 1d numeric array.
        The index will have levels ordered by category names.

    Raises
    ------
    ValueError
        If a category name is not a string, if no category is named at all,
        or if `data` and `memberships` differ in length.

    Examples
    --------
    >>> from upsetplot import from_memberships
    >>> from_memberships([
    ...     ['cat1', 'cat3'],
    ...     ['cat2', 'cat3'],
    ...     ['cat1'],
    ...     []
    ... ])
    cat1   cat2   cat3
    True   False  True     1
    False  True   True     1
    True   False  False    1
    False  False  False    1
    Name: ones, dtype: ...
    >>> # now with data:
    >>> import numpy as np
    >>> from_memberships([
    ...     ['cat1', 'cat3'],
    ...     ['cat2', 'cat3'],
    ...     ['cat1'],
    ...     []
    ... ], data=np.arange(12).reshape(4, 3))
                       0   1   2
    cat1  cat2  cat3
    True  False True   0   1   2
    False True  True   3   4   5
    True  False False  6   7   8
    False False False  9  10  11
    """
    # One row per sample; a cell is True where the sample names the category
    # and missing everywhere else.
    df = pd.DataFrame([{name: True for name in names}
                       for names in memberships])
    for set_name in df.columns:
        if not hasattr(set_name, 'lower'):
            raise ValueError('Category names should be strings')
    if df.shape[1] == 0:
        raise ValueError('Require at least one category. None were found.')
    df.sort_index(axis=1, inplace=True)
    # Every present cell is True, so "not missing" is exactly the membership
    # mask. This yields bool columns directly, without filling object-dtype
    # columns (which newer pandas warns about when it downcasts the result).
    df = df.notna()
    df.set_index(list(df.columns), inplace=True)
    if data is None:
        return df.assign(ones=1)['ones']

    data = _convert_to_pandas(data)
    if len(data) != len(df):
        raise ValueError('memberships and data must have the same length. '
                         'Got len(memberships) == %d, len(data) == %d'
                         % (len(memberships), len(data)))
    data.index = df.index
    return data


def from_contents(contents, data=None, id_column='id'):
    """Build data from category listings

    Parameters
    ----------
    contents : Mapping (or iterable over pairs) of strings to sets
        Keys are category names, values are sets of identifiers (int or
        string). An object with an ``items`` method is read through it;
        anything else is iterated directly as (name, identifiers) pairs.
    data : DataFrame, optional
        If provided, this should be indexed by the identifiers used in
        `contents`.
    id_column : str, default='id'
        The column name to use for the identifiers in the output.

    Returns
    -------
    DataFrame
        `data` is returned with its index indicating category membership,
        including a column named according to id_column.
        If data is not given, the order of rows is not assured.

    Raises
    ------
    ValueError
        If an identifier is repeated within a category, if a category or a
        column of `data` is named `id_column`, if category names overlap
        with the columns of `data`, or if `contents` mentions identifiers
        that are not in ``data.index``.

    Notes
    -----
    The order of categories in the output DataFrame is determined from
    `contents`, which may have non-deterministic iteration order.

    Examples
    --------
    >>> from upsetplot import from_contents
    >>> contents = {'cat1': ['a', 'b', 'c'],
    ...             'cat2': ['b', 'd'],
    ...             'cat3': ['e']}
    >>> from_contents(contents)
                      id
    cat1  cat2  cat3
    True  False False  a
          True  False  b
          False False  c
    False True  False  d
          False True   e
    >>> import pandas as pd
    >>> contents = {'cat1': [0, 1, 2],
    ...             'cat2': [1, 3],
    ...             'cat3': [4]}
    >>> data = pd.DataFrame({'favourite': ['green', 'red', 'red',
    ...                                    'yellow', 'blue']})
    >>> from_contents(contents, data=data)
                       id favourite
    cat1  cat2  cat3
    True  False False   0     green
          True  False   1       red
          False False   2       red
    False True  False   3    yellow
          False True    4      blue
    """
    # Accept both mappings and plain iterables of (name, ids) pairs.
    pairs = contents.items() if hasattr(contents, 'items') else contents
    cat_series = [pd.Series(True, index=list(elements), name=name)
                  for name, elements in pairs]
    if not all(s.index.is_unique for s in cat_series):
        raise ValueError('Got duplicate ids in a category')

    # One column per category over the union of identifiers; cells are True
    # for members and missing otherwise. sort=False keeps pandas from
    # reordering (and warning about) the non-aligned axis.
    df = pd.concat(cat_series, axis=1, sort=False)
    if id_column in df.columns:
        raise ValueError('A category cannot be named %r' % id_column)
    # "Not missing" is exactly the membership mask and is always bool,
    # unlike filling an object-dtype frame with False.
    df = df.notna()
    cat_names = list(df.columns)

    if data is not None:
        if set(df.columns).intersection(data.columns):
            raise ValueError('Data columns overlap with category names')
        if id_column in data.columns:
            raise ValueError('data cannot contain a column named %r' %
                             id_column)
        not_in_data = df.drop(data.index, axis=0, errors='ignore')
        if len(not_in_data):
            raise ValueError('Found identifiers in contents that are not in '
                             'data: %r' % not_in_data.index.values)
        # Rows of data that no category mentions belong to no category.
        df = df.reindex(index=data.index, fill_value=False)
        df = pd.concat([data, df], axis=1, sort=False)
    df.index.name = id_column
    return df.reset_index().set_index(cat_names)

1	from __future__ import print_function, division, absolute_import	2✔
2	from numbers import Number	2✔
3	import functools	2✔
4	from distutils.version import LooseVersion	2✔
5	import warnings	2✔
6		2✔
7	import pandas as pd	2✔
8	import numpy as np	2✔
9
10
11	def generate_samples(seed=0, n_samples=10000, n_categories=3):	2✔
12	"""Generate artificial samples assigned to set intersections
13
14	Parameters
15	----------
16	seed : int
17	A seed for randomisation
18	n_samples : int
19	Number of samples to generate
20	n_categories : int
21	Number of categories (named "cat0", "cat1", ...) to generate
22
23	Returns
24	-------
25	DataFrame
26	Field 'value' is a weight or score for each element.
27	Field 'index' is a unique id for each element.
28	Index includes a boolean indicator mask for each category.
29
30	Note: Further fields may be added in future versions.
31
32	See Also
33	--------
34	generate_counts : Generates the counts for each subset of categories
35	corresponding to these samples.
36	"""
37	rng = np.random.RandomState(seed)	2✔
38	df = pd.DataFrame({'value': np.zeros(n_samples)})	2✔
39	for i in range(n_categories):	2✔
40	r = rng.rand(n_samples)	2✔
41	df['cat%d' % i] = r > rng.rand()	2✔
42	df['value'] += r	2✔
43
44	df.reset_index(inplace=True)	2✔
45	df.set_index(['cat%d' % i for i in range(n_categories)], inplace=True)	2✔
46	return df	2✔
47
48
49	def generate_counts(seed=0, n_samples=10000, n_categories=3):	2✔
50	"""Generate artificial counts corresponding to set intersections
51
52	Parameters
53	----------
54	seed : int
55	A seed for randomisation
56	n_samples : int
57	Number of samples to generate statistics over
58	n_categories : int
59	Number of categories (named "cat0", "cat1", ...) to generate
60
61	Returns
62	-------
63	Series
64	Counts indexed by boolean indicator mask for each category.
65
66	See Also
67	--------
68	generate_samples : Generates a DataFrame of samples that these counts are
69	derived from.
70	"""
71	df = generate_samples(seed=seed, n_samples=n_samples,	2✔
72	n_categories=n_categories)	2✔
73	return df.value.groupby(level=list(range(n_categories))).count()	2✔
74
75
76	def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False):	2✔
77	warnings.warn('generate_data was replaced by generate_counts in version '	2✔
UNCOV 78	'0.3 and will be removed in version 0.4.',	×
79	DeprecationWarning)	2✔
80	if aggregated:	2✔
81	return generate_counts(seed=seed, n_samples=n_samples,	×
UNCOV 82	n_categories=n_sets)	×
UNCOV 83	else:	×
84	return generate_samples(seed=seed, n_samples=n_samples,	2✔
85	n_categories=n_sets)['value']	2✔
86
87
88	def from_indicators(indicators, data=None):	2✔
89	"""Load category membership indicated by a boolean indicator matrix
90
91	This loader also supports the case where the indicator columns can be
92	derived from `data`.
93
94	.. versionadded:: 0.6
95
96	Parameters
97	----------
98	indicators : DataFrame-like of booleans, Sequence of str, or callable
99	Specifies the category indicators (boolean mask arrays) within
100	``data``, i.e. which records in ``data`` belong to which categories.
101
102	If a list of strings, these should be column names found in ``data``
103	whose values are boolean mask arrays.
104
105	If a DataFrame, its columns should correspond to categories, and its
106	index should be a subset of those in ``data``, values should be True
107	where a data record is in that category, and False or NA otherwise.
108
109	If callable, it will be applied to ``data`` after the latter is
110	converted to a Series or DataFrame.
111
112	data : Series-like or DataFrame-like, optional
113	If given, the index of category membership is attached to this data.
114	It must have the same length as `indicators`.
115	If not given, the series will contain the value 1.
116
117	Returns
118	-------
119	DataFrame or Series
120	`data` is returned with its index indicating category membership.
121	It will be a Series if `data` is a Series or 1d numeric array or None.
122
123	Notes
124	-----
125	Categories with indicators that are all False will be removed.
126
127	Examples
128	--------
129	>>> import pandas as pd
130	>>> from upsetplot import from_indicators
131	>>>
132	>>> # Just indicators:
133	>>> indicators = {"cat1": [True, False, True, False],
134	... "cat2": [False, True, False, False],
135	... "cat3": [True, True, False, False]}
136	>>> from_indicators(indicators)
137	cat1 cat2 cat3
138	True False True 1.0
139	False True True 1.0
140	True False False 1.0
141	False False False 1.0
142	Name: ones, dtype: float64
143	>>>
144	>>> # Where indicators are included within data, specifying
145	>>> # columns by name:
146	>>> data = pd.DataFrame({"value": [5, 4, 6, 4], **indicators})
147	>>> from_indicators(["cat1", "cat3"], data=data)
148	value cat1 cat2 cat3
149	cat1 cat3
150	True True 5 True False True
151	False True 4 False True True
152	True False 6 True False False
153	False False 4 False False False
154	>>>
155	>>> # Making indicators out of all boolean columns:
156	>>> from_indicators(lambda data: data.select_dtypes(bool), data=data)
157	value cat1 cat2 cat3
158	cat1 cat2 cat3
159	True False True 5 True False True
160	False True True 4 False True True
161	True False False 6 True False False
162	False False False 4 False False False
163	>>>
164	>>> # Using a dataset with missing data, we can use missingness as
165	>>> # an indicator:
166	>>> data = pd.DataFrame({"val1": [pd.NA, .7, pd.NA, .9],
167	... "val2": ["male", pd.NA, "female", "female"],
168	... "val3": [pd.NA, pd.NA, 23000, 78000]})
169	>>> from_indicators(pd.isna, data=data)
170	val1 val2 val3
171	val1 val2 val3
172	True False True <NA> male <NA>
173	False True True 0.7 <NA> <NA>
174	True False False <NA> female 23000
175	False False False 0.9 female 78000
176	"""
177	if data is not None:	2✔
178	data = _convert_to_pandas(data)	2✔
179
180	if callable(indicators):	2✔
181	if data is None:	2✔
182	raise ValueError("data must be provided when indicators is "	2✔
UNCOV 183	"callable")	×
184	indicators = indicators(data)	2✔
185
186	try:	2✔
187	indicators[0]	2✔
188	except Exception:	2✔
189	pass	2✔
UNCOV 190	else:	×
191	if isinstance(indicators[0], (str, int)):	2✔
192	if data is None:	2✔
193	raise ValueError("data must be provided when indicators are "	2✔
UNCOV 194	"specified as a list of columns")	×
195	if isinstance(indicators, tuple):	2✔
196	raise ValueError("indicators as tuple is not supported")	2✔
197	# column array
198	indicators = data[indicators]	2✔
199
200	indicators = pd.DataFrame(indicators).fillna(False).infer_objects()	2✔
201	# drop all-False (should we be dropping all-True also? making an option?)
202	indicators = indicators.loc[:, indicators.any(axis=0)]	2✔
203
204	if not all(dtype.kind == 'b' for dtype in indicators.dtypes):	2✔
205	raise ValueError('The indicators must all be boolean')	2✔
206
207	if data is not None:	2✔
208	if not (isinstance(indicators.index, pd.RangeIndex)	2✔
209	and indicators.index[0] == 0	2✔
210	and indicators.index[-1] == len(data) - 1):	2✔
211	# index is specified on indicators. Need to align it to data
212	if not indicators.index.isin(data.index).all():	2✔
213	raise ValueError("If indicators.index is not the default, "	2✔
UNCOV 214	"all its values must be present in "	×
UNCOV 215	"data.index")	×
216	indicators = indicators.reindex(index=data.index, fill_value=False)	2✔
UNCOV 217	else:	×
218	data = pd.Series(np.ones(len(indicators)), name="ones")	2✔
219
220	indicators.set_index(list(indicators.columns), inplace=True)	2✔
221	data.index = indicators.index	2✔
222
223	return data	2✔
224
225
226	def _convert_to_pandas(data, copy=True):	2✔
227	is_series = False	2✔
228	if hasattr(data, 'loc'):	2✔
229	if copy:	2✔
230	data = data.copy(deep=False)	2✔
231	is_series = data.ndim == 1	2✔
232	elif len(data):	2✔
233	try:	2✔
234	is_series = isinstance(data[0], Number)	2✔
235	except KeyError:	2✔
236	is_series = False	2✔
237	if is_series:	2✔
238	data = pd.Series(data)	2✔
UNCOV 239	else:	×
240	data = pd.DataFrame(data)	2✔
241	return data	2✔
242
243
244	def from_memberships(memberships, data=None):	2✔
245	"""Load data where each sample has a collection of category names
246
247	The output should be suitable for passing to `UpSet` or `plot`.
248
249	Parameters
250	----------
251	memberships : sequence of collections of strings
252	Each element corresponds to a data point, indicating the sets it is a
253	member of. Each category is named by a string.
254	data : Series-like or DataFrame-like, optional
255	If given, the index of category memberships is attached to this data.
256	It must have the same length as `memberships`.
257	If not given, the series will contain the value 1.
258
259	Returns
260	-------
261	DataFrame or Series
262	`data` is returned with its index indicating category membership.
263	It will be a Series if `data` is a Series or 1d numeric array.
264	The index will have levels ordered by category names.
265
266	Examples
267	--------
268	>>> from upsetplot import from_memberships
269	>>> from_memberships([
270	... ['cat1', 'cat3'],
271	... ['cat2', 'cat3'],
272	... ['cat1'],
273	... []
274	... ])
275	cat1 cat2 cat3
276	True False True 1
277	False True True 1
278	True False False 1
279	False False False 1
280	Name: ones, dtype: ...
281	>>> # now with data:
282	>>> import numpy as np
283	>>> from_memberships([
284	... ['cat1', 'cat3'],
285	... ['cat2', 'cat3'],
286	... ['cat1'],
287	... []
288	... ], data=np.arange(12).reshape(4, 3))
289	0 1 2
290	cat1 cat2 cat3
291	True False True 0 1 2
292	False True True 3 4 5
293	True False False 6 7 8
294	False False False 9 10 11
295	"""
296	df = pd.DataFrame([{name: True for name in names}	2✔
297	for names in memberships])	2✔
298	for set_name in df.columns:	2✔
299	if not hasattr(set_name, 'lower'):	2✔
300	raise ValueError('Category names should be strings')	2✔
301	if df.shape[1] == 0:	2✔
302	raise ValueError('Require at least one category. None were found.')	2✔
303	df.sort_index(axis=1, inplace=True)	2✔
304	df.fillna(False, inplace=True)	2✔
305	df = df.astype(bool)	2✔
306	df.set_index(list(df.columns), inplace=True)	2✔
307	if data is None:	2✔
308	return df.assign(ones=1)['ones']	2✔
309
310	data = _convert_to_pandas(data)	2✔
311	if len(data) != len(df):	2✔
312	raise ValueError('memberships and data must have the same length. '	2✔
UNCOV 313	'Got len(memberships) == %d, len(data) == %d'	×
314	% (len(memberships), len(data)))	2✔
315	data.index = df.index	2✔
316	return data	2✔
317
318
319	def from_contents(contents, data=None, id_column='id'):	2✔
320	"""Build data from category listings
321
322	Parameters
323	----------
324	contents : Mapping (or iterable over pairs) of strings to sets
325	Keys are category names, values are sets of identifiers (int or
326	string).
327	data : DataFrame, optional
328	If provided, this should be indexed by the identifiers used in
329	`contents`.
330	id_column : str, default='id'
331	The column name to use for the identifiers in the output.
332
333	Returns
334	-------
335	DataFrame
336	`data` is returned with its index indicating category membership,
337	including a column named according to id_column.
338	If data is not given, the order of rows is not assured.
339
340	Notes
341	-----
342	The order of categories in the output DataFrame is determined from
343	`contents`, which may have non-deterministic iteration order.
344
345	Examples
346	--------
347	>>> from upsetplot import from_contents
348	>>> contents = {'cat1': ['a', 'b', 'c'],
349	... 'cat2': ['b', 'd'],
350	... 'cat3': ['e']}
351	>>> from_contents(contents)
352	id
353	cat1 cat2 cat3
354	True False False a
355	True False b
356	False False c
357	False True False d
358	False True e
359	>>> import pandas as pd
360	>>> contents = {'cat1': [0, 1, 2],
361	... 'cat2': [1, 3],
362	... 'cat3': [4]}
363	>>> data = pd.DataFrame({'favourite': ['green', 'red', 'red',
364	... 'yellow', 'blue']})
365	>>> from_contents(contents, data=data)
366	id favourite
367	cat1 cat2 cat3
368	True False False 0 green
369	True False 1 red
370	False False 2 red
371	False True False 3 yellow
372	False True 4 blue
373	"""
374	cat_series = [pd.Series(True, index=list(elements), name=name)	2✔
375	for name, elements in contents.items()]	2✔
376	if not all(s.index.is_unique for s in cat_series):	2✔
377	raise ValueError('Got duplicate ids in a category')	2✔
378
379	concat = pd.concat	2✔
380	if LooseVersion(pd.__version__) >= '0.23.0':	2✔
381	# silence the warning
382	concat = functools.partial(concat, sort=False)	2✔
383
384	df = concat(cat_series, axis=1)	2✔
385	if id_column in df.columns:	2✔
386	raise ValueError('A category cannot be named %r' % id_column)	2✔
387	df.fillna(False, inplace=True)	2✔
388	cat_names = list(df.columns)	2✔
389
390	if data is not None:	2✔
391	if set(df.columns).intersection(data.columns):	2✔
392	raise ValueError('Data columns overlap with category names')	2✔
393	if id_column in data.columns:	2✔
394	raise ValueError('data cannot contain a column named %r' %	2✔
395	id_column)	2✔
396	not_in_data = df.drop(data.index, axis=0, errors='ignore')	2✔
397	if len(not_in_data):	2✔
398	raise ValueError('Found identifiers in contents that are not in '	2✔
399	'data: %r' % not_in_data.index.values)	2✔
400	df = df.reindex(index=data.index).fillna(False)	2✔
401	df = concat([data, df], axis=1)	2✔
402	df.index.name = id_column	2✔
403	return df.reset_index().set_index(cat_names)	2✔

jnothman / UpSetPlot / 7342943552

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous