KrishnaswamyLab / scprep, build 5313886175 (push via GitHub web-flow; pending completion)
Commit: fix fill_value

3 of 3 new or added lines in 1 file covered (100.0%)
3054 of 3193 relevant lines covered (95.65%)
0.96 hits per line

Source file: /scprep/utils.py (95.42% covered)

from decorator import decorator
from scipy import sparse

import importlib
import numbers
import numpy as np
import pandas as pd
import re
import warnings

try:
    ModuleNotFoundError
except NameError:
    # python 3.5
    ModuleNotFoundError = ImportError

__imported_pkgs = set()


def _try_import(pkg):
    try:
        return importlib.import_module(pkg)
    except ModuleNotFoundError:
        return None


def _version_check(version, min_version=None):
    if min_version is None:
        # no requirement
        return True
    min_version = str(min_version)
    min_version_split = re.split(r"[^0-9]+", min_version)
    version_split = re.split(r"[^0-9]+", version)
    version_major = int(version_split[0])
    min_major = int(min_version_split[0])
    if min_major > version_major:
        # failed major version requirement
        return False
    elif min_major < version_major:
        # exceeded major version requirement
        return True
    elif len(min_version_split) == 1:
        # no minor version requirement
        return True
    else:
        version_minor = int(version_split[1])
        min_minor = int(min_version_split[1])
        if min_minor > version_minor:
            # failed minor version requirement
            return False
        else:
            # met minor version requirement
            return True


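# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# How the major/minor comparison above plays out: only the first two numeric
# components are compared, and anything past the minor version is ignored.
#
#   >>> _version_check("1.5.2", min_version="1.4")
#   True
#   >>> _version_check("0.9", min_version=1)
#   False
#   >>> _version_check("1.5.2", min_version="1.5.9")   # patch level ignored
#   True
# ------------------------------------------------------------------------------

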
def check_version(pkg, min_version=None):
    try:
        module = importlib.import_module(pkg)
    except ModuleNotFoundError:
        raise ModuleNotFoundError(
            "{0} not found. "
            "Please install it with e.g. `pip install --user {0}`".format(pkg)
        )
    if not _version_check(module.__version__, min_version):
        raise ImportError(
            "{0}>={1} is required (installed: {2}). "
            "Please upgrade it with e.g."
            " `pip install --user --upgrade {0}`".format(
                pkg, min_version, module.__version__
            )
        )


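# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# check_version returns silently when the requirement is met and raises
# otherwise. Assumes numpy is installed; the name "no_such_pkg" is
# hypothetical.
#
#   >>> check_version("numpy", min_version="1.0")   # returns None silently
#   >>> check_version("no_such_pkg")
#   Traceback (most recent call last):
#   ...
#   ModuleNotFoundError: no_such_pkg not found. Please install it with e.g. `pip install --user no_such_pkg`
# ------------------------------------------------------------------------------

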
@decorator
def _with_pkg(fun, pkg=None, min_version=None, *args, **kwargs):
    global __imported_pkgs
    if (pkg, min_version) not in __imported_pkgs:
        check_version(pkg, min_version=min_version)
        __imported_pkgs.add((pkg, min_version))
    return fun(*args, **kwargs)


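# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# Because `_with_pkg` is wrapped with `@decorator`, it can be applied with
# keyword arguments; the dependency is checked once on first call, then
# cached in `__imported_pkgs`. The decorated function below is hypothetical.
#
#   >>> @_with_pkg(pkg="fcsparser", min_version="0.2")
#   ... def load_fcs(path):
#   ...     pass
# ------------------------------------------------------------------------------

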
def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False):
    """Get a cutoff for a dataset.

    Parameters
    ----------
    data : array-like
    cutoff : float or None, optional (default: None)
        Absolute cutoff value. Only one of cutoff and percentile may be given.
    percentile : float or None, optional (default: None)
        Percentile cutoff value between 0 and 100.
        Only one of cutoff and percentile may be given.
    required : bool, optional (default: False)
        If True, one of cutoff and percentile must be given.

    Returns
    -------
    cutoff : float or None
        Absolute cutoff value. Can only be None if required is False and
        cutoff and percentile are both None.
    """
    if percentile is not None:
        if cutoff is not None:
            raise ValueError(
                "Only one of `cutoff` and `percentile` should be given. "
                "Got cutoff={}, percentile={}".format(cutoff, percentile)
            )
        if not isinstance(percentile, numbers.Number):
            return [_get_percentile_cutoff(data, percentile=p) for p in percentile]
        if percentile < 1:
            warnings.warn(
                "`percentile` expects values between 0 and 100. "
                "Got {}. Did you mean {}?".format(percentile, percentile * 100),
                UserWarning,
            )
        cutoff = np.percentile(np.array(data).reshape(-1), percentile)
    elif cutoff is None and required:
        raise ValueError("One of either `cutoff` or `percentile` must be given.")
    return cutoff


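# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# With `percentile`, the cutoff is computed from the data; with `cutoff`, it
# passes through unchanged. Values below 1 trigger the "did you mean" warning.
#
#   >>> data = np.arange(101)
#   >>> _get_percentile_cutoff(data, percentile=90)
#   90.0
#   >>> _get_percentile_cutoff(data, cutoff=3.5)
#   3.5
# ------------------------------------------------------------------------------

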
def _get_filter_idx(values, cutoff, percentile, keep_cells):
    """Return a boolean array to index cells based on a filter.

    Parameters
    ----------
    values : list-like, shape=[n_samples]
        Value upon which to filter
    cutoff : float or tuple of floats, optional (default: None)
        Value above or below which to retain cells. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int or tuple of ints, optional (default: None)
        Percentile above or below which to retain cells.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
        Keep cells above, below or between the cutoff.
        If None, defaults to 'above' when a single cutoff is given and
        'between' when two cutoffs are given.

    Returns
    -------
    keep_cells_idx : list-like
        Boolean retention array
    """
    cutoff = _get_percentile_cutoff(values, cutoff, percentile, required=True)
    if keep_cells is None:
        if isinstance(cutoff, numbers.Number):
            keep_cells = "above"
        else:
            keep_cells = "between"
    if keep_cells == "above":
        if not isinstance(cutoff, numbers.Number):
            raise ValueError(
                "Expected a single cutoff with keep_cells='above'."
                " Got {}".format(cutoff)
            )
        keep_cells_idx = values > cutoff
    elif keep_cells == "below":
        if not isinstance(cutoff, numbers.Number):
            raise ValueError(
                "Expected a single cutoff with keep_cells='below'."
                " Got {}".format(cutoff)
            )
        keep_cells_idx = values < cutoff
    elif keep_cells == "between":
        if isinstance(cutoff, numbers.Number) or len(cutoff) != 2:
            raise ValueError(
                "Expected cutoff of length 2 with keep_cells='between'."
                " Got {}".format(cutoff)
            )
        keep_cells_idx = np.logical_and(
            values > np.min(cutoff), values < np.max(cutoff)
        )
    else:
        raise ValueError(
            "Expected `keep_cells` in ['above', 'below', 'between']. "
            "Got {}".format(keep_cells)
        )
    return keep_cells_idx


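# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# A single cutoff defaults to keep_cells='above'; a pair defaults to
# 'between' (exclusive on both sides).
#
#   >>> values = np.array([1, 5, 10, 50, 100])
#   >>> _get_filter_idx(values, 8, None, None)
#   array([False, False,  True,  True,  True])
#   >>> _get_filter_idx(values, (2, 60), None, None)
#   array([False,  True,  True,  True, False])
# ------------------------------------------------------------------------------

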
def _check_numpy_dtype(x):
    try:
        if all([len(xi) == len(x[0]) for xi in x]):
            # all sequences of the same length; infer dtype
            return None
        else:
            # sequences of different lengths; object dtype is forced
            return object
    except TypeError as e:
        if str(e).startswith("sparse matrix length is ambiguous"):
            # list contains sparse matrices; must be object
            return object
        elif str(e).endswith("has no len()"):
            if any([hasattr(xi, "__len__") for xi in x]):
                # some sequences and some not; must be object
                return object
            else:
                # no sequences; infer
                return None
        else:
            raise


def toarray(x):
    """Convert an array-like to a np.ndarray.

    Parameters
    ----------
    x : array-like
        Array-like to be converted

    Returns
    -------
    x : np.ndarray
    """
    if is_SparseDataFrame(x):
        x = x.to_coo().toarray()
    elif is_SparseSeries(x):
        x = x.to_dense().to_numpy()
    elif isinstance(x, (pd.DataFrame, pd.Series, pd.Index)):
        x = x.to_numpy()
    elif isinstance(x, sparse.spmatrix):
        x = x.toarray()
    elif isinstance(x, np.matrix):
        x = x.A
    elif isinstance(x, list):
        x_out = []
        for xi in x:
            try:
                xi = toarray(xi)
            except TypeError:
                # recursed too far
                pass
            x_out.append(xi)
        # convert x_out from list to array
        x = np.array(x_out, dtype=_check_numpy_dtype(x_out))
    elif isinstance(x, (np.ndarray, numbers.Number)):
        pass
    else:
        raise TypeError("Expected array-like. Got {}".format(type(x)))
    return x


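# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# toarray densifies whatever it is handed: pandas objects, scipy sparse
# matrices, np.matrix, and (possibly nested) lists all come back as ndarray.
#
#   >>> toarray(pd.Series([1, 2, 3]))
#   array([1, 2, 3])
#   >>> toarray(sparse.eye(2).tocsr())
#   array([[1., 0.],
#          [0., 1.]])
# ------------------------------------------------------------------------------

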
def to_array_or_spmatrix(x):
    """Convert an array-like to a np.ndarray or scipy.sparse.spmatrix.

    Parameters
    ----------
    x : array-like
        Array-like to be converted

    Returns
    -------
    x : np.ndarray or scipy.sparse.spmatrix
    """
    if is_SparseDataFrame(x):
        x = x.to_coo()
    elif is_sparse_dataframe(x) or is_sparse_series(x):
        x = x.sparse.to_coo()
    elif isinstance(
        x, (sparse.spmatrix, np.ndarray, numbers.Number)
    ) and not isinstance(x, np.matrix):
        pass
    elif isinstance(x, list):
        x_out = []
        for xi in x:
            try:
                xi = to_array_or_spmatrix(xi)
            except TypeError:
                # recursed too far
                pass
            x_out.append(xi)
        # convert x_out from list to array
        x = np.array(x_out, dtype=_check_numpy_dtype(x_out))
    else:
        x = toarray(x)
    return x


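# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# Unlike toarray, sparse inputs stay sparse here; only everything else is
# densified.
#
#   >>> X = sparse.eye(3).tocsr()
#   >>> to_array_or_spmatrix(X) is X
#   True
#   >>> type(to_array_or_spmatrix(pd.DataFrame(np.eye(3))))
#   <class 'numpy.ndarray'>
# ------------------------------------------------------------------------------

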
def is_SparseSeries(X):
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "The SparseSeries class is removed from pandas. Accessing it from the "
            "top-level namespace will also be removed in the next version",
            FutureWarning,
        )
        try:
            return isinstance(X, pd.SparseSeries)
        except AttributeError:
            return False


def is_SparseDataFrame(X):
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "The SparseDataFrame class is removed from pandas. Accessing it from the "
            "top-level namespace will also be removed in the next version",
            FutureWarning,
        )
        try:
            return isinstance(X, pd.SparseDataFrame)
        except AttributeError:
            return False


def is_sparse_dataframe(x):
    if isinstance(x, pd.DataFrame) and not is_SparseDataFrame(x):
        try:
            x.sparse
            return True
        except AttributeError:
            pass
    return False


def is_sparse_series(x):
    if isinstance(x, pd.Series) and not is_SparseSeries(x):
        try:
            x.sparse
            return True
        except AttributeError:
            pass
    return False


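# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# The CamelCase checks target the legacy pd.SparseDataFrame / pd.SparseSeries
# classes; the snake_case checks detect modern DataFrames and Series backed
# by SparseArray columns.
#
#   >>> df = pd.DataFrame(np.eye(2))
#   >>> is_sparse_dataframe(df)
#   False
#   >>> is_sparse_dataframe(df.astype(pd.SparseDtype(float, 0.0)))
#   True
# ------------------------------------------------------------------------------

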
def dataframe_to_sparse(x, fill_value=0.0):
    x = pd.DataFrame.sparse.from_spmatrix(
        sparse.coo_matrix(x.values), index=x.index, columns=x.columns
    )
    x.sparse.fill_value = fill_value
    return x


def SparseDataFrame(X, columns=None, index=None, default_fill_value=0.0):
    if sparse.issparse(X):
        X = pd.DataFrame.sparse.from_spmatrix(X)
        X.sparse.fill_value = default_fill_value
    else:
        if is_SparseDataFrame(X) or not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        X = dataframe_to_sparse(X, fill_value=default_fill_value)
    if columns is not None:
        X.columns = columns
    if index is not None:
        X.index = index
    return X


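# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# SparseDataFrame builds a pandas DataFrame with sparse columns from either a
# scipy matrix or dense input, applying the requested fill value.
#
#   >>> X = SparseDataFrame(sparse.eye(3), columns=["a", "b", "c"])
#   >>> is_sparse_dataframe(X)
#   True
#   >>> X.shape
#   (3, 3)
# ------------------------------------------------------------------------------

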
def matrix_transform(data, fun, *args, **kwargs):
    """Perform a numerical transformation on data.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    fun : callable
        Numerical transformation function, `np.ufunc` or similar.
    args, kwargs : additional arguments, optional
        arguments for `fun`. `data` is always passed as the first argument

    Returns
    -------
    data : array-like, shape=[n_samples, n_features]
        Transformed output data
    """
    if is_sparse_dataframe(data) or is_SparseDataFrame(data):
        data = data.copy()
        for col in data.columns:
            data[col] = fun(data[col], *args, **kwargs)
    elif sparse.issparse(data):
        if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
            data = data.tocsr()
        else:
            # avoid modifying in place
            data = data.copy()
        data.data = fun(data.data, *args, **kwargs)
    else:
        data = fun(data, *args, **kwargs)
    return data


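# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# For scipy matrices only the stored (nonzero) values are passed through
# `fun`, so transforms that map 0 -> 0 (like log1p) are cheap and exact.
#
#   >>> X = sparse.csr_matrix(np.array([[0.0, 1.0], [2.0, 3.0]]))
#   >>> np.round(matrix_transform(X, np.log1p).toarray(), 4)
#   array([[0.    , 0.6931],
#          [1.0986, 1.3863]])
# ------------------------------------------------------------------------------

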
def fillna(data, fill, copy=True):
    return_cls = None
    if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
        return_cls = type(data)
        assert copy, f"Cannot fillna in-place for {return_cls.__name__}"
        data = data.tocsr()
    elif copy:
        data = data.copy()
    if sparse.issparse(data):
        data.data[np.isnan(data.data)] = fill
        if return_cls is not None:
            data = return_cls(data)
    else:
        data[np.isnan(data)] = fill
    return data


def _nansum(data, axis=None):
    if sparse.issparse(data):
        return np.sum(fillna(data, 0), axis=axis)
    else:
        return np.nansum(data, axis=axis)


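# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# fillna replaces NaNs among the stored values of a sparse matrix (or
# element-wise for dense input); _nansum builds on it so NaNs count as zero.
#
#   >>> X = sparse.csr_matrix(np.array([[np.nan, 1.0], [0.0, 2.0]]))
#   >>> fillna(X, 0.0).toarray()
#   array([[0., 1.],
#          [0., 2.]])
#   >>> _nansum(X)
#   3.0
# ------------------------------------------------------------------------------

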
def matrix_sum(data, axis=None, ignore_nan=False):
    """Get the column-wise, row-wise, or total sum of values in a matrix.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    axis : int or None, optional (default: None)
        Axis across which to sum. axis=0 gives column sums,
        axis=1 gives row sums. None gives the total sum.
    ignore_nan : bool, optional (default: False)
        If True, uses `np.nansum` instead of `np.sum`

    Returns
    -------
    sums : array-like or float
        Sums along desired axis.
    """
    sum_fn = _nansum if ignore_nan else np.sum
    if axis not in [0, 1, None]:
        raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
    if isinstance(data, pd.DataFrame):
        if is_SparseDataFrame(data):
            if axis is None:
                sums = sum_fn(data.to_coo())
            else:
                index = data.index if axis == 1 else data.columns
                sums = pd.Series(
                    np.array(sum_fn(data.to_coo(), axis)).flatten(), index=index
                )
        elif is_sparse_dataframe(data):
            if axis is None:
                sums = sum_fn(data.sparse.to_coo())
            else:
                index = data.index if axis == 1 else data.columns
                sums = pd.Series(
                    np.array(sum_fn(data.sparse.to_coo(), axis)).flatten(), index=index
                )
        elif axis is None:
            sums = sum_fn(data.to_numpy())
        else:
            sums = sum_fn(data, axis)
    else:
        sums = sum_fn(data, axis=axis)
        if isinstance(sums, np.matrix):
            sums = np.array(sums).flatten()
    return sums


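# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# axis=0 sums columns, axis=1 sums rows, None sums everything; np.matrix
# results are flattened back to 1-d arrays.
#
#   >>> X = np.array([[1, 2], [3, 4]])
#   >>> matrix_sum(X)
#   10
#   >>> matrix_sum(X, axis=0)
#   array([4, 6])
# ------------------------------------------------------------------------------

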
def matrix_std(data, axis=None):
    """Get the column-wise, row-wise, or total standard deviation of a matrix.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    axis : int or None, optional (default: None)
        Axis across which to calculate standard deviation.
        axis=0 gives column standard deviation,
        axis=1 gives row standard deviation.
        None gives the total standard deviation.

    Returns
    -------
    std : array-like or float
        Standard deviation along desired axis.
    """
    if axis not in [0, 1, None]:
        raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
    index = None
    if isinstance(data, pd.DataFrame) and axis is not None:
        if axis == 1:
            index = data.index
        elif axis == 0:
            index = data.columns
    data = to_array_or_spmatrix(data)
    if sparse.issparse(data):
        if axis is None:
            if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
                data = data.tocoo()
            data_sq = data.copy()
            data_sq.data = data_sq.data**2
            variance = data_sq.mean() - data.mean() ** 2
            std = np.sqrt(variance)
        else:
            if axis == 0:
                data = data.tocsc()
                next_fn = data.getcol
                N = data.shape[1]
            elif axis == 1:
                data = data.tocsr()
                next_fn = data.getrow
                N = data.shape[0]
            std = []
            for i in range(N):
                col = next_fn(i)
                col_sq = col.copy()
                col_sq.data = col_sq.data**2
                variance = col_sq.mean() - col.mean() ** 2
                std.append(np.sqrt(variance))
            std = np.array(std)
    else:
        std = np.std(data, axis=axis)
    if index is not None:
        std = pd.Series(std, index=index, name="std")
    return std


def matrix_vector_elementwise_multiply(data, multiplier, axis=None):
    """Elementwise multiply a matrix by a vector.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    multiplier : array-like, shape=[n_samples, 1] or [1, n_features]
        Vector by which to multiply `data`
    axis : int or None, optional (default: None)
        Axis along which to multiply. axis=0 multiplies each column,
        axis=1 multiplies each row. None guesses based on dimensions

    Returns
    -------
    product : array-like
        Multiplied matrix
    """
    if axis not in [0, 1, None]:
        raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))

    if axis is None:
        if data.shape[0] == data.shape[1]:
            raise RuntimeError(
                "`data` is square, cannot guess axis from input. "
                "Please provide `axis=0` to multiply along rows or "
                "`axis=1` to multiply along columns."
            )
        elif np.prod(multiplier.shape) == data.shape[0]:
            axis = 0
        elif np.prod(multiplier.shape) == data.shape[1]:
            axis = 1
        else:
            raise ValueError(
                "Expected `multiplier` to be a vector of length "
                "`data.shape[0]` ({}) or `data.shape[1]` ({}). Got {}".format(
                    data.shape[0], data.shape[1], multiplier.shape
                )
            )
    multiplier = toarray(multiplier)
    if axis == 0:
        if not np.prod(multiplier.shape) == data.shape[0]:
            raise ValueError(
                "Expected `multiplier` to be a vector of length "
                "`data.shape[0]` ({}). Got {}".format(data.shape[0], multiplier.shape)
            )
        multiplier = multiplier.reshape(-1, 1)
    else:
        if not np.prod(multiplier.shape) == data.shape[1]:
            raise ValueError(
                "Expected `multiplier` to be a vector of length "
                "`data.shape[1]` ({}). Got {}".format(data.shape[1], multiplier.shape)
            )
        multiplier = multiplier.reshape(1, -1)

    if is_SparseDataFrame(data) or is_sparse_dataframe(data):
        data = data.copy()
        multiplier = multiplier.flatten()
        if axis == 0:
            for col in data.columns:
                try:
                    mult_indices = data[col].values.sp_index.indices
                except AttributeError:
                    mult_indices = data[col].values.sp_index.to_int_index().indices
                new_data = data[col].values.sp_values * multiplier[mult_indices]
                data[col].values.sp_values.put(
                    np.arange(data[col].sparse.npoints), new_data
                )
        else:
            for col, mult in zip(data.columns, multiplier):
                data[col] = data[col] * mult
    elif isinstance(data, pd.DataFrame):
        data = data.mul(multiplier.flatten(), axis=axis)
    elif sparse.issparse(data):
        if isinstance(
            data,
            (
                sparse.lil_matrix,
                sparse.dok_matrix,
                sparse.coo_matrix,
                sparse.bsr_matrix,
                sparse.dia_matrix,
            ),
        ):
            data = data.tocsr()
        data = data.multiply(multiplier)
    else:
        data = data * multiplier

    return data


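# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# axis=0 expects one multiplier entry per row; axis=1 expects one per column.
# axis=None guesses from the shapes when the matrix is not square.
#
#   >>> X = np.ones((2, 3))
#   >>> matrix_vector_elementwise_multiply(X, np.array([1.0, 2.0]), axis=0)
#   array([[1., 1., 1.],
#          [2., 2., 2.]])
# ------------------------------------------------------------------------------

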
def sparse_series_min(data):
    """Get the minimum value from a pandas sparse series.

    np.min does not take the fill value of a sparse Series into account.

    Parameters
    ----------
    data : pd.Series[SparseArray]
        Input data

    Returns
    -------
    minimum : float
        Minimum entry in `data`.
    """
    return np.concatenate([data.sparse.sp_values, [data.sparse.fill_value]]).min()


def matrix_min(data):
    """Get the minimum value from a data matrix.

    Pandas SparseDataFrame does not handle np.min.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data

    Returns
    -------
    minimum : float
        Minimum entry in `data`.
    """
    if is_SparseDataFrame(data):
        data = [np.min(data[col]) for col in data.columns]
    elif is_sparse_dataframe(data):
        data = [sparse_series_min(data[col]) for col in data.columns]
    elif isinstance(data, pd.DataFrame):
        data = np.min(data)
    elif isinstance(data, sparse.lil_matrix):
        data = [np.min(d) for d in data.data] + [0]
    elif isinstance(data, sparse.dok_matrix):
        data = list(data.values()) + [0]
    elif isinstance(data, sparse.dia_matrix):
        data = [np.min(data.data), 0]
    return np.min(data)


def matrix_non_negative(data, allow_equal=True):
    """Check if all values in a matrix are non-negative.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    allow_equal : bool, optional (default: True)
        If True, min(data) can be equal to 0

    Returns
    -------
    is_non_negative : bool
    """
    return matrix_min(data) >= 0 if allow_equal else matrix_min(data) > 0


def matrix_any(condition):
    """Check if a condition is true anywhere in a data matrix.

    np.any doesn't handle matrices of type pd.DataFrame

    Parameters
    ----------
    condition : array-like
        Boolean matrix

    Returns
    -------
    any : bool
        True if condition contains any True values, False otherwise
    """
    return np.sum(np.sum(condition)) > 0


def matrix_transpose(X):
    """Transpose a matrix in a memory-efficient manner.

    Pandas sparse dataframes are transposed via scipy.sparse rather than
    coerced to dense.

    Parameters
    ----------
    X : array-like, shape=[n,m]
        Input data

    Returns
    -------
    X_T : array-like, shape=[m,n]
        Transposed input data
    """
    if is_sparse_dataframe(X):
        fill_values = np.array([dtype.fill_value for dtype in X.dtypes])
        if not np.all(fill_values == fill_values[0]):
            raise TypeError(
                "Can only transpose sparse dataframes with constant fill value. "
                "If you wish to proceed, first convert the data to dense with "
                "scprep.utils.toarray."
            )
        X_T = X.sparse.to_coo().T
        return SparseDataFrame(
            X_T, index=X.columns, columns=X.index, default_fill_value=fill_values[0]
        )
    else:
        return X.T


def check_consistent_columns(data, common_columns_only=True):
    """Ensure that a set of data matrices have consistent columns.

    Parameters
    ----------
    data : list of array-likes
        List of matrices to be checked
    common_columns_only : bool, optional (default: True)
        With pandas inputs, drop any columns that are not common to
        all matrices

    Returns
    -------
    data : list of array-likes
        List of matrices with consistent columns, subsetted if necessary

    Raises
    ------
    ValueError
        Raised if data has inconsistent number of columns and does not
        have column names for subsetting
    """
    matrix_type = type(data[0])
    matrix_shape = data[0].shape[1]
    if issubclass(matrix_type, pd.DataFrame):
        if not (
            np.all([d.shape[1] == matrix_shape for d in data[1:]])
            and np.all([data[0].columns == d.columns for d in data])
        ):
            if common_columns_only:
                common_genes = data[0].columns.values
                for d in data[1:]:
                    common_genes = common_genes[np.isin(common_genes, d.columns.values)]
                warnings.warn(
                    "Input data has inconsistent column names. "
                    "Subsetting to {} common columns. "
                    "To retain all columns, use "
                    "`common_columns_only=False`.".format(len(common_genes)),
                    UserWarning,
                )
                for i in range(len(data)):
                    data[i] = data[i][common_genes]
            else:
                columns = [d.columns.values for d in data]
                all_columns = np.unique(np.concatenate(columns))
                warnings.warn(
                    "Input data has inconsistent column names. "
                    "Padding with zeros to {} total columns.".format(len(all_columns)),
                    UserWarning,
                )
    else:
        for d in data[1:]:
            if not d.shape[1] == matrix_shape:
                shapes = ", ".join([str(d.shape[1]) for d in data])
                raise ValueError(
                    "Expected data all with the same number of "
                    "columns. Got {}".format(shapes)
                )
    return data


def combine_batches(
    data, batch_labels, append_to_cell_names=None, common_columns_only=True
):
    """Combine data matrices from multiple batches and store a batch label.

    Parameters
    ----------
    data : list of array-like, shape=[n_batch]
        All matrices must be of the same format and have the same number of
        columns (or genes).
    batch_labels : list of `str`, shape=[n_batch]
        List of names assigned to each batch
    append_to_cell_names : bool, optional (default: None)
        If input is a pandas dataframe, add the batch label corresponding to
        each cell to its existing index (or cell name / barcode).
        Default behavior is `True` for dataframes and `False` otherwise.
    common_columns_only : bool, optional (default: True)
        With pandas inputs, drop any columns that are not common to
        all data matrices

    Returns
    -------
    data : data matrix, shape=[n_samples, n_features]
        Number of samples is the sum of numbers of samples of all batches.
        Number of features is the same as each of the batches.
    sample_labels : list-like, shape=[n_samples]
        Batch labels corresponding to each sample
    """
    if not len(data) == len(batch_labels):
        raise ValueError(
            "Expected data ({}) and batch_labels ({}) to be the "
            "same length.".format(len(data), len(batch_labels))
        )

    # check consistent type
    matrix_type = type(data[0])
    if is_SparseDataFrame(data[0]):
        matrix_type = pd.DataFrame
    if not issubclass(matrix_type, (np.ndarray, pd.DataFrame, sparse.spmatrix)):
        raise ValueError(
            "Expected data to contain pandas DataFrames, "
            "scipy sparse matrices or numpy arrays. "
            "Got {}".format(matrix_type.__name__)
        )
    for d in data[1:]:
        if not isinstance(d, matrix_type):
            types = ", ".join([type(d).__name__ for d in data])
            raise TypeError(
                "Expected data all of the same class. Got {}".format(types)
            )

    data = check_consistent_columns(data, common_columns_only=common_columns_only)

    # check append_to_cell_names
    if append_to_cell_names and not issubclass(matrix_type, pd.DataFrame):
        warnings.warn(
            "append_to_cell_names only valid for pd.DataFrame input."
            " Got {}".format(matrix_type.__name__),
            UserWarning,
        )
    elif append_to_cell_names is None:
        if issubclass(matrix_type, pd.DataFrame):
            if all([isinstance(d.index, pd.RangeIndex) for d in data]):
                # a RangeIndex should remain a RangeIndex
                append_to_cell_names = False
            else:
                append_to_cell_names = True
        else:
            append_to_cell_names = False

    # concatenate labels
    sample_labels = np.concatenate(
        [np.repeat(batch_labels[i], d.shape[0]) for i, d in enumerate(data)]
    )

    # concatenate data
    if issubclass(matrix_type, pd.DataFrame):
        data_combined = pd.concat(data, axis=0, sort=True, join="outer").fillna(0)
        if append_to_cell_names:
            index = np.concatenate(
                [
                    np.core.defchararray.add(
                        np.array(d.index, dtype=str), "_" + str(batch_labels[i])
                    )
                    for i, d in enumerate(data)
                ]
            )
            data_combined.index = index
        elif all([isinstance(d.index, pd.RangeIndex) for d in data]):
            # a RangeIndex should remain a RangeIndex
            data_combined = data_combined.reset_index(drop=True)
        sample_labels = pd.Series(
            sample_labels, index=data_combined.index, name="sample_labels"
        )
    elif issubclass(matrix_type, sparse.spmatrix):
        data_combined = sparse.vstack(data)
    elif issubclass(matrix_type, np.ndarray):
        data_combined = np.vstack(data)

    return data_combined, sample_labels


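# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# Stacks batches row-wise and returns one label per row. With RangeIndex
# inputs the result keeps a fresh RangeIndex and labels come back as a
# pd.Series aligned to it.
#
#   >>> X1 = pd.DataFrame(np.zeros((2, 3)), columns=["a", "b", "c"])
#   >>> X2 = pd.DataFrame(np.ones((2, 3)), columns=["a", "b", "c"])
#   >>> Y, labels = combine_batches([X1, X2], ["batch1", "batch2"])
#   >>> Y.shape
#   (4, 3)
#   >>> labels.tolist()
#   ['batch1', 'batch1', 'batch2', 'batch2']
# ------------------------------------------------------------------------------

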
def select_cols(data, idx):
    raise RuntimeError(
        "`scprep.utils.select_cols` is deprecated. Use "
        "`scprep.select.select_cols` instead."
    )


def select_rows(data, idx):
    raise RuntimeError(
        "`scprep.utils.select_rows` is deprecated. Use "
        "`scprep.select.select_rows` instead."
    )


def get_gene_set(data, starts_with=None, ends_with=None, regex=None):
    raise RuntimeError(
        "`scprep.utils.get_gene_set` is deprecated. Use "
        "`scprep.select.get_gene_set` instead."
    )


def get_cell_set(data, starts_with=None, ends_with=None, regex=None):
    raise RuntimeError(
        "`scprep.utils.get_cell_set` is deprecated. Use "
        "`scprep.select.get_cell_set` instead."
    )


def subsample(*data, n=10000, seed=None):
    raise RuntimeError(
        "`scprep.utils.subsample` is deprecated. Use "
        "`scprep.select.subsample` instead."
    )


def sort_clusters_by_values(clusters, values):
    """Sort `clusters` in increasing order of `values`.

    Parameters
    ----------
    clusters : array-like
        An array of cluster assignments, like the output of
        a `fit_predict()` call.
    values : array-like
        An associated value for each index in `clusters` to use
        for sorting the clusters.

    Returns
    -------
    new_clusters : array-like
        Reordered cluster assignments. `np.mean(values[new_clusters == 0])`
        will be less than `np.mean(values[new_clusters == 1])`, which
        will be less than `np.mean(values[new_clusters == 2])`,
        and so on.
    """
    clusters = toarray(clusters)
    values = toarray(values)
    if not len(clusters) == len(values):
        raise ValueError(
            "Expected clusters ({}) and values ({}) to be the "
            "same length.".format(len(clusters), len(values))
        )

    uniq_clusters = np.unique(clusters)
    means = np.array([np.mean(values[clusters == cl]) for cl in uniq_clusters])
    new_clust_map = {
        curr_cl: i for i, curr_cl in enumerate(uniq_clusters[np.argsort(means)])
    }

    return np.array([new_clust_map[cl] for cl in clusters])


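# -- Editor's sketch (illustrative; not part of scprep/utils.py) --------------
# Cluster ids are relabeled so that 0 has the smallest mean value, 1 the next
# smallest, and so on; cluster membership itself is unchanged.
#
#   >>> clusters = np.array([0, 0, 1, 1, 2, 2])
#   >>> values = np.array([5.0, 5.0, 1.0, 1.0, 3.0, 3.0])
#   >>> sort_clusters_by_values(clusters, values)
#   array([2, 2, 0, 0, 1, 1])
# ------------------------------------------------------------------------------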