• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

WenjieDu / PyPOTS / 4605364020

pending completion
4605364020

push

github

Wenjie Du
doc: update the documentation;

2665 of 3132 relevant lines covered (85.09%)

0.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

31.03
/pypots/data/load_specific_datasets.py
1
"""
1✔
2
Functions to load supported open-source time-series datasets.
3
"""
4

5
# Created by Wenjie Du <wenjay.du@gmail.com>
6
# License: GPL-v3
7

8
import pandas as pd
1✔
9
import tsdb
1✔
10
from pypots.utils.logging import logger
1✔
11

12
# Names of the open-source datasets PyPOTS knows how to load and preprocess.
SUPPORTED_DATASETS = ["physionet_2012"]


def list_supported_datasets():
    """Return the names of all datasets supported by PyPOTS.

    Returns
    -------
    list
        The list of supported dataset names (same object as
        ``SUPPORTED_DATASETS``).
    """
    return SUPPORTED_DATASETS
27

28

29
def preprocess_physionet2012(data):
    """Preprocess the PhysioNet-2012 dataset loaded by TSDB.

    Drops the static features, then pads/truncates every record so each
    sample covers exactly 48 time steps.

    Parameters
    ----------
    data : dict,
        A data dict from tsdb.load_dataset().

    Returns
    -------
    dict :
        A dict containing processed data.

    """
    features = data["X"].drop(data["static_features"], axis=1)

    def _pad_and_truncate(group):
        # Ensure the sample spans time steps 0..47: add rows for the
        # missing steps (values become NaN), sort by time, cut to 48.
        absent_steps = list(set(range(0, 48)).difference(set(group["Time"])))
        filler = pd.DataFrame({"Time": absent_steps})
        group = pd.concat([group, filler], ignore_index=False, sort=False)
        group = group.set_index("Time").sort_index().reset_index()
        return group.iloc[:48]

    features = (
        features.groupby("RecordID")
        .apply(_pad_and_truncate)
        .drop("RecordID", axis=1)  # drop the in-frame copy; it survives in the index
        .reset_index()
        .drop(["level_1", "Time"], axis=1)
    )
    return {"X": features, "y": data["y"]}
57

58

59
# Dispatch table: maps each supported dataset name to its preprocessing function.
PREPROCESSING = dict(physionet_2012=preprocess_physionet2012)
60

61

62
def load_specific_dataset(dataset_name, use_cache=True):
    """Load specific datasets supported by PyPOTS.

    Different from tsdb.load_dataset(), which only produces merely raw data,
    load_specific_dataset here does some preprocessing operations,
    like truncating time series to generate samples with the same length.

    Parameters
    ----------
    dataset_name : str,
        The name of the dataset to be loaded, which should be supported, i.e. in SUPPORTED_DATASETS.

    use_cache : bool,
        Whether to use cache. This is an argument of tsdb.load_dataset().

    Returns
    -------
    data : dict,
        A dict contains the preprocessed dataset.
        Users only need to continue the preprocessing steps to generate the data they want,
        e.g. standardizing and splitting.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of SUPPORTED_DATASETS.

    """
    logger.info(
        f"Loading the dataset {dataset_name} with TSDB (https://github.com/WenjieDu/Time_Series_Database)..."
    )
    # Validate with an explicit exception rather than `assert`: asserts are
    # stripped when Python runs with -O, which would silently skip this check.
    if dataset_name not in SUPPORTED_DATASETS:
        raise ValueError(
            f"Dataset {dataset_name} is not supported. "
            f"If you believe this dataset is valuable to be supported by PyPOTS, "
            f"please create an issue on GitHub "
            f"https://github.com/WenjieDu/PyPOTS/issues"
        )
    logger.info(f"Starting preprocessing {dataset_name}...")
    data = tsdb.load_dataset(dataset_name, use_cache)
    data = PREPROCESSING[dataset_name](data)
    return data
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc