• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

WenjieDu / PyPOTS / 4605364020

pending completion
4605364020

push

github

Wenjie Du
doc: update the documentation;

2665 of 3132 relevant lines covered (85.09%)

0.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

31.03
/pypots/data/load_specific_datasets.py
1
"""
1✔
2
Functions to load supported open-source time-series datasets.
3
"""
4

5
# Created by Wenjie Du <wenjay.du@gmail.com>
6
# License: GPL-v3
7

8
import pandas as pd
1✔
9
import tsdb
1✔
10
from pypots.utils.logging import logger
1✔
11

12
# Names of the open-source datasets PyPOTS knows how to load and preprocess.
SUPPORTED_DATASETS = ["physionet_2012"]


def list_supported_datasets():
    """Return the names of all datasets supported by PyPOTS.

    Returns
    -------
    list
        The list of supported dataset names (same object as
        ``SUPPORTED_DATASETS``).
    """
    return SUPPORTED_DATASETS
27

28

29
def preprocess_physionet2012(data):
    """Preprocess the PhysioNet-2012 dataset loaded by TSDB.

    Drops the static features, then pads/truncates every record so each
    sample covers exactly 48 time steps.

    Parameters
    ----------
    data : dict,
        A data dict from tsdb.load_dataset().

    Returns
    -------
    dict :
        A dict containing processed data.

    """
    features = data["X"].drop(data["static_features"], axis=1)

    def _pad_and_truncate(group):
        # Ensure the sample spans time steps 0..47: add rows for the
        # missing steps (values become NaN), sort by time, cut to 48.
        absent_steps = list(set(range(0, 48)).difference(set(group["Time"])))
        filler = pd.DataFrame({"Time": absent_steps})
        group = pd.concat([group, filler], ignore_index=False, sort=False)
        group = group.set_index("Time").sort_index().reset_index()
        return group.iloc[:48]

    features = (
        features.groupby("RecordID")
        .apply(_pad_and_truncate)
        .drop("RecordID", axis=1)  # drop the in-frame copy; it survives in the index
        .reset_index()
        .drop(["level_1", "Time"], axis=1)
    )
    return {"X": features, "y": data["y"]}
57

58

59
# Dispatch table: maps each supported dataset name to its preprocessing function.
PREPROCESSING = dict(physionet_2012=preprocess_physionet2012)
60

61

62
def load_specific_dataset(dataset_name, use_cache=True):
    """Load specific datasets supported by PyPOTS.

    Different from tsdb.load_dataset(), which only produces merely raw data,
    load_specific_dataset here does some preprocessing operations,
    like truncating time series to generate samples with the same length.

    Parameters
    ----------
    dataset_name : str,
        The name of the dataset to be loaded, which should be supported, i.e. in SUPPORTED_DATASETS.

    use_cache : bool,
        Whether to use cache. This is an argument of tsdb.load_dataset().

    Returns
    -------
    data : dict,
        A dict contains the preprocessed dataset.
        Users only need to continue the preprocessing steps to generate the data they want,
        e.g. standardizing and splitting.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of SUPPORTED_DATASETS.

    """
    logger.info(
        f"Loading the dataset {dataset_name} with TSDB (https://github.com/WenjieDu/Time_Series_Database)..."
    )
    # Validate with an explicit exception rather than `assert`: asserts are
    # stripped when Python runs with -O, which would silently skip this check.
    if dataset_name not in SUPPORTED_DATASETS:
        raise ValueError(
            f"Dataset {dataset_name} is not supported. "
            f"If you believe this dataset is valuable to be supported by PyPOTS, "
            f"please create an issue on GitHub "
            f"https://github.com/WenjieDu/PyPOTS/issues"
        )
    logger.info(f"Starting preprocessing {dataset_name}...")
    data = tsdb.load_dataset(dataset_name, use_cache)
    data = PREPROCESSING[dataset_name](data)
    return data
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc