WenjieDu / BenchPOTS / 10832566580

12 Sep 2024 02:09PM UTC coverage: 77.075% (-0.4%) from 77.508%
Build 10832566580 (push via GitHub, committer: web-flow)

Merge pull request #19 from WenjieDu/dev

Add random walk dataset and release v0.3

76 of 102 new or added lines in 3 files covered. (74.51%)

2 existing lines in 2 files now uncovered.

585 of 759 relevant lines covered (77.08%)

3.85 hits per line

Source File

/benchpots/datasets/solar_alabama.py (16.33% of relevant lines covered in this build: the module docstring, imports, and function signature are hit; the function body is uncovered)
"""
Preprocessing function for the dataset Solar Alabama.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import pandas as pd
import tsdb
from sklearn.preprocessing import StandardScaler

from ..utils.logging import logger, print_final_dataset_info
from ..utils.missingness import create_missingness
from ..utils.sliding import sliding_window


def preprocess_solar_alabama(
    rate,
    n_steps,
    pattern: str = "point",
    **kwargs,
) -> dict:
    """Load and preprocess the dataset Solar Alabama.

    Parameters
    ----------
    rate:
        The missing rate.

    n_steps:
        The number of time steps in the generated data samples.
        Also the window size of the sliding window.

    pattern:
        The missing pattern to apply to the dataset.
        Must be one of ['point', 'subseq', 'block'].

    Returns
    -------
    processed_dataset :
        A dictionary containing the processed Solar Alabama dataset.
    """

    assert 0 <= rate < 1, f"rate must be in [0, 1), but got {rate}"
    assert n_steps > 0, f"n_steps must be larger than 0, but got {n_steps}"

    # read the raw data
    data = tsdb.load("solar_alabama")
    df = data["X"]

    feature_names = df.columns.tolist()
    feature_names.remove("date")
    df["date"] = pd.to_datetime(df["date"])

    # split by calendar month: 6 months train, 3 months val, 3 months test
    unique_months = df["date"].dt.to_period("M").unique()
    selected_as_train = unique_months[:6]  # use the first 6 months as the train set
    logger.info(f"months selected as train set are {selected_as_train}")
    selected_as_val = unique_months[6:9]  # use the following 3 months as the val set
    logger.info(f"months selected as val set are {selected_as_val}")
    selected_as_test = unique_months[9:]  # use the remaining 3 months as the test set
    logger.info(f"months selected as test set are {selected_as_test}")

    test_set = df[df["date"].dt.to_period("M").isin(selected_as_test)]
    val_set = df[df["date"].dt.to_period("M").isin(selected_as_val)]
    train_set = df[df["date"].dt.to_period("M").isin(selected_as_train)]

    # standardize features with statistics fitted on the train set only
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_set.loc[:, feature_names])
    val_X = scaler.transform(val_set.loc[:, feature_names])
    test_X = scaler.transform(test_set.loc[:, feature_names])

    # slice each split into samples of n_steps time steps
    train_X = sliding_window(train_X, n_steps)
    val_X = sliding_window(val_X, n_steps)
    test_X = sliding_window(test_X, n_steps)

    # assemble the final processed data into a dictionary
    processed_dataset = {
        # general info
        "n_steps": n_steps,
        "n_features": train_X.shape[-1],
        "scaler": scaler,
        # train set
        "train_X": train_X,
        # val set
        "val_X": val_X,
        # test set
        "test_X": test_X,
    }

    if rate > 0:
        # hold out the fully observed data as ground truth for evaluation
        train_X_ori = train_X
        val_X_ori = val_X
        test_X_ori = test_X

        # mask values in the train set to keep it consistent with the val and test sets below
        train_X = create_missingness(train_X, rate, pattern, **kwargs)
        # mask values in the validation set; the held-out values serve as validation ground truth
        val_X = create_missingness(val_X, rate, pattern, **kwargs)
        # mask values in the test set; the held-out values serve as test ground truth
        test_X = create_missingness(test_X, rate, pattern, **kwargs)

        processed_dataset["train_X"] = train_X
        processed_dataset["train_X_ori"] = train_X_ori

        processed_dataset["val_X"] = val_X
        processed_dataset["val_X_ori"] = val_X_ori

        processed_dataset["test_X"] = test_X
        processed_dataset["test_X_ori"] = test_X_ori
    else:
        logger.warning("rate is 0, no missing values are artificially added.")

    print_final_dataset_info(train_X, val_X, test_X)
    return processed_dataset