WenjieDu / BenchPOTS / 10832566580

12 Sep 2024 02:09PM UTC coverage: 77.075% (-0.4%) from 77.508%
Build 10832566580 (push via GitHub, committer: web-flow)

Merge pull request #19 from WenjieDu/dev

Add random walk dataset and release v0.3

76 of 102 new or added lines in 3 files covered. (74.51%)

2 existing lines in 2 files now uncovered.

585 of 759 relevant lines covered (77.08%)

3.85 hits per line

Source File

/benchpots/datasets/solar_alabama.py (16.33% of relevant lines covered in this build: the module docstring, imports, and function signature are hit; the function body is uncovered)
"""
Preprocessing function for the dataset Solar Alabama.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import pandas as pd
import tsdb
from sklearn.preprocessing import StandardScaler

from ..utils.logging import logger, print_final_dataset_info
from ..utils.missingness import create_missingness
from ..utils.sliding import sliding_window


def preprocess_solar_alabama(
    rate,
    n_steps,
    pattern: str = "point",
    **kwargs,
) -> dict:
    """Load and preprocess the dataset Solar Alabama.

    Parameters
    ----------
    rate:
        The missing rate.

    n_steps:
        The number of time steps in the generated data samples.
        Also the window size of the sliding window.

    pattern:
        The missing pattern to apply to the dataset.
        Must be one of ['point', 'subseq', 'block'].

    Returns
    -------
    processed_dataset :
        A dictionary containing the processed Solar Alabama dataset.
    """

    assert 0 <= rate < 1, f"rate must be in [0, 1), but got {rate}"
    assert n_steps > 0, f"n_steps must be larger than 0, but got {n_steps}"

    # read the raw data
    data = tsdb.load("solar_alabama")
    df = data["X"]

    feature_names = df.columns.tolist()
    feature_names.remove("date")
    df["date"] = pd.to_datetime(df["date"])

    # split by calendar month: 6 months train, 3 months val, 3 months test
    unique_months = df["date"].dt.to_period("M").unique()
    selected_as_train = unique_months[:6]  # use the first 6 months as the train set
    logger.info(f"months selected as train set are {selected_as_train}")
    selected_as_val = unique_months[6:9]  # use the following 3 months as the val set
    logger.info(f"months selected as val set are {selected_as_val}")
    selected_as_test = unique_months[9:]  # use the remaining 3 months as the test set
    logger.info(f"months selected as test set are {selected_as_test}")

    test_set = df[df["date"].dt.to_period("M").isin(selected_as_test)]
    val_set = df[df["date"].dt.to_period("M").isin(selected_as_val)]
    train_set = df[df["date"].dt.to_period("M").isin(selected_as_train)]

    # standardize features with statistics fitted on the train set only
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_set.loc[:, feature_names])
    val_X = scaler.transform(val_set.loc[:, feature_names])
    test_X = scaler.transform(test_set.loc[:, feature_names])

    # slice each split into samples of n_steps time steps
    train_X = sliding_window(train_X, n_steps)
    val_X = sliding_window(val_X, n_steps)
    test_X = sliding_window(test_X, n_steps)

    # assemble the final processed data into a dictionary
    processed_dataset = {
        # general info
        "n_steps": n_steps,
        "n_features": train_X.shape[-1],
        "scaler": scaler,
        # train set
        "train_X": train_X,
        # val set
        "val_X": val_X,
        # test set
        "test_X": test_X,
    }

    if rate > 0:
        # hold out the fully observed data as ground truth for evaluation
        train_X_ori = train_X
        val_X_ori = val_X
        test_X_ori = test_X

        # mask values in the train set to keep it consistent with the val and test sets below
        train_X = create_missingness(train_X, rate, pattern, **kwargs)
        # mask values in the validation set; the held-out values serve as validation ground truth
        val_X = create_missingness(val_X, rate, pattern, **kwargs)
        # mask values in the test set; the held-out values serve as test ground truth
        test_X = create_missingness(test_X, rate, pattern, **kwargs)

        processed_dataset["train_X"] = train_X
        processed_dataset["train_X_ori"] = train_X_ori

        processed_dataset["val_X"] = val_X
        processed_dataset["val_X_ori"] = val_X_ori

        processed_dataset["test_X"] = test_X
        processed_dataset["test_X_ori"] = test_X_ori
    else:
        logger.warning("rate is 0, no missing values are artificially added.")

    print_final_dataset_info(train_X, val_X, test_X)
    return processed_dataset