• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

rafaelpadilla / 3W / 24912462866

24 Apr 2026 09:21PM UTC coverage: 76.362% (-3.1%) from 79.464%
24912462866

push

github

web-flow
Merge pull request #73 from rafaelpadilla/eduardo/refactor_data_operations

Refactor of data operations, trainers and models.

244 of 339 branches covered (71.98%)

Branch coverage included in aggregate %.

1317 of 1706 new or added lines in 50 files covered. (77.2%)

28 existing lines in 5 files now uncovered.

2124 of 2762 relevant lines covered (76.9%)

0.77 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.44
/toolkit/ThreeWToolkit/preprocessing/clean_signals.py
1
from pydantic import Field, PrivateAttr
1✔
2

3
import numpy as np
1✔
4
import pandas as pd
1✔
5

6
from ..core.base_dataset import BaseDataset
1✔
7
from ..core.dataset_outputs import DatasetOutputs
1✔
8
from ..core.base_preprocessing import (
1✔
9
    BasePreprocessing,
10
    BasePreprocessingConfig,
11
)
12

13
from ..dataset.transformed_dataset import TransformedDataset
1✔
14

15
_3W_CATEGORICAL_FEATURES = (
1✔
16
    [  # List of categorical features to exclude from cleaning by default
17
        "ESTADO-DHSV",
18
        "ESTADO-M1",
19
        "ESTADO-M2",
20
        "ESTADO-PXO",
21
        "ESTADO-SDV-GL",
22
        "ESTADO-SDV-P",
23
        "ESTADO-W1",
24
        "ESTADO-W2",
25
        "ESTADO-XO",
26
        "state",
27
    ]
28
)
29

30

31
class CleanSignalsConfig(BasePreprocessingConfig):
    """Settings for the ``CleanSignals`` preprocessing step.

    The IQR thresholds are multipliers ``k`` applied to the inter-quartile
    range of per-event statistics, giving an accepted interval of
    ``[Q1 - k * IQR, Q3 + k * IQR]``. Signals whose per-event mean or standard
    deviation fall outside that interval are replaced by NaN. Columns that are
    entirely NaN in too many events are then dropped.
    """

    # Multiplier applied to the IQR of the per-event means.
    average_iqr_threshold: float = Field(
        description="IQR threshold for average values. Signals below this may be considered frozen.",
        default=3.0,
        gt=0.0,
    )

    # Multiplier applied to the IQR of the per-event standard deviations.
    std_iqr_threshold: float = Field(
        description="IQR threshold for standard deviation. Signals below this may be considered frozen.",
        default=3.0,
        gt=0.0,
    )

    # Floor for the lower standard-deviation bound; ``None`` leaves the
    # IQR-derived lower bound as it is.
    absolute_std_threshold: float | None = Field(
        description="Absolute standard deviation threshold for frozen detection. Set to None to disable.",
        default=1e-6,
        gt=0.0,
    )

    # NOTE(review): CleanSignals compares with ``>=``, so a column that is
    # all-NaN in exactly this fraction of events is dropped as well.
    missing_column_threshold: float = Field(
        description="Drop columns that are all-NaN in more than this fraction of events.",
        default=0.6,
    )

    # Column names that are never blanked or dropped by the cleaning step.
    exclude_features: list[str] = Field(
        description="Feature names to exclude from cleaning. Categorical features left unchanged.",
        default=_3W_CATEGORICAL_FEATURES,
    )

    # Resolved lazily through a factory because CleanSignals is defined
    # further down in this module.
    _target: type = PrivateAttr(default_factory=lambda: CleanSignals)
1✔
63

64

65
class CleanSignals(BasePreprocessing):
    """Preprocessing step that blanks out-of-range signals and drops mostly-empty columns.

    Fitting happens in two stages:

    1. Per-event means and standard deviations are collected for every signal
       and turned into IQR-based lower/upper bounds.
    2. The bounds are applied to the dataset and every column that is all-NaN
       in at least ``config.missing_column_threshold`` of the events is added
       to ``drop_list``.

    ``transform`` then replaces out-of-range signals by NaN and removes the
    columns in ``drop_list``. Columns named in ``config.exclude_features`` are
    never blanked nor dropped.
    """

    def __init__(self, config: CleanSignalsConfig):
        """Store the configuration and reset the fitted state.

        Args:
            config: Thresholds and excluded feature names used for cleaning.
        """
        # NOTE(review): BasePreprocessing.__init__ is not invoked here; confirm
        # that the base class needs no initialisation.
        self.config: CleanSignalsConfig = config

        # (lower, upper) bounds indexed by signal name; None until fitted.
        self.average_bounds: tuple[pd.Series, pd.Series] | None = None
        self.std_bounds: tuple[pd.Series, pd.Series] | None = None

        # Columns to remove in transform; stays empty until fit fills it.
        self.drop_list: list[str] = []

    def fit(self, data: BaseDataset) -> None:
        """Compute the IQR bounds and the list of columns to drop.

        Args:
            data: Dataset to fit on. Iterating it must yield events exposing a
                ``signal`` DataFrame.

        Raises:
            ValueError: If the dataset yields no events.
        """
        # Stage 1: distribution of per-event means and stds across the dataset.
        self._fit_iqr_thresholds(data)

        # Stage 2: the missing-column statistics are measured on the data as it
        # looks after the IQR cleaning, so the bounds are applied first.
        cleaned_data = TransformedDataset(data, self._filter_iqr_bounds)
        self._fit_missing_thresholds(cleaned_data)

    @staticmethod
    def _compute_iqr_bounds(
        values: pd.DataFrame, factor: float
    ) -> tuple[pd.Series, pd.Series]:
        """Return ``(Q1 - factor * IQR, Q3 + factor * IQR)`` for every column of ``values``."""
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        iqr = third_quartile - first_quartile
        return (first_quartile - factor * iqr, third_quartile + factor * iqr)

    @staticmethod
    def _align_bound(bound: pd.Series, index: pd.Index) -> pd.Series:
        """Return ``bound`` labelled like ``index``.

        Comparison operators between two Series raise when their labels
        differ. Labels absent from the fitted bound become NaN, which compares
        as False and therefore never flags the signal.
        """
        if bound.index.equals(index):
            return bound
        return bound.reindex(index)

    def _fit_iqr_thresholds(self, data: BaseDataset) -> None:
        """Compute the IQR-based bounds for the per-event mean and std of each signal.

        Args:
            data: Dataset whose events provide the statistics.

        Raises:
            ValueError: If the dataset yields no events.
        """
        event_averages = []
        event_stds = []
        # Single pass: both statistics are taken from the same loaded event.
        for event in data:
            event_averages.append(event.signal.mean())
            event_stds.append(event.signal.std())

        if not event_averages:
            # pd.concat would fail with an opaque "No objects to concatenate".
            raise ValueError("Cannot fit CleanSignals on an empty dataset.")

        # One row per event, one column per signal.
        averages = pd.concat(event_averages, axis=1).transpose()
        stds = pd.concat(event_stds, axis=1).transpose()

        self.average_bounds = self._compute_iqr_bounds(
            averages, self.config.average_iqr_threshold
        )

        lower_std_bound, upper_std_bound = self._compute_iqr_bounds(
            stds, self.config.std_iqr_threshold
        )
        # The absolute threshold acts as a floor for the lower std bound.
        if self.config.absolute_std_threshold is not None:
            lower_std_bound = lower_std_bound.clip(
                lower=self.config.absolute_std_threshold
            )
        self.std_bounds = (lower_std_bound, upper_std_bound)

    def _fit_missing_thresholds(self, data: BaseDataset) -> None:
        """Build ``drop_list`` from the fraction of events in which a column is all-NaN.

        A column is listed when that fraction is greater than or equal to
        ``config.missing_column_threshold``.

        Args:
            data: Dataset to measure, normally the IQR-cleaned view of the
                training data.

        Raises:
            ValueError: If the dataset yields no events.
        """
        all_nan_flags = [event.signal.isna().all() for event in data]
        if not all_nan_flags:
            raise ValueError("Cannot fit CleanSignals on an empty dataset.")

        # One row per event, one column per signal.
        all_nans = pd.concat(all_nan_flags, axis=1).transpose()

        all_nan_percentage = all_nans.mean()
        drop_cols = all_nan_percentage >= self.config.missing_column_threshold
        self.drop_list = drop_cols[drop_cols].index.tolist()

    def _filter_iqr_bounds(self, data: DatasetOutputs) -> DatasetOutputs:
        """Replace signals whose mean or std is outside the fitted bounds by NaN.

        Args:
            data: Event to filter.

        Returns:
            A new DatasetOutputs whose out-of-range signal columns are NaN;
            label and metadata are passed through.

        Raises:
            ValueError: If the IQR bounds have not been fitted yet.
        """
        if self.average_bounds is None or self.std_bounds is None:
            raise ValueError(
                "The CleanSignals feature extractor must be fitted before calling transform."
            )

        signal = data.signal

        signal_average = signal.mean()
        signal_std = signal.std()

        # Align the fitted bounds with this event's columns so that events with
        # a different column order or column set can still be compared.
        average_low = self._align_bound(self.average_bounds[0], signal_average.index)
        average_high = self._align_bound(self.average_bounds[1], signal_average.index)
        std_low = self._align_bound(self.std_bounds[0], signal_std.index)
        std_high = self._align_bound(self.std_bounds[1], signal_std.index)

        # Identify signals that are outside the IQR-based bounds.
        drop_average = (signal_average < average_low) | (
            signal_average > average_high
        )
        drop_std = (signal_std < std_low) | (signal_std > std_high)
        drop = drop_average | drop_std

        # Excluded features are never blanked.
        excluded = set(self.config.exclude_features)
        removed_columns = [col for col in drop[drop].index if col not in excluded]

        # Replace out-of-range signals with NaN values.
        signal = signal.assign(**{col: np.nan for col in removed_columns})

        return DatasetOutputs(signal=signal, label=data.label, metadata=data.metadata)

    def _filter_missing_cols(self, data: DatasetOutputs) -> DatasetOutputs:
        """Drop the columns listed in ``drop_list`` from the event's signal.

        Columns named in ``config.exclude_features`` are kept, and listed
        columns that the event does not contain are ignored.

        Args:
            data: Event to filter.

        Returns:
            A new DatasetOutputs without the dropped columns; label and
            metadata are passed through.

        Raises:
            RuntimeError: If ``drop_list`` is None.
        """
        # Defensive only: __init__ sets drop_list to an empty list, so this
        # triggers only if the attribute was reset to None from outside.
        if self.drop_list is None:
            raise RuntimeError(
                "The CleanSignals feature extractor must be fitted before calling transform."
            )

        excluded = set(self.config.exclude_features)
        removed_columns = [col for col in self.drop_list if col not in excluded]
        # errors="ignore": an event may lack a column that was seen during fit.
        signal = data.signal.drop(columns=removed_columns, errors="ignore")

        return DatasetOutputs(signal=signal, label=data.label, metadata=data.metadata)

    def transform(self, data: DatasetOutputs) -> DatasetOutputs:
        """Apply the fitted cleaning steps to one event.

        Args:
            data: Event to clean.

        Returns:
            DatasetOutputs with out-of-range signals set to NaN and the
            mostly-empty columns removed.

        Raises:
            ValueError: If the extractor has not been fitted.
        """
        data = self._filter_iqr_bounds(data)
        data = self._filter_missing_cols(data)
        return data
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc