tonegas / nnodely / build 14319828903

07 Apr 2025 09:27PM UTC coverage: 97.259% (+0.2%) from 97.035%

Pull Request #86: Smallclasses (github / web-flow)
Merge 44b7c25ee into e9c323c4f

2275 of 2409 new or added lines in 54 files covered (94.44%).
1 existing line in 1 file is now uncovered.
11637 of 11965 relevant lines covered (97.26%).
0.97 hits per line.

Source File: /nnodely/operators/loader.py (73.37% covered)

Uncovered lines in this file: the filterData body, the tuple handling in the format parser, the directory-walk and CSV-read error handlers, the ndim > 3 squeeze branches, and the missing-column skip in the DataFrame path.
import os, random

import pandas as pd
import numpy as np
import pandas.api.types as ptypes
from collections.abc import Sequence, Callable

from nnodely.support.utils import check, log, enforce_types

class Loader:
    def __init__(self):
        check(type(self) is not Loader, TypeError, "Loader class cannot be instantiated directly")

        # Dataset parameters
        self.__n_datasets = 0
        self.__datasets_loaded = set()

        self._data_loaded = False
        self._file_count = 0
        self._num_of_samples = {}
        self._data = {}
        self._multifile = {}

    @enforce_types
    def getSamples(self, dataset:str, index:int|None = None, window:int=1) -> dict:
        """
        Retrieves a window of samples from a given dataset.

        Parameters
        ----------
        dataset : str
            The name of the dataset to retrieve samples from.
        index : int, optional
            The starting index of the samples. If None, a random index is chosen. Default is None.
        window : int, optional
            The number of consecutive samples to retrieve. Default is 1.

        Returns
        -------
        dict
            A dictionary containing the retrieved samples. The keys are input and state names, and the values are lists of samples.

        Raises
        ------
        ValueError
            If the dataset is not loaded.

        Examples
        --------
        .. image:: https://colab.research.google.com/assets/colab-badge.svg
            :target: https://colab.research.google.com/github/tonegas/nnodely/blob/main/examples/dataset.ipynb
            :alt: Open in Colab

        Example usage:
            >>> model = Modely()
            >>> model.loadData('dataset_name', 'path/to/data')
            >>> samples = model.getSamples('dataset_name', index=10, window=5)
        """
        if index is None:
            index = random.randint(0, self._num_of_samples[dataset] - window)
        check(self._data_loaded, ValueError, 'The Dataset must first be loaded using <loadData> function!')
        if self._data_loaded:
            result_dict = {}
            for key in (self._model_def['Inputs'].keys() | self._model_def['States'].keys()):
                result_dict[key] = []
            for idx in range(window):
                for key, samples in self._data[dataset].items():
                    if key in (self._model_def['Inputs'].keys() | self._model_def['States'].keys()):
                        result_dict[key].append(samples[index+idx])
            return result_dict

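    # A minimal usage sketch for getSamples, assuming a model with inputs 'x' and 'y'
    # that has already been neuralized and a dataset at 'path/to/data' (setup elided;
    # names and paths are illustrative):
    # >>> model.loadData(name='dataset_name', source='path/to/data', format=['x', 'y'])
    # >>> samples = model.getSamples('dataset_name', index=10, window=2)
    # >>> sorted(samples.keys())   # one key per model input/state, e.g. ['x', 'y']
    # >>> len(samples['x'])        # == window, here 2
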
    @enforce_types
    def filterData(self, filter_function:Callable, dataset_name:str|None = None) -> None:
        """
        Filters the data in the dataset using the provided filter function.

        Parameters
        ----------
        filter_function : Callable
            A function that takes a sample as input and returns True if the sample should be kept, and False if it should be removed.
        dataset_name : str or None, optional
            The name of the dataset to filter. If None, all datasets are filtered. Default is None.

        Examples
        --------
        .. image:: https://colab.research.google.com/assets/colab-badge.svg
            :target: https://colab.research.google.com/github/tonegas/nnodely/blob/main/examples/dataset.ipynb
            :alt: Open in Colab

        Example usage:
            >>> model = Modely()
            >>> model.loadData('dataset_name', 'path/to/data')
            >>> def filter_fn(sample):
            >>>     return sample['input1'] > 0
            >>> model.filterData(filter_fn, 'dataset_name')
        """
        idx_to_remove = []
        if dataset_name is None:
            for name in self._data.keys():
                dataset = self._data[name]
                n_samples = len(dataset[list(dataset.keys())[0]])

                data_for_filter = []
                for i in range(n_samples):
                    new_sample = {key: val[i] for key, val in dataset.items()}
                    data_for_filter.append(new_sample)

                for idx, sample in enumerate(data_for_filter):
                    if not filter_function(sample):
                        idx_to_remove.append(idx)

                for key in self._data[name].keys():
                    self._data[name][key] = np.delete(self._data[name][key], idx_to_remove, axis=0)
                    self._num_of_samples[name] = self._data[name][key].shape[0]
                self.visualizer.showDataset(name=name)

        else:
            dataset = self._data[dataset_name]
            n_samples = len(dataset[list(dataset.keys())[0]])

            data_for_filter = []
            for i in range(n_samples):
                new_sample = {key: val[i] for key, val in dataset.items()}
                data_for_filter.append(new_sample)

            for idx, sample in enumerate(data_for_filter):
                if not filter_function(sample):
                    idx_to_remove.append(idx)

            for key in self._data[dataset_name].keys():
                self._data[dataset_name][key] = np.delete(self._data[dataset_name][key], idx_to_remove, axis=0)
                self._num_of_samples[dataset_name] = self._data[dataset_name][key].shape[0]
            self.visualizer.showDataset(name=dataset_name)

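    # A minimal standalone sketch of the row-filtering mechanic used above, on a
    # plain dict of numpy arrays (names, shapes and values are illustrative):
    # >>> data = {'x': np.arange(6).reshape(3, 2, 1), 'y': np.array([1.0, -1.0, 2.0])}
    # >>> keep = lambda sample: sample['y'] > 0
    # >>> idx_to_remove = [i for i in range(len(data['y']))
    # ...                  if not keep({k: v[i] for k, v in data.items()})]
    # >>> data = {k: np.delete(v, idx_to_remove, axis=0) for k, v in data.items()}
    # >>> data['y']   # array([1., 2.]): the sample failing the predicate is removed
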
    @enforce_types
    def loadData(self, name:str,
                 source: str | dict | pd.DataFrame,
                 format: list | None = None,
                 skiplines: int = 0,
                 delimiter: str = ',',
                 header: int | str | Sequence | None = None,
                 resampling: bool = False
                 ) -> None:
        """
        Loads data into the model. The data can be loaded from a directory path containing the csv files, from a dictionary of arrays, or from a pandas DataFrame.

        Parameters
        ----------
        name : str
            The name of the dataset.
        source : str or dict or pd.DataFrame
            The source of the data. Can be a directory path containing the csv files, a dictionary of custom data, or a pandas DataFrame.
        format : list or None, optional
            The format of the data. When loading csv files the format parameter defines how to read each column of the file. Default is None.
        skiplines : int, optional
            The number of lines to skip at the beginning of the file. Default is 0.
        delimiter : str, optional
            The delimiter used in the data files. Default is ','.
        header : int, str, Sequence or None, optional
            The header of the data files, forwarded to pandas.read_csv. Default is None.
        resampling : bool, optional
            If True and the source is a pandas DataFrame, the data is resampled at the model sample time using the DatetimeIndex or a 'time' column. Default is False.

        Raises
        ------
        ValueError
            If the network is not neuralized.
            If the delimiter is not valid.
        TypeError
            If resampling is requested but the DataFrame has no DatetimeIndex or 'time' column.

        Examples
        --------
        .. image:: https://colab.research.google.com/assets/colab-badge.svg
            :target: https://colab.research.google.com/github/tonegas/nnodely/blob/main/examples/dataset.ipynb
            :alt: Open in Colab

        Example - load data from files:
            >>> x = Input('x')
            >>> y = Input('y')
            >>> out = Output('out',Fir(x.tw(0.05)))
            >>> test = Modely(visualizer=None)
            >>> test.addModel('example_model', out)
            >>> test.neuralizeModel(0.01)
            >>> data_struct = ['x', '', 'y']
            >>> test.loadData(name='example_dataset', source='path/to/data', format=data_struct)

        Example - load data from a crafted dataset:
            >>> x = Input('x')
            >>> y = Input('y')
            >>> out = Output('out',Fir(x.tw(0.05)))
            >>> test = Modely(visualizer=None)
            >>> test.addModel('example_model', out)
            >>> test.neuralizeModel(0.01)
            >>> data_x = np.array(range(10))
            >>> dataset = {'x': data_x, 'y': (2*data_x)}
            >>> test.loadData(name='example_dataset',source=dataset)
        """
        check(self.neuralized, ValueError, "The network is not neuralized.")
        check(delimiter in ['\t', '\n', ';', ',', ' '], ValueError, 'delimiter not valid!')

        json_inputs = self._model_def['Inputs'] | self._model_def['States']
        model_inputs = list(json_inputs.keys())
        ## Initialize the dictionary containing the data
        if name in list(self._data.keys()):
            log.warning(f'Dataset named {name} already loaded! Overriding the existing one.')
        self._data[name] = {}

        num_of_samples = {}
        if type(source) is str:  ## we have a directory path containing the files
            ## collect column indexes
            format_idx = {}
            idx = 0
            for item in format:
                if isinstance(item, tuple):
                    for key in item:
                        if key not in model_inputs:
                            idx += 1
                            break
                        n_cols = json_inputs[key]['dim']
                        format_idx[key] = (idx, idx + n_cols)
                    idx += n_cols
                else:
                    if item not in model_inputs:
                        idx += 1
                        continue
                    n_cols = json_inputs[item]['dim']
                    format_idx[item] = (idx, idx + n_cols)
                    idx += n_cols

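            # A small worked example of the mapping built above, assuming two model
            # inputs 'x' (dim 1) and 'y' (dim 2) and a skipped column marked '':
            # format ['x', '', 'y'] maps column 0 to 'x', skips column 1, and maps
            # columns 2:4 to 'y', i.e. format_idx == {'x': (0, 1), 'y': (2, 4)}.
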
            ## Initialize each input key
            for key in format_idx.keys():
                self._data[name][key] = []

            ## Obtain the file names
            try:
                _, _, files = next(os.walk(source))
                files.sort()
            except StopIteration as e:
                check(False, StopIteration, f'ERROR: The path "{source}" does not exist!')
                return
            self._file_count = len(files)
            if self._file_count > 1:  ## Multifile
                self._multifile[name] = []

            ## Cycle through all the files
            for file in files:
                try:
                    ## read the csv
                    df = pd.read_csv(os.path.join(source, file), skiprows=skiplines, delimiter=delimiter, header=header)
                except:
                    log.warning(f'Cannot read file {os.path.join(source, file)}')
                    continue
                if self._file_count > 1:
                    self._multifile[name].append(
                        (self._multifile[name][-1] + (len(df) - self._max_n_samples + 1)) if self._multifile[name] else len(
                            df) - self._max_n_samples + 1)
                ## Cycle through all the windows
                for key, idxs in format_idx.items():
                    back, forw = self._input_ns_backward[key], self._input_ns_forward[key]
                    ## Save the data as a numpy array
                    data = df.iloc[:, idxs[0]:idxs[1]].to_numpy()
                    self._data[name][key] += [data[i - back:i + forw] for i in
                                              range(self._max_samples_backward, len(df) - self._max_samples_forward + 1)]

            ## Stack the files
            for key in format_idx.keys():
                self._data[name][key] = np.stack(self._data[name][key])
                num_of_samples[key] = self._data[name][key].shape[0]

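            # A sketch of the per-key windowing above, under illustrative values
            # (back=2, forw=1, _max_samples_backward=2, _max_samples_forward=1,
            # one column, a 5-row file):
            # >>> data = np.arange(5).reshape(5, 1)
            # >>> windows = [data[i - 2:i + 1] for i in range(2, 5 - 1 + 1)]
            # >>> len(windows), windows[0].ravel().tolist()
            # (3, [0, 1, 2])
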
        elif type(source) is dict:  ## we have a crafted dataset
            self._file_count = 1

            ## Check if the inputs are correct
            # assert set(model_inputs).issubset(source.keys()), f'The dataset is missing some inputs. Inputs needed for the model: {model_inputs}'

            # Merge a list of inputs into a single dictionary
            for key in model_inputs:
                if key not in source.keys():
                    continue

                self._data[name][key] = []  ## Initialize the dataset

                back, forw = self._input_ns_backward[key], self._input_ns_forward[key]
                for idx in range(len(source[key]) - self._max_n_samples + 1):
                    self._data[name][key].append(
                        source[key][idx + (self._max_samples_backward - back):idx + (self._max_samples_backward + forw)])

            ## Stack the samples
            for key in model_inputs:
                if key not in source.keys():
                    continue
                self._data[name][key] = np.stack(self._data[name][key])
                if self._data[name][key].ndim == 2:  ## Add the sample dimension
                    self._data[name][key] = np.expand_dims(self._data[name][key], axis=-1)
                if self._data[name][key].ndim > 3:
                    self._data[name][key] = np.squeeze(self._data[name][key], axis=1)
                num_of_samples[key] = self._data[name][key].shape[0]

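            # A sketch of the crafted-dataset windowing above, under hypothetical
            # values (_max_n_samples=3, _max_samples_backward=2, back=2, forw=1):
            # >>> data_x = np.array(range(10))
            # >>> windows = [data_x[idx + 0:idx + 3] for idx in range(10 - 3 + 1)]
            # >>> np.stack(windows).shape   # (8, 3); expand_dims then yields (8, 3, 1)
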
        elif isinstance(source, pd.DataFrame):  ## we have a crafted dataset
            self._file_count = 1

            ## Resampling if the time column is provided (must be a Datetime object)
            if resampling:
                if type(source.index) is pd.DatetimeIndex:
                    source = source.resample(f"{int(self._model_def.getSampleTime() * 1e9)}ns").interpolate(method="linear")
                elif 'time' in source.columns:
                    if not ptypes.is_datetime64_any_dtype(source['time']):
                        source['time'] = pd.to_datetime(source['time'], unit='s')
                    source = source.set_index('time', drop=True)
                    source = source.resample(f"{int(self._model_def.getSampleTime() * 1e9)}ns").interpolate(method="linear")
                else:
                    raise TypeError(
                        "No time column found in the DataFrame. Please provide a time column for resampling.")

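            # A sketch of the resampling above, assuming a 0.01 s sample time and a
            # numeric 'time' column expressed in seconds (values are illustrative):
            # >>> df = pd.DataFrame({'time': [0.0, 0.02, 0.04], 'x': [0.0, 2.0, 4.0]})
            # >>> df['time'] = pd.to_datetime(df['time'], unit='s')
            # >>> df = df.set_index('time', drop=True)
            # >>> df = df.resample(f"{int(0.01 * 1e9)}ns").interpolate(method="linear")
            # >>> df['x'].tolist()   # [0.0, 1.0, 2.0, 3.0, 4.0]
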
            processed_data = {}
            for key in model_inputs:
                if key not in source.columns:
                    continue

                processed_data[key] = []  ## Initialize the dataset
                back, forw = self._input_ns_backward[key], self._input_ns_forward[key]

                for idx in range(len(source) - self._max_n_samples + 1):
                    window = source[key].iloc[idx + (self._max_samples_backward - back):idx + (self._max_samples_backward + forw)]
                    processed_data[key].append(window.to_numpy())

            ## Convert lists to numpy arrays
            for key in processed_data:
                processed_data[key] = np.stack(processed_data[key])
                if json_inputs[key]['dim'] > 1:
                    processed_data[key] = np.array(processed_data[key].tolist(), dtype=np.float64)
                if processed_data[key].ndim == 2:  ## Add the sample dimension
                    processed_data[key] = np.expand_dims(processed_data[key], axis=-1)
                if processed_data[key].ndim > 3:
                    processed_data[key] = np.squeeze(processed_data[key], axis=1)
                num_of_samples[key] = processed_data[key].shape[0]

            self._data[name] = processed_data

        # Check that all inputs have the same number of samples
        check(len(set(num_of_samples.values())) == 1, ValueError,
              f"The number of samples in dataset {name} is not the same for all inputs: {num_of_samples}")
        self._num_of_samples[name] = num_of_samples[list(num_of_samples.keys())[0]]

        ## Set the Loaded flag to True
        self._data_loaded = True
        ## Update the number of datasets loaded
        self.__n_datasets = len(self._data.keys())
        self.__datasets_loaded.add(name)
        ## Show the dataset
        self.visualizer.showDataset(name=name)