• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Project-OSmOSE / OSEkit / 19471577242

18 Nov 2025 03:30PM UTC coverage: 96.898% (+4.3%) from 92.572%
19471577242

Pull #281

github

web-flow
Merge 1b5cc878a into f45ff2fad
Pull Request #281: Job rework

567 of 572 new or added lines in 6 files covered. (99.13%)

22 existing lines in 3 files now uncovered.

3873 of 3997 relevant lines covered (96.9%)

0.97 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.13
/src/osekit/core_api/base_dataset.py
1
"""BaseDataset: Base class for the Dataset objects.
2

3
Datasets are collections of Data, with methods
4
that simplify repeated operations on the data.
5
"""
6

7
from __future__ import annotations
1✔
8

9
import os
1✔
10
from bisect import bisect
1✔
11
from pathlib import Path
1✔
12
from typing import TYPE_CHECKING, Generic, Literal, TypeVar
1✔
13

14
from pandas import Timedelta, Timestamp, date_range
1✔
15
from soundfile import LibsndfileError
1✔
16
from tqdm import tqdm
1✔
17

18
from osekit.config import TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED
1✔
19
from osekit.config import global_logging_context as glc
1✔
20
from osekit.core_api.base_data import BaseData
1✔
21
from osekit.core_api.base_file import BaseFile
1✔
22
from osekit.core_api.event import Event
1✔
23
from osekit.core_api.json_serializer import deserialize_json, serialize_json
1✔
24
from osekit.utils.timestamp_utils import last_window_end
1✔
25

26
if TYPE_CHECKING:
27
    import pytz
28

29
# Type variables binding a dataset to the Data/File subtypes it contains,
# so that derived datasets (e.g. audio datasets) keep precise typing.
TData = TypeVar("TData", bound=BaseData)  # Data type held in BaseDataset.data.
TFile = TypeVar("TFile", bound=BaseFile)  # File type referred to by the data.
31

32

33
class BaseDataset(Generic[TData, TFile], Event):
    """Base class for Dataset objects.

    Datasets are collections of Data, with methods
    that simplify repeated operations on the data.
    """

    def __init__(
        self,
        data: list[TData],
        name: str | None = None,
        suffix: str = "",
        folder: Path | None = None,
    ) -> None:
        """Instantiate a Dataset object from the Data objects.

        Parameters
        ----------
        data: list[TData]
            Data objects that compose the dataset.
        name: str | None
            Name of the dataset.
            If None, a default name derived from the dataset begin timestamp
            is used (see the base_name property).
        suffix: str
            Suffix appended to the dataset name (see the suffix property).
        folder: Path | None
            Folder in which the dataset files are located or to be written.
            If None, it is inferred from the dataset files.

        """
        self.data = data
        self._name = name
        # Remember whether an explicit name was given at instantiation time.
        self._has_default_name = name is None
        self._suffix = suffix
        self._folder = folder

    def __str__(self) -> str:
        """Overwrite __str__."""
        return self.name

    def __eq__(self, other: object) -> bool:
        """Overwrite __eq__.

        Two datasets are equal if they contain equal data, regardless of order.
        """
        if not isinstance(other, BaseDataset):
            # Delegate to the other operand's __eq__ instead of raising
            # AttributeError on other.data.
            return NotImplemented
        return sorted(self.data, key=lambda e: (e.begin, e.end)) == sorted(
            other.data,
            key=lambda e: (e.begin, e.end),
        )

    @property
    def base_name(self) -> str:
        """Name of the dataset without suffix.

        Defaults to the formatted begin timestamp if no name was provided.
        """
        return (
            self.begin.strftime(TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED)
            if self._name is None
            else self._name
        )

    @base_name.setter
    def base_name(self, name: str) -> None:
        self._name = name

    @property
    def name(self) -> str:
        """Name of the dataset with suffix."""
        return self.base_name if not self.suffix else f"{self.base_name}_{self.suffix}"

    @name.setter
    def name(self, name: str | None) -> None:
        # NOTE(review): this does not update _has_default_name — presumably
        # intentional (the flag reflects the name given at creation); confirm.
        self._name = name

    @property
    def suffix(self) -> str:
        """Suffix that is applied to the name of the dataset.

        This is used by the public API, for suffixing multiple core_api datasets
        that are created simultaneously and share the same name with their
        specific type, e.g. _audio or _spectro.
        """
        return self._suffix

    @suffix.setter
    def suffix(self, suffix: str | None) -> None:
        self._suffix = suffix

    @property
    def has_default_name(self) -> bool:
        """Return True if the dataset has a default name, False if it has a given name."""
        return self._has_default_name

    @property
    def begin(self) -> Timestamp:
        """Begin of the first data object."""
        return min(data.begin for data in self.data)

    @property
    def end(self) -> Timestamp:
        """End of the last data object."""
        return max(data.end for data in self.data)

    @property
    def files(self) -> set[TFile]:
        """All files referred to by the Dataset."""
        return {file for data in self.data for file in data.files}

    @property
    def folder(self) -> Path:
        """Folder in which the dataset files are located or to be written.

        Falls back to the parent folder of one of the dataset files
        (None if the dataset has no file).
        """
        return (
            self._folder
            if self._folder is not None
            else next(iter(file.path.parent for file in self.files), None)
        )

    @folder.setter
    def folder(self, folder: Path) -> None:
        """Set the folder in which the dataset files might be written.

        Parameters
        ----------
        folder: Path
            The folder in which the dataset files might be written.

        """
        self._folder = folder

    def move_files(self, folder: Path) -> None:
        """Move the dataset files to the destination folder.

        Parameters
        ----------
        folder: Path
            Destination folder in which the dataset files will be moved.

        """
        for file in tqdm(self.files, disable=os.environ.get("DISABLE_TQDM", "")):
            file.move(folder)
        self._folder = folder

    @property
    def data_duration(self) -> Timedelta:
        """Return the most frequent duration among durations of the data of this dataset, rounded to the nearest second."""
        data_durations = [
            Timedelta(data.duration).round(freq="1s") for data in self.data
        ]
        return max(set(data_durations), key=data_durations.count)

    def write(
        self,
        folder: Path,
        link: bool = False,
        first: int = 0,
        last: int | None = None,
    ) -> None:
        """Write all data objects in the specified folder.

        Parameters
        ----------
        folder: Path
            Folder in which to write the data.
        link: bool
            If True, the Data will be bound to the written file.
            Its items will be replaced with a single item, which will match the whole
            new File.
        first: int
            Index of the first data object to write.
        last: int | None
            Index after the last data object to write.

        """
        last = len(self.data) if last is None else last
        for data in tqdm(
            self.data[first:last],
            disable=os.environ.get("DISABLE_TQDM", ""),
        ):
            data.write(folder=folder, link=link)

    def to_dict(self) -> dict:
        """Serialize a BaseDataset to a dictionary.

        Returns
        -------
        dict:
            The serialized dictionary representing the BaseDataset.

        """
        return {
            "data": {str(d): d.to_dict() for d in self.data},
            "name": self._name,
            "suffix": self.suffix,
            "folder": str(self.folder),
        }

    @classmethod
    def from_dict(cls, dictionary: dict) -> BaseDataset:
        """Deserialize a BaseDataset from a dictionary.

        Parameters
        ----------
        dictionary: dict
            The serialized dictionary representing the BaseDataset.

        Returns
        -------
        BaseDataset
            The deserialized BaseDataset.

        """
        return cls(
            [BaseData.from_dict(d) for d in dictionary["data"].values()],
            name=dictionary["name"],
            suffix=dictionary["suffix"],
            folder=Path(dictionary["folder"]),
        )

    def write_json(self, folder: Path) -> None:
        """Write a serialized BaseDataset to a JSON file."""
        serialize_json(folder / f"{self.name}.json", self.to_dict())

    @classmethod
    def from_json(cls, file: Path) -> BaseDataset:
        """Deserialize a BaseDataset from a JSON file.

        Parameters
        ----------
        file: Path
            Path to the serialized JSON file representing the BaseDataset.

        Returns
        -------
        BaseDataset
            The deserialized BaseDataset.

        """
        return cls.from_dict(deserialize_json(file))

    @classmethod
    def from_files(  # noqa: PLR0913
        cls,
        files: list[TFile],
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
        data_duration: Timedelta | None = None,
        overlap: float = 0.0,
        name: str | None = None,
    ) -> BaseDataset:
        """Return a base BaseDataset object from a list of Files.

        Parameters
        ----------
        files: list[TFile]
            The list of files contained in the Dataset.
        begin: Timestamp | None
            Begin of the first data object.
            Defaulted to the begin of the first file.
        end: Timestamp | None
            End of the last data object.
            Defaulted to the end of the last file.
        mode: Literal["files", "timedelta_total", "timedelta_file"]
            Mode of creation of the dataset data from the original files.
            "files": one data will be created for each file.
            "timedelta_total": data objects of duration equal to data_duration will
            be created from the begin timestamp to the end timestamp.
            "timedelta_file": data objects of duration equal to data_duration will
            be created from the beginning of the first file that the begin timestamp is into, until it would resume
            in a data beginning between two files. Then, the next data object will be created from the
            beginning of the next original file and so on.
        data_duration: Timedelta | None
            Duration of the data objects.
            If mode is set to "files", this parameter has no effect.
            If provided, data will be evenly distributed between begin and end.
            Else, one data object will cover the whole time period.
        overlap: float
            Overlap percentage between consecutive data.
        name: str|None
            Name of the dataset.

        Returns
        -------
        BaseDataset[TItem, TFile]:
        The DataBase object.

        """
        if mode == "files":
            data_base = [BaseData.from_files([f]) for f in files]
            data_base = BaseData.remove_overlaps(data_base)
            return cls(data=data_base, name=name)

        if not begin:
            begin = min(file.begin for file in files)
        if not end:
            end = max(file.end for file in files)
        if data_duration:
            data_base = (
                cls._get_base_data_from_files_timedelta_total(
                    begin=begin,
                    end=end,
                    data_duration=data_duration,
                    files=files,
                    overlap=overlap,
                )
                if mode == "timedelta_total"
                else cls._get_base_data_from_files_timedelta_file(
                    begin=begin,
                    end=end,
                    data_duration=data_duration,
                    files=files,
                    overlap=overlap,
                )
            )
        else:
            # No duration given: one single data object spanning [begin, end].
            data_base = [BaseData.from_files(files, begin=begin, end=end)]
        return cls(data_base, name=name)

    @classmethod
    def _get_base_data_from_files_timedelta_total(
        cls,
        begin: Timestamp,
        end: Timestamp,
        data_duration: Timedelta,
        files: list[TFile],
        overlap: float = 0,
    ) -> list[BaseData]:
        """Create data objects of fixed duration evenly spread between begin and end.

        Raises
        ------
        ValueError
            If overlap is not within [0, 1).

        """
        if not 0 <= overlap < 1:
            msg = f"Overlap ({overlap}) must be between 0 and 1."
            raise ValueError(msg)

        active_file_index = 0
        output = []
        files = sorted(files, key=lambda f: f.begin)
        # Hop between consecutive data begins; equals data_duration when overlap is 0.
        freq = data_duration * (1 - overlap)

        for data_begin in tqdm(
            date_range(begin, end, freq=freq, inclusive="left"),
            disable=os.environ.get("DISABLE_TQDM", ""),
        ):
            data_end = Timestamp(data_begin + data_duration)
            # Advance past files that end before this data begins; since both
            # files and data begins are sorted, this sweep is O(n) overall.
            while (
                active_file_index < len(files)
                and files[active_file_index].end < data_begin
            ):
                active_file_index += 1
            last_active_file_index = active_file_index
            while (
                last_active_file_index < len(files)
                and files[last_active_file_index].begin < data_end
            ):
                last_active_file_index += 1
            output.append(
                BaseData.from_files(
                    files[active_file_index:last_active_file_index],
                    data_begin,
                    data_end,
                ),
            )

        return output

    @classmethod
    def _get_base_data_from_files_timedelta_file(
        cls,
        begin: Timestamp,
        end: Timestamp,
        data_duration: Timedelta,
        files: list[TFile],
        overlap: float = 0,
    ) -> list[BaseData]:
        """Create data objects of fixed duration aligned on the file beginnings.

        Raises
        ------
        ValueError
            If overlap is not within [0, 1).

        """
        if not 0 <= overlap < 1:
            msg = f"Overlap ({overlap}) must be between 0 and 1."
            raise ValueError(msg)

        files = sorted(files, key=lambda file: file.begin)
        # First file that might contain begin, and first file past end.
        first = max(0, bisect(files, begin, key=lambda f: f.begin) - 1)
        last = bisect(files, end, key=lambda f: f.begin)

        data_hop = data_duration * (1 - overlap)

        output = []
        files_chunk = []
        # start=first keeps idx aligned with the full files list, so that
        # files[idx + 1:] below really starts right after the current file
        # (enumerate over the bare slice would restart at 0 and revisit
        # earlier files whenever first > 0).
        for idx, file in tqdm(
            enumerate(files[first:last], start=first),
            disable=os.environ.get("DISABLE_TQDM", ""),
        ):
            if file in files_chunk:
                # Already absorbed in the previous chunk.
                continue
            files_chunk = [file]

            # Grow the chunk while the next file begins before the end of the
            # last full window that fits in the current chunk.
            for next_file in files[idx + 1 :]:
                upper_data_limit = last_window_end(
                    begin=file.begin,
                    end=files_chunk[-1].end,
                    window_hop=data_hop,
                    window_duration=data_duration,
                )
                if upper_data_limit < next_file.begin:
                    break
                files_chunk.append(next_file)

            # NOTE(review): the full files list is passed here; BaseData.from_files
            # presumably trims to [data_begin, data_end] — confirm files_chunk
            # would not suffice.
            output.extend(
                BaseData.from_files(files, data_begin, data_begin + data_duration)
                for data_begin in date_range(
                    file.begin,
                    files_chunk[-1].end,
                    freq=data_hop,
                    inclusive="left",
                )
            )

        return output

    @classmethod
    def from_folder(  # noqa: PLR0913
        cls,
        folder: Path,
        strptime_format: str,
        file_class: type[TFile] = BaseFile,
        supported_file_extensions: list[str] | None = None,
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        timezone: str | pytz.timezone | None = None,
        mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
        overlap: float = 0.0,
        data_duration: Timedelta | None = None,
        name: str | None = None,
    ) -> BaseDataset:
        """Return a BaseDataset from a folder containing the base files.

        Parameters
        ----------
        folder: Path
            The folder containing the files.
        strptime_format: str
            The strptime format of the timestamps in the file names.
        file_class: type[Tfile]
            Derived type of BaseFile used to instantiate the dataset.
        supported_file_extensions: list[str]
            List of supported file extensions for parsing TFiles.
        begin: Timestamp | None
            The begin of the dataset.
            Defaulted to the begin of the first file.
        end: Timestamp | None
            The end of the dataset.
            Defaulted to the end of the last file.
        timezone: str | pytz.timezone | None
            The timezone in which the file should be localized.
            If None, the file begin/end will be tz-naive.
            If different from a timezone parsed from the filename, the timestamps'
            timezone will be converted from the parsed timezone
            to the specified timezone.
        mode: Literal["files", "timedelta_total", "timedelta_file"]
            Mode of creation of the dataset data from the original files.
            "files": one data will be created for each file.
            "timedelta_total": data objects of duration equal to data_duration will
            be created from the begin timestamp to the end timestamp.
            "timedelta_file": data objects of duration equal to data_duration will
            be created from the beginning of the first file that the begin timestamp is into, until it would resume
            in a data beginning between two files. Then, the next data object will be created from the
            beginning of the next original file and so on.
        overlap: float
            Overlap percentage between consecutive data.
        data_duration: Timedelta | None
            Duration of the data objects.
            If mode is set to "files", this parameter has no effect.
            If provided, data will be evenly distributed between begin and end.
            Else, one object will cover the whole time period.
        name: str|None
            Name of the dataset.

        Returns
        -------
        BaseDataset:
            The base dataset.

        Raises
        ------
        FileNotFoundError
            If no file in the folder could be parsed into a valid TFile.

        """
        if supported_file_extensions is None:
            supported_file_extensions = []
        valid_files = []
        rejected_files = []
        for file in tqdm(folder.iterdir(), disable=os.environ.get("DISABLE_TQDM", "")):
            if file.suffix.lower() not in supported_file_extensions:
                continue
            try:
                f = file_class(file, strptime_format=strptime_format, timezone=timezone)
                valid_files.append(f)
            except (ValueError, LibsndfileError):
                # Unparsable names or unreadable audio: collected and reported below.
                rejected_files.append(file)

        if rejected_files:
            rejected_names = "\n\t".join(f.name for f in rejected_files)
            glc.logger.warning(
                f"The following files couldn't be parsed:\n\t{rejected_names}",
            )

        if not valid_files:
            raise FileNotFoundError(f"No valid file found in {folder}.")

        # Use cls rather than BaseDataset so that subclasses build instances
        # of their own type.
        return cls.from_files(
            files=valid_files,
            begin=begin,
            end=end,
            mode=mode,
            overlap=overlap,
            data_duration=data_duration,
            name=name,
        )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc