• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Project-OSmOSE / OSEkit / 21402000858

27 Jan 2026 02:56PM UTC coverage: 98.876% (+0.02%) from 98.861%
21402000858

Pull #330

github

web-flow
Merge 3fe889fc7 into 22c8efe15
Pull Request #330: [DRAFT] Data populated ratio

85 of 85 new or added lines in 6 files covered. (100.0%)

1 existing line in 1 file now uncovered.

4750 of 4804 relevant lines covered (98.88%)

0.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.46
/src/osekit/core_api/base_dataset.py
1
"""``BaseDataset``: Base class for the Dataset objects.
2

3
Datasets are collections of Data, with methods
4
that simplify repeated operations on the data.
5
"""
6

7
from __future__ import annotations
1✔
8

9
import os
1✔
10
from abc import ABC, abstractmethod
1✔
11
from bisect import bisect
1✔
12
from pathlib import Path
1✔
13
from typing import TYPE_CHECKING, Literal, Self, TypeVar
1✔
14

15
from pandas import Timedelta, Timestamp, date_range
1✔
16
from soundfile import LibsndfileError
1✔
17
from tqdm import tqdm
1✔
18

19
from osekit.config import TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED
1✔
20
from osekit.config import global_logging_context as glc
1✔
21
from osekit.core_api.base_data import BaseData
1✔
22
from osekit.core_api.base_file import BaseFile
1✔
23
from osekit.core_api.event import Event
1✔
24
from osekit.core_api.json_serializer import deserialize_json, serialize_json
1✔
25
from osekit.utils.timestamp_utils import last_window_end
1✔
26

27
if TYPE_CHECKING:
28
    import pytz
29

30
TData = TypeVar("TData", bound=BaseData)
1✔
31
TFile = TypeVar("TFile", bound=BaseFile)
1✔
32

33

34
class BaseDataset[TData: BaseData, TFile: BaseFile](Event, ABC):
1✔
35
    """Base class for Dataset objects.
36

37
    Datasets are collections of Data, with methods
38
    that simplify repeated operations on the data.
39
    """
40

41
    file_cls: type[TFile]
1✔
42

43
    def __init__(
        self,
        data: list[TData],
        name: str | None = None,
        suffix: str = "",
        folder: Path | None = None,
    ) -> None:
        """Build a dataset around a list of Data objects.

        Parameters
        ----------
        data: list[TData]
            The Data objects gathered in the dataset.
        name: str | None
            Base name of the dataset.
            If ``None``, the dataset is flagged as having a default name.
        suffix: str
            Suffix appended to the base name to form the full name.
        folder: Path | None
            Folder of the dataset files.

        """
        self.data = data
        self._name, self._has_default_name = name, name is None
        self._suffix, self._folder = suffix, folder
56

57
    def __str__(self) -> str:
        """Return the full name of the dataset (base name plus suffix, if any)."""
        full_name = self.name
        return full_name
60

61
    def __eq__(self, other: object) -> bool:
        """Return ``True`` if both datasets hold equal data.

        The data lists are compared after being sorted by ``(begin, end)``,
        so the order of the data within each dataset is irrelevant.
        Names, suffixes and folders are not part of the comparison.

        Comparing with an object that is not a ``BaseDataset`` returns
        ``NotImplemented`` (so Python falls back to its default handling)
        instead of raising an ``AttributeError`` on ``other.data``.
        """
        if not isinstance(other, BaseDataset):
            return NotImplemented

        def chronological(dataset: BaseDataset) -> list:
            # Same sort key on both sides so the lists line up element-wise.
            return sorted(dataset.data, key=lambda e: (e.begin, e.end))

        return chronological(self) == chronological(other)
67

68
    @property
    def base_name(self) -> str:
        """Name of the dataset, suffix excluded.

        Falls back to the begin timestamp of the dataset, formatted with
        ``TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED``, when no name is set.
        """
        if self._name is not None:
            return self._name
        return self.begin.strftime(TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED)

    @base_name.setter
    def base_name(self, name: str) -> None:
        """Set the name of the dataset without suffix."""
        self._name = name
80

81
    @property
    def name(self) -> str:
        """Full name of the dataset: the base name followed by ``_<suffix>`` if any."""
        if self.suffix:
            return f"{self.base_name}_{self.suffix}"
        return self.base_name

    @name.setter
    def name(self, name: str | None) -> None:
        """Set the base name of the dataset (the suffix is left untouched).

        Passing ``None`` makes ``base_name`` fall back to the
        timestamp-derived name.
        """
        self._name = name
89

90
    @property
    def suffix(self) -> str:
        """Suffix appended to the base name of the dataset.

        The public API uses it to tell apart several core_api datasets that
        are created simultaneously and share the same name, by suffixing each
        of them with its specific type, e.g. ``_audio`` or ``_spectro``.
        """
        current_suffix = self._suffix
        return current_suffix

    @suffix.setter
    def suffix(self, suffix: str | None) -> None:
        """Set the suffix appended to the base name of the dataset."""
        self._suffix = suffix
103

104
    @property
    def has_default_name(self) -> bool:
        """Whether the dataset was instantiated without an explicit name.

        The flag is computed once in ``__init__`` (``name is None``).
        """
        default_flag = self._has_default_name
        return default_flag
108

109
    @property
    def begin(self) -> Timestamp:
        """Earliest begin timestamp among the data of the dataset."""
        begins = (item.begin for item in self.data)
        return min(begins)
113

114
    @property
    def end(self) -> Timestamp:
        """Latest end timestamp among the data of the dataset."""
        ends = (item.end for item in self.data)
        return max(ends)
118

119
    @property
    def files(self) -> set[TFile]:
        """Set of every file referred to by at least one data of the dataset."""
        referenced = set()
        for item in self.data:
            referenced.update(item.files)
        return referenced
123

124
    @property
    def folder(self) -> Path:
        """Folder in which the dataset files are located or to be written.

        If no folder has been set explicitly, the parent folder of one of
        the dataset files is returned. ``None`` is returned when no folder
        is set and the dataset refers to no file.
        """
        if self._folder is not None:
            return self._folder
        for file in self.files:
            # Only one file is inspected: its parent folder is the fallback.
            return file.path.parent
        return None

    @folder.setter
    def folder(self, folder: Path) -> None:
        """Set the folder in which the dataset files might be written.

        Parameters
        ----------
        folder: Path
            The folder in which the dataset files might be written.

        """
        self._folder = folder
144

145
    def move_files(self, folder: Path) -> None:
        """Move every file of the dataset to ``folder``.

        The dataset folder is updated to ``folder`` once all files are moved.
        The progress bar is hidden when the ``DISABLE_TQDM`` environment
        variable is ``true``, ``1`` or ``t`` (case-insensitive).

        Parameters
        ----------
        folder: Path
            Destination folder in which the dataset files will be moved.

        """
        files_to_move = self.files
        tqdm_flag = os.getenv("DISABLE_TQDM", "False").lower()
        for file in tqdm(files_to_move, disable=tqdm_flag in ("true", "1", "t")):
            file.move(folder)
        self._folder = folder
160

161
    @property
    def data_duration(self) -> Timedelta:
        """Return the most frequent duration among the data of this dataset.

        Each duration is rounded to the nearest second before being counted.
        If several durations are equally frequent, the one encountered
        first in ``self.data`` is returned.

        Raises
        ------
        ValueError
            If the dataset contains no data.

        """
        # Single counting pass: avoids calling ``list.count`` once per
        # distinct duration, which was quadratic in the number of data.
        occurrences: dict[Timedelta, int] = {}
        for data in self.data:
            rounded = Timedelta(data.duration).round(freq="1s")
            occurrences[rounded] = occurrences.get(rounded, 0) + 1
        # ``max`` keeps the first maximal key, and dicts preserve insertion
        # order, hence the deterministic tie-break.
        return max(occurrences, key=occurrences.get)
172

173
    def remove_empty_data(self, threshold: float = 0.0) -> None:
        """Remove the data whose populated ratio does not exceed ``threshold``.

        Only the data with ``populated_ratio > threshold`` are kept.
        With the default threshold of ``0.``, the data with a populated
        ratio of ``0.`` are removed.

        Parameters
        ----------
        threshold: float
            Populated ratio that a data must strictly exceed to be kept.
            Must be in the ``[0.,1.]`` interval.

        Raises
        ------
        ValueError
            If ``threshold`` is outside the ``[0.,1.]`` interval.

        """
        is_valid_threshold = 0.0 <= threshold <= 1.0
        if not is_valid_threshold:
            msg = f"Threshold should be between 0 and 1. Got {threshold}"
            raise ValueError(msg)
        self.data = [item for item in self.data if item.populated_ratio > threshold]
188

189
    def write(
        self,
        folder: Path,
        first: int = 0,
        last: int | None = None,
        *,
        link: bool = False,
    ) -> None:
        """Write the data objects of the ``[first, last)`` slice in ``folder``.

        The progress bar is hidden when the ``DISABLE_TQDM`` environment
        variable is ``true``, ``1`` or ``t`` (case-insensitive).

        Parameters
        ----------
        folder: Path
            Folder in which to write the data.
        first: int
            Index of the first data object to write.
        last: int | None
            Index after the last data object to write.
            Defaulted to the number of data in the dataset.
        link: bool
            If ``True``, the Data will be bound to the written file.
            Its items will be replaced with a single item, which will match the whole
            new File.

        """
        stop = len(self.data) if last is None else last
        selection = self.data[first:stop]
        tqdm_flag = os.getenv("DISABLE_TQDM", "False").lower()
        for item in tqdm(selection, disable=tqdm_flag in ("true", "1", "t")):
            item.write(folder=folder, link=link)
219

220
    def to_dict(self) -> dict:
        """Serialize a ``BaseDataset`` to a dictionary.

        Returns
        -------
        dict:
            The serialized dictionary representing the ``BaseDataset``,
            with the ``"data"``, ``"name"``, ``"suffix"`` and ``"folder"`` keys.
            Each data is serialized under the key ``str(data)``.

        """
        serialized_data = {}
        for d in self.data:
            serialized_data[str(d)] = d.to_dict()
        # NOTE: the folder goes through ``str``, so an unset folder
        # (``self.folder`` returning ``None``) is serialized as ``"None"``.
        return {
            "data": serialized_data,
            "name": self._name,
            "suffix": self.suffix,
            "folder": str(self.folder),
        }
235

236
    @classmethod
    def from_dict(cls, dictionary: dict) -> Self:
        """Deserialize a ``BaseDataset`` from a dictionary.

        Parameters
        ----------
        dictionary: dict
            The serialized dictionary representing the ``BaseDataset``.
            It must hold the ``"data"``, ``"name"``, ``"suffix"``
            and ``"folder"`` keys.

        Returns
        -------
        Self
            The deserialized ``BaseDataset``.

        """
        return cls(
            data=cls._data_from_dict(dictionary["data"]),
            name=dictionary["name"],
            suffix=dictionary["suffix"],
            folder=Path(dictionary["folder"]),
        )
256

257
    @classmethod
    @abstractmethod
    def _data_from_dict(cls, dictionary: dict) -> list[TData]:
        """Return a list of Data from a serialized dictionary.

        Abstract: there is no implementation in this class.
        ``from_dict`` calls it with the ``"data"`` entry of the
        serialized dataset.
        """
262

263
    def write_json(self, folder: Path) -> None:
        """Write the serialized ``BaseDataset`` to ``<folder>/<name>.json``."""
        destination = folder / f"{self.name}.json"
        serialize_json(destination, self.to_dict())
266

267
    @classmethod
    def from_json(cls, file: Path) -> Self:
        """Deserialize a ``BaseDataset`` from a JSON file.

        Parameters
        ----------
        file: Path
            Path to the serialized JSON file representing the ``BaseDataset``.

        Returns
        -------
        Self
            The deserialized ``BaseDataset``.

        """
        serialized = deserialize_json(file)
        return cls.from_dict(serialized)
283

284
    @classmethod
    def from_files(  # noqa: PLR0913
        cls,
        files: list[TFile],
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
        data_duration: Timedelta | None = None,
        overlap: float = 0.0,
        name: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> Self:
        """Return a Dataset object from a list of Files.

        Parameters
        ----------
        files: list[TFile]
            The list of files contained in the Dataset.
        begin: Timestamp | None
            Begin of the first data object.
            Defaulted to the begin of the first file.
        end: Timestamp | None
            End of the last data object.
            Defaulted to the end of the last file.
        mode: Literal["files", "timedelta_total", "timedelta_file"]
            Mode of creation of the dataset data from the original files.
            ``"files"``: one data will be created for each file.
            ``"timedelta_total"``: data objects of duration equal to ``data_duration`` will
            be created from the ``begin`` timestamp to the ``end`` timestamp.
            ``"timedelta_file"``: data objects of duration equal to ``data_duration`` will
            be created from the beginning of the first file that the ``begin`` timestamp
            is into, until it would resume in a data beginning between two files.
            Then, the next data object will be created from the
            beginning of the next original file and so on.
            Any value other than ``"files"`` and ``"timedelta_total"`` is
            handled as ``"timedelta_file"``.
        data_duration: Timedelta | None
            Duration of the data objects.
            If mode is set to ``"files"``, this parameter has no effect.
            If provided, data will be evenly distributed between ``begin`` and ``end``.
            Else, one data object will cover the whole time period.
        overlap: float
            Overlap percentage between consecutive data.
        name: str|None
            Name of the dataset.
        kwargs:
            Keyword arguments to pass to the ``cls._data_from_files()`` method.

        Returns
        -------
        Self:
            The Dataset object.

        """
        if mode == "files":
            # One data per file; ``begin``, ``end`` and ``data_duration`` are unused.
            per_file = [cls._data_from_files([f], **kwargs) for f in files]
            return cls(data=BaseData.remove_overlaps(per_file), name=name)

        begin = begin or min(file.begin for file in files)
        end = end or max(file.end for file in files)

        if not data_duration:
            # No duration given: a single data covers the whole period.
            whole_period = cls._data_from_files(
                files=files,
                begin=begin,
                end=end,
                **kwargs,
            )
            return cls([whole_period], name=name)

        build_data = (
            cls._get_data_from_files_timedelta_total
            if mode == "timedelta_total"
            else cls._get_data_from_files_timedelta_file
        )
        data_base = build_data(
            begin=begin,
            end=end,
            data_duration=data_duration,
            files=files,
            overlap=overlap,
            **kwargs,
        )
        return cls(data_base, name=name)
370

371
    @classmethod
    @abstractmethod
    def _data_from_files(
        cls,
        files: list[TFile],
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        name: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> TData:
        """Return a Data object between two timestamps from a list of Files.

        Abstract: there is no implementation in this class.
        """
383

384
    @classmethod
    def _get_data_from_files_timedelta_total(
        cls,
        begin: Timestamp,
        end: Timestamp,
        data_duration: Timedelta,
        files: list[TFile],
        overlap: float = 0,
        **kwargs,  # noqa: ANN003
    ) -> list[TData]:
        """Return data of fixed duration laid out from ``begin`` to ``end``.

        Data begin every ``data_duration * (1 - overlap)`` from ``begin``
        (included) to ``end`` (excluded). Each data is built from the files,
        sorted by begin, that lie between the first file that does not end
        before the data begin and the last file that begins before the data end.

        Parameters
        ----------
        begin: Timestamp
            Begin of the first data.
        end: Timestamp
            Timestamp before which every data must begin.
        data_duration: Timedelta
            Duration of each data.
        files: list[TFile]
            Files from which the data are built.
        overlap: float
            Overlap ratio between consecutive data, in ``[0, 1)``.
        kwargs:
            Keyword arguments passed to ``cls._data_from_files()``.

        Raises
        ------
        ValueError
            If ``overlap`` is not in ``[0, 1)``.

        """
        if not 0 <= overlap < 1:
            msg = f"Overlap ({overlap}) must be between 0 and 1."
            raise ValueError(msg)

        sorted_files = sorted(files, key=lambda f: f.begin)
        nb_files = len(sorted_files)
        hop = data_duration * (1 - overlap)
        data_begins = date_range(begin, end, freq=hop, inclusive="left")
        tqdm_flag = os.getenv("DISABLE_TQDM", "False").lower()

        output = []
        # ``lower`` only moves forward: it is kept from one data to the next.
        lower = 0
        for data_begin in tqdm(data_begins, disable=tqdm_flag in ("true", "1", "t")):
            data_end = Timestamp(data_begin + data_duration)
            # Skip the files that end before the current data begins.
            while lower < nb_files and sorted_files[lower].end < data_begin:
                lower += 1
            # Extend the selection to every file beginning before the data ends.
            upper = lower
            while upper < nb_files and sorted_files[upper].begin < data_end:
                upper += 1
            output.append(
                cls._data_from_files(
                    sorted_files[lower:upper],
                    data_begin,
                    data_end,
                    **kwargs,
                ),
            )

        return output
429

430
    @classmethod
    def _get_data_from_files_timedelta_file(
        cls,
        begin: Timestamp,
        end: Timestamp,
        data_duration: Timedelta,
        files: list[TFile],
        overlap: float = 0,
        **kwargs,  # noqa: ANN003
    ) -> list[TData]:
        """Return data of fixed duration that restart at file beginnings.

        The files are sorted by begin. Starting from the last file that begins
        at or before ``begin``, consecutive files are gathered in a chunk as
        long as the next file does not begin after the limit returned by
        ``last_window_end`` for the current chunk. Data are then created every
        ``data_duration * (1 - overlap)`` from the begin of the first file of
        the chunk (included) to the end of its last file (excluded).
        The next chunk starts at the next file that is not in the current chunk.
        Only the files beginning at or before ``end`` can start a chunk.

        Parameters
        ----------
        begin: Timestamp
            Timestamp used to select the first file to consider.
        end: Timestamp
            Timestamp used to select the last file that can start a chunk.
        data_duration: Timedelta
            Duration of each data.
        files: list[TFile]
            Files from which the data are built.
        overlap: float
            Overlap ratio between consecutive data, in ``[0, 1)``.
        kwargs:
            Keyword arguments passed to ``cls._data_from_files()``.

        Raises
        ------
        ValueError
            If ``overlap`` is not in ``[0, 1)``.

        """
        if not 0 <= overlap < 1:
            msg = f"Overlap ({overlap}) must be between 0 and 1."
            raise ValueError(msg)

        files = sorted(files, key=lambda file: file.begin)
        first = max(0, bisect(files, begin, key=lambda f: f.begin) - 1)
        last = bisect(files, end, key=lambda f: f.begin)

        data_hop = data_duration * (1 - overlap)

        output = []
        files_chunk = []
        # ``start=first`` keeps ``idx`` an index into ``files`` itself.
        # Without it, ``idx`` is relative to the ``files[first:last]`` slice
        # and ``files[idx + 1 :]`` looks ahead from the wrong file whenever
        # ``first > 0``.
        for idx, file in tqdm(
            enumerate(files[first:last], start=first),
            disable=os.getenv("DISABLE_TQDM", "False").lower() in ("true", "1", "t"),
        ):
            if file in files_chunk:
                # Already covered by the chunk of a previous file.
                continue
            files_chunk = [file]

            for next_file in files[idx + 1 :]:
                upper_data_limit = last_window_end(
                    begin=file.begin,
                    end=files_chunk[-1].end,
                    window_hop=data_hop,
                    window_duration=data_duration,
                )
                if upper_data_limit < next_file.begin:
                    break
                files_chunk.append(next_file)

            output.extend(
                cls._data_from_files(
                    files,
                    data_begin,
                    data_begin + data_duration,
                    **kwargs,
                )
                for data_begin in date_range(
                    file.begin,
                    files_chunk[-1].end,
                    freq=data_hop,
                    inclusive="left",
                )
            )

        return output
487

488
    @classmethod
    def from_folder(  # noqa: PLR0913
        cls: type[Self],
        folder: Path,
        strptime_format: str | None,
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        timezone: str | pytz.timezone | None = None,
        mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
        overlap: float = 0.0,
        data_duration: Timedelta | None = None,
        first_file_begin: Timestamp | None = None,
        name: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> Self:
        """Return a Dataset from a folder containing the base files.

        The entries of the folder are parsed in sorted order. Entries whose
        extension is not supported by ``cls.file_cls`` are skipped silently;
        supported files that fail to be parsed are listed in a warning log.

        Parameters
        ----------
        folder: Path
            The folder containing the files.
        strptime_format: str | None
            The strptime format used in the filenames.
            It should use valid strftime codes (https://strftime.org/).
            If None, the first audio file of the folder will start
            at ``first_file_begin``, and each following file will start
            at the end of the previous one.
        begin: Timestamp | None
            The begin of the dataset.
            Defaulted to the begin of the first file.
        end: Timestamp | None
            The end of the dataset.
            Defaulted to the end of the last file.
        timezone: str | pytz.timezone | None
            The timezone in which the file should be localized.
            If None, the file begin/end will be tz-naive.
            If different from a timezone parsed from the filename, the timestamps'
            timezone will be converted from the parsed timezone
            to the specified timezone.
        mode: Literal["files", "timedelta_total", "timedelta_file"]
            Mode of creation of the dataset data from the original files.
            ``"files"``: one data will be created for each file.
            ``"timedelta_total"``: data objects of duration equal to ``data_duration`` will
            be created from the ``begin`` timestamp to the ``end`` timestamp.
            ``"timedelta_file"``: data objects of duration equal to ``data_duration`` will
            be created from the beginning of the first file that the ``begin`` timestamp
            is into, until it would resume in a data beginning between two files.
            Then, the next data object will be created from the
            beginning of the next original file and so on.
        overlap: float
            Overlap percentage between consecutive data.
        data_duration: Timedelta | None
            Duration of the data objects.
            If mode is set to ``"files"``, this parameter has no effect.
            If provided, data will be evenly distributed between ``begin`` and ``end``.
            Else, one object will cover the whole time period.
        first_file_begin: Timestamp | None
            Timestamp of the first audio file being processed.
            Defaulted to ``2020-01-01 00:00:00``.
            Will be ignored if ``strptime_format`` is specified.
        name: str|None
            Name of the dataset.
        kwargs:
            Keyword arguments to pass to the ``cls.from_files()`` method.

        Returns
        -------
        Self:
            The dataset.

        Raises
        ------
        FileNotFoundError
            If no file of the folder could be parsed.

        """
        valid_files = []
        rejected_files = []
        # Begin timestamp handed to the next file; only used by the parser
        # when ``strptime_format`` is None.
        next_file_begin = first_file_begin or Timestamp("2020-01-01 00:00:00")
        tqdm_flag = os.getenv("DISABLE_TQDM", "False").lower()

        for file in tqdm(
            sorted(folder.iterdir()),
            disable=tqdm_flag in ("true", "1", "t"),
        ):
            parsed = cls._parse_file(
                file=file,
                strptime_format=strptime_format,
                timezone=timezone,
                begin_timestamp=next_file_begin,
                valid_files=valid_files,
                rejected_files=rejected_files,
            )
            if parsed:
                # The next file starts where the one just parsed ends.
                next_file_begin = next_file_begin + valid_files[-1].duration

        if rejected_files:
            rejected_names = "\n\t".join(f.name for f in rejected_files)
            glc.logger.warning(
                f"The following files couldn't be parsed:\n\t{rejected_names}",
            )

        if not valid_files:
            msg = f"No valid file found in {folder}"
            raise FileNotFoundError(msg)

        return cls.from_files(
            files=valid_files,
            begin=begin,
            end=end,
            mode=mode,
            overlap=overlap,
            data_duration=data_duration,
            name=name,
            **kwargs,
        )
595

596
    @classmethod
    def _parse_file(
        cls: type[Self],
        file: Path,
        strptime_format: str,
        timezone: str | pytz.timezone | None,
        begin_timestamp: Timestamp,
        valid_files: list[TFile],
        rejected_files: list[Path],
    ) -> bool:
        """Try to instantiate a ``cls.file_cls`` from ``file``.

        On success, the new file object is appended to ``valid_files``.
        If the instantiation raises a ``ValueError`` or a ``LibsndfileError``,
        ``file`` is appended to ``rejected_files``.
        A file whose extension is not in ``cls.file_cls.supported_extensions``
        is ignored: it is added to neither list.

        Parameters
        ----------
        file: Path
            Path of the file to parse.
        strptime_format: str
            The strptime format passed to ``cls.file_cls``.
            If ``None``, ``begin_timestamp`` is passed as the file begin instead.
        timezone: str | pytz.timezone | None
            Timezone passed to ``cls.file_cls``.
        begin_timestamp: Timestamp
            Begin of the file, used only if ``strptime_format`` is ``None``.
        valid_files: list[TFile]
            List to which a successfully parsed file is appended.
        rejected_files: list[Path]
            List to which the path of a file that failed to be parsed is appended.

        Returns
        -------
        bool
            ``True`` if the file has been appended to ``valid_files``,
            ``False`` otherwise.

        """
        if file.suffix.lower() not in cls.file_cls.supported_extensions:
            return False

        if strptime_format is None:
            file_kwargs = {"begin": begin_timestamp, "timezone": timezone}
        else:
            file_kwargs = {"strptime_format": strptime_format, "timezone": timezone}

        try:
            parsed_file = cls.file_cls(file, **file_kwargs)
        except (ValueError, LibsndfileError):
            rejected_files.append(file)
            return False

        valid_files.append(parsed_file)
        return True
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc