21402000858

Committed 27 Jan 2026 02:56PM UTC coverage: 98.876% (+0.02%) from 98.861%

Build # 21402000858

Build Type

Pull #330

github

Committed by

web-flow

Commit Message

Merge 3fe889fc7 into 22c8efe15

Pull Request Pull Request #330: [DRAFT] Data populated ratio

Coverage Stats

85 of 85 new or added lines in 6 files covered. (100.0%)

1 existing line in 1 file now uncovered.

4750 of 4804 relevant lines covered (98.88%)

0.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.46

/src/osekit/core_api/base_dataset.py

"""``BaseDataset``: Base class for the Dataset objects.

Datasets are collections of Data, with methods
that simplify repeated operations on the data.
"""

from __future__ import annotations

import os
from abc import ABC, abstractmethod
from bisect import bisect
from collections import Counter
from pathlib import Path
from typing import TYPE_CHECKING, Literal, Self, TypeVar

from pandas import Timedelta, Timestamp, date_range
from soundfile import LibsndfileError
from tqdm import tqdm

from osekit.config import TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED
from osekit.config import global_logging_context as glc
from osekit.core_api.base_data import BaseData
from osekit.core_api.base_file import BaseFile
from osekit.core_api.event import Event
from osekit.core_api.json_serializer import deserialize_json, serialize_json
from osekit.utils.timestamp_utils import last_window_end

if TYPE_CHECKING:
    import pytz

TData = TypeVar("TData", bound=BaseData)
TFile = TypeVar("TFile", bound=BaseFile)


class BaseDataset[TData: BaseData, TFile: BaseFile](Event, ABC):
    """Base class for Dataset objects.

    Datasets are collections of Data, with methods
    that simplify repeated operations on the data.
    """

    file_cls: type[TFile]

    def __init__(
        self,
        data: list[TData],
        name: str | None = None,
        suffix: str = "",
        folder: Path | None = None,
    ) -> None:
        """Instantiate a Dataset object from the Data objects."""
        self.data = data
        self._name = name
        self._has_default_name = name is None
        self._suffix = suffix
        self._folder = folder

    def __str__(self) -> str:
        """Overwrite __str__."""
        return self.name

    def __eq__(self, other: BaseDataset) -> bool:
        """Overwrite __eq__."""
        return sorted(self.data, key=lambda e: (e.begin, e.end)) == sorted(
            other.data,
            key=lambda e: (e.begin, e.end),
        )

    @property
    def base_name(self) -> str:
        """Name of the dataset without suffix."""
        return (
            self.begin.strftime(TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED)
            if self._name is None
            else self._name
        )

    @base_name.setter
    def base_name(self, name: str) -> None:
        self._name = name

    @property
    def name(self) -> str:
        """Name of the dataset with suffix."""
        return self.base_name if not self.suffix else f"{self.base_name}_{self.suffix}"

    @name.setter
    def name(self, name: str | None) -> None:
        self._name = name

    @property
    def suffix(self) -> str:
        """Suffix that is applied to the name of the ads.

        This is used by the public API, for suffixing multiple core_api datasets
        that are created simultaneously and share the same namewith their specific type,
        e.g. ``_audio`` or ``_spectro``.
        """
        return self._suffix

    @suffix.setter
    def suffix(self, suffix: str | None) -> None:
        self._suffix = suffix

    @property
    def has_default_name(self) -> bool:
        """Return ``True`` if the dataset has a default name, ``False`` if it has a given name."""
        return self._has_default_name

    @property
    def begin(self) -> Timestamp:
        """Begin of the first data object."""
        return min(data.begin for data in self.data)

    @property
    def end(self) -> Timestamp:
        """End of the last data object."""
        return max(data.end for data in self.data)

    @property
    def files(self) -> set[TFile]:
        """All files referred to by the Dataset."""
        return {file for data in self.data for file in data.files}

    @property
    def folder(self) -> Path:
        """Folder in which the dataset files are located or to be written."""
        return (
            self._folder
            if self._folder is not None
            else next(iter(file.path.parent for file in self.files), None)
        )

    @folder.setter
    def folder(self, folder: Path) -> None:
        """Set the folder in which the dataset files might be written.

        Parameters
        ----------
        folder: Path
            The folder in which the dataset files might be written.

        """
        self._folder = folder

    def move_files(self, folder: Path) -> None:
        """Move every file of the dataset to ``folder``.

        The dataset folder is updated to ``folder`` once all files are moved.

        Parameters
        ----------
        folder: Path
            Destination folder in which the dataset files will be moved.

        """
        # The progress bar is hidden when DISABLE_TQDM is "true", "1" or "t".
        hide_progress = os.getenv("DISABLE_TQDM", "False").lower() in (
            "true",
            "1",
            "t",
        )
        files_to_move = self.files
        for dataset_file in tqdm(files_to_move, disable=hide_progress):
            dataset_file.move(folder)
        self._folder = folder

    @property
    def data_duration(self) -> Timedelta:
        """Return the most frequent duration among the data of this dataset.

        The duration is rounded to the nearest second.

        """
        data_durations = [
            Timedelta(data.duration).round(freq="1s") for data in self.data
        ]
        return max(set(data_durations), key=data_durations.count)

    def remove_empty_data(self, threshold: float = 0.0) -> None:
        """Remove data that has less than ``threshold`` % of non-empty duration.

        Parameters
        ----------
        threshold: float
            Threshold percentage of emptiness duration under which the
            data should be removed.
            Must be in the ``[0.,1.]`` interval.

        """
        if not 0.0 <= threshold <= 1.0:
            msg = f"Threshold should be between 0 and 1. Got {threshold}"
            raise ValueError(msg)
        self.data = [data for data in self.data if data.populated_ratio > threshold]

    def write(
        self,
        folder: Path,
        first: int = 0,
        last: int | None = None,
        *,
        link: bool = False,
    ) -> None:
        """Write the data objects of the dataset in the specified folder.

        Only the data objects within the ``[first, last)`` index range are
        written; by default, all of them are.

        Parameters
        ----------
        folder: Path
            Folder in which to write the data.
        first: int
            Index of the first data object to write.
        last: int | None
            Index after the last data object to write.
            If ``None``, data are written up to the last one.
        link: bool
            If ``True``, the Data will be bound to the written file.
            Its items will be replaced with a single item, which will match the whole
            new File.

        """
        # The progress bar is hidden when DISABLE_TQDM is "true", "1" or "t".
        hide_progress = os.getenv("DISABLE_TQDM", "False").lower() in (
            "true",
            "1",
            "t",
        )
        # Slicing up to ``None`` runs to the end of the list.
        selection = self.data[first:last]
        for item in tqdm(selection, disable=hide_progress):
            item.write(folder=folder, link=link)

    def to_dict(self) -> dict:
        """Serialize a ``BaseDataset`` to a dictionary.

        Returns
        -------
        dict:
            The serialized dictionary representing the ``BaseDataset``.

        """
        return {
            "data": {str(d): d.to_dict() for d in self.data},
            "name": self._name,
            "suffix": self.suffix,
            "folder": str(self.folder),
        }

    @classmethod
    def from_dict(cls, dictionary: dict) -> Self:
        """Deserialize a ``BaseDataset`` from a dictionary.

        Parameters
        ----------
        dictionary: dict
            The serialized dictionary representing the ``BaseDataset``.

        Returns
        -------
        AudioData
            The deserialized ``BaseDataset``.

        """
        data = cls._data_from_dict(dictionary["data"])
        name = dictionary["name"]
        suffix = dictionary["suffix"]
        folder = Path(dictionary["folder"])
        return cls(data=data, name=name, suffix=suffix, folder=folder)

    @classmethod
    @abstractmethod
    def _data_from_dict(cls, dictionary: dict) -> list[TData]:
        """Return a list of Data from a serialized dictionary.

        Abstract hook that subclasses must implement.
        ``from_dict()`` calls it with the value stored under the ``"data"`` key
        of the serialized dataset.

        Parameters
        ----------
        dictionary: dict
            The serialized data objects.

        Returns
        -------
        list[TData]
            The deserialized Data objects.

        """
        ...

    def write_json(self, folder: Path) -> None:
        """Serialize the dataset to the ``<name>.json`` file in ``folder``.

        Parameters
        ----------
        folder: Path
            Folder in which the JSON file is written.

        """
        json_path = folder / f"{self.name}.json"
        serialize_json(json_path, self.to_dict())

    @classmethod
    def from_json(cls, file: Path) -> Self:
        """Build a ``BaseDataset`` from a serialized JSON file.

        The file content is read with ``deserialize_json()``,
        then handed over to ``from_dict()``.

        Parameters
        ----------
        file: Path
            Path to the serialized JSON file representing the ``BaseDataset``.

        Returns
        -------
        BaseDataset
            The deserialized ``BaseDataset``.

        """
        serialized = deserialize_json(file)
        return cls.from_dict(serialized)

    @classmethod
    def from_files(  # noqa: PLR0913
        cls,
        files: list[TFile],
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
        data_duration: Timedelta | None = None,
        overlap: float = 0.0,
        name: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> Self:
        """Return a Dataset object from a list of Files.

        Parameters
        ----------
        files: list[TFile]
            The list of files contained in the Dataset.
        begin: Timestamp | None
            Begin of the first data object.
            Defaulted to the begin of the first file.
        end: Timestamp | None
            End of the last data object.
            Defaulted to the end of the last file.
        mode: Literal["files", "timedelta_total", "timedelta_file"]
            Mode of creation of the dataset data from the original files.
            ``"files"``: one data will be created for each file.
            ``"timedelta_total"``: data objects of duration equal to ``data_duration`` will
            be created from the ``begin`` timestamp to the ``end`` timestamp.
            ``"timedelta_file"``: data objects of duration equal to ``data_duration`` will
            be created from the beginning of the first file that the ``begin`` timestamp
            is into, until it would resume in a data beginning between two files.
            Then, the next data object will be created from the
            beginning of the next original file and so on.
        data_duration: Timedelta | None
            Duration of the data objects.
            If mode is set to ``"files"``, this parameter has no effect.
            If provided, data will be evenly distributed between ``begin`` and ``end``.
            Else, one data object will cover the whole time period.
        overlap: float
            Overlap percentage between consecutive data.
        name: str|None
            Name of the dataset.
        kwargs:
            Keyword arguments to pass to the ``cls.data_from_files()`` method.

        Returns
        -------
        Self:
            The Dataset object.

        """
        if mode == "files":
            data = [cls._data_from_files([f], **kwargs) for f in files]
            data = BaseData.remove_overlaps(data)
            return cls(data=data, name=name)

        if not begin:
            begin = min(file.begin for file in files)
        if not end:
            end = max(file.end for file in files)
        if data_duration:
            data_base = (
                cls._get_data_from_files_timedelta_total(
                    begin=begin,
                    end=end,
                    data_duration=data_duration,
                    files=files,
                    overlap=overlap,
                    **kwargs,
                )
                if mode == "timedelta_total"
                else cls._get_data_from_files_timedelta_file(
                    begin=begin,
                    end=end,
                    data_duration=data_duration,
                    files=files,
                    overlap=overlap,
                    **kwargs,
                )
            )
        else:
            data_base = [
                cls._data_from_files(files=files, begin=begin, end=end, **kwargs),
            ]
        return cls(data_base, name=name)

    @classmethod
    @abstractmethod
    def _data_from_files(
        cls,
        files: list[TFile],
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        name: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> TData:
        """Return a Data object between two timestamps from a list of Files.

        Abstract hook that subclasses must implement.
        It is called by ``from_files()`` and by the ``_get_data_from_files_*``
        helpers to create each data object of the dataset.

        Parameters
        ----------
        files: list[TFile]
            The files from which the data object is created.
        begin: Timestamp | None
            Begin of the data object.
        end: Timestamp | None
            End of the data object.
        name: str | None
            Name of the data object.
        kwargs:
            Additional keyword arguments forwarded by the callers.

        Returns
        -------
        TData
            The created data object.

        """
        ...

    @classmethod
    def _get_data_from_files_timedelta_total(
        cls,
        begin: Timestamp,
        end: Timestamp,
        data_duration: Timedelta,
        files: list[TFile],
        overlap: float = 0,
        **kwargs,  # noqa: ANN003
    ) -> list[TData]:
        """Return data of fixed duration regularly spread from ``begin`` to ``end``.

        A data object of duration ``data_duration`` is created every
        ``data_duration * (1 - overlap)``, starting at ``begin`` and as long as
        the data begin is strictly before ``end``.

        Parameters
        ----------
        begin: Timestamp
            Begin of the first data object.
        end: Timestamp
            Timestamp before which every data object begins.
        data_duration: Timedelta
            Duration of each data object.
        files: list[TFile]
            The files from which the data objects are created.
        overlap: float
            Overlap ratio between consecutive data, in the ``[0.,1.)`` interval.
        kwargs:
            Keyword arguments to pass to the ``cls._data_from_files()`` method.

        Returns
        -------
        list[TData]
            The data objects, in chronological order.

        Raises
        ------
        ValueError
            If ``overlap`` is outside of the ``[0.,1.)`` interval.

        """
        if not 0 <= overlap < 1:
            msg = f"Overlap ({overlap}) must be between 0 and 1."
            raise ValueError(msg)

        ordered_files = sorted(files, key=lambda f: f.begin)
        file_count = len(ordered_files)
        hop = data_duration * (1 - overlap)
        window_begins = date_range(begin, end, freq=hop, inclusive="left")
        # The progress bar is hidden when DISABLE_TQDM is "true", "1" or "t".
        hide_progress = os.getenv("DISABLE_TQDM", "False").lower() in (
            "true",
            "1",
            "t",
        )

        windows = []
        lower = 0  # Index of the first file that may intersect the window.
        for window_begin in tqdm(window_begins, disable=hide_progress):
            window_end = Timestamp(window_begin + data_duration)
            # Window begins only increase, so the files that end before the
            # current window begin can be dropped once and for all.
            while lower < file_count and ordered_files[lower].end < window_begin:
                lower += 1
            # Then extend the selection to every file that begins before
            # the end of the window.
            upper = lower
            while upper < file_count and ordered_files[upper].begin < window_end:
                upper += 1
            windows.append(
                cls._data_from_files(
                    ordered_files[lower:upper],
                    window_begin,
                    window_end,
                    **kwargs,
                ),
            )

        return windows

    @classmethod
    def _get_data_from_files_timedelta_file(
        cls,
        begin: Timestamp,
        end: Timestamp,
        data_duration: Timedelta,
        files: list[TFile],
        overlap: float = 0,
        **kwargs,  # noqa: ANN003
    ) -> list[TData]:
        """Return data of fixed duration that restart at file beginnings.

        The files are sorted by begin, and only the files from the last one
        beginning at or before ``begin`` up to the last one beginning at or
        before ``end`` are used as starting points.
        From each starting file, a chunk is grown with the following files for
        as long as ``last_window_end()`` does not fall before the next file
        begin. Data objects are then created every
        ``data_duration * (1 - overlap)`` from the begin of the starting file
        up to the end of the last file of the chunk.
        Files already gathered in the current chunk do not start a new chunk.

        Parameters
        ----------
        begin: Timestamp
            Timestamp used to select the first starting file.
        end: Timestamp
            Timestamp used to select the last starting file.
        data_duration: Timedelta
            Duration of each data object.
        files: list[TFile]
            The files from which the data objects are created.
        overlap: float
            Overlap ratio between consecutive data, in the ``[0.,1.)`` interval.
        kwargs:
            Keyword arguments to pass to the ``cls._data_from_files()`` method.

        Returns
        -------
        list[TData]
            The data objects.

        Raises
        ------
        ValueError
            If ``overlap`` is outside of the ``[0.,1.)`` interval.

        """
        if not 0 <= overlap < 1:
            msg = f"Overlap ({overlap}) must be between 0 and 1."
            raise ValueError(msg)

        files = sorted(files, key=lambda file: file.begin)
        first = max(0, bisect(files, begin, key=lambda f: f.begin) - 1)
        last = bisect(files, end, key=lambda f: f.begin)

        data_hop = data_duration * (1 - overlap)

        output = []
        files_chunk = []
        # ``start=first`` keeps ``idx`` an index in ``files`` rather than in the
        # ``files[first:last]`` slice: the look-ahead below indexes ``files``,
        # and would otherwise scan the wrong files whenever ``first > 0``.
        for idx, file in tqdm(
            enumerate(files[first:last], start=first),
            disable=os.getenv("DISABLE_TQDM", "False").lower() in ("true", "1", "t"),
        ):
            if file in files_chunk:
                continue
            files_chunk = [file]

            for next_file in files[idx + 1 :]:
                upper_data_limit = last_window_end(
                    begin=file.begin,
                    end=files_chunk[-1].end,
                    window_hop=data_hop,
                    window_duration=data_duration,
                )
                # The chunk stops as soon as its windows no longer reach
                # the begin of the next file.
                if upper_data_limit < next_file.begin:
                    break
                files_chunk.append(next_file)

            output.extend(
                cls._data_from_files(
                    files,
                    data_begin,
                    data_begin + data_duration,
                    **kwargs,
                )
                for data_begin in date_range(
                    file.begin,
                    files_chunk[-1].end,
                    freq=data_hop,
                    inclusive="left",
                )
            )

        return output

    @classmethod
    def from_folder(  # noqa: PLR0913
        cls: type[Self],
        folder: Path,
        strptime_format: str | None,
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        timezone: str | pytz.timezone | None = None,
        mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
        overlap: float = 0.0,
        data_duration: Timedelta | None = None,
        first_file_begin: Timestamp | None = None,
        name: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> Self:
        """Return a Dataset built from the supported files found in a folder.

        The entries of the folder are parsed in sorted order.
        Entries with an unsupported extension are silently skipped, and the
        files that could not be parsed are listed in a logged warning.

        Parameters
        ----------
        folder: Path
            The folder containing the files.
        strptime_format: str | None
            The strptime format used in the filenames.
            It should use valid strftime codes (https://strftime.org/).
            If None, the first audio file of the folder will start
            at ``first_file_begin``, and each following file will start
            at the end of the previous one.
        begin: Timestamp | None
            The begin of the dataset.
            Defaulted to the begin of the first file.
        end: Timestamp | None
            The end of the dataset.
            Defaulted to the end of the last file.
        timezone: str | pytz.timezone | None
            The timezone in which the file should be localized.
            If None, the file begin/end will be tz-naive.
            If different from a timezone parsed from the filename, the timestamps'
            timezone will be converted from the parsed timezone
            to the specified timezone.
        mode: Literal["files", "timedelta_total", "timedelta_file"]
            Mode of creation of the dataset data from the original files.
            ``"files"``: one data will be created for each file.
            ``"timedelta_total"``: data objects of duration equal to ``data_duration`` will
            be created from the ``begin`` timestamp to the ``end`` timestamp.
            ``"timedelta_file"``: data objects of duration equal to ``data_duration`` will
            be created from the beginning of the first file that the ``begin`` timestamp
            is into, until it would resume in a data beginning between two files.
            Then, the next data object will be created from the
            beginning of the next original file and so on.
        overlap: float
            Overlap percentage between consecutive data.
        data_duration: Timedelta | None
            Duration of the data objects.
            If mode is set to ``"files"``, this parameter has no effect.
            If provided, data will be evenly distributed between ``begin`` and ``end``.
            Else, one object will cover the whole time period.
        first_file_begin: Timestamp | None
            Timestamp of the first audio file being processed.
            Defaulted to ``2020-01-01 00:00:00``.
            Will be ignored if ``strptime_format`` is specified.
        name: str|None
            Name of the dataset.
        kwargs:
            Keyword arguments to pass to the ``cls.from_files()`` method.

        Returns
        -------
        Self:
            The dataset.

        Raises
        ------
        FileNotFoundError
            If no valid file is found in ``folder``.

        """
        accepted = []
        rejected = []
        # Begin timestamp handed to the next file; only advanced by the
        # duration of the files that are successfully parsed.
        next_begin = first_file_begin or Timestamp("2020-01-01 00:00:00")
        # The progress bar is hidden when DISABLE_TQDM is "true", "1" or "t".
        hide_progress = os.getenv("DISABLE_TQDM", "False").lower() in (
            "true",
            "1",
            "t",
        )

        for candidate in tqdm(sorted(folder.iterdir()), disable=hide_progress):
            parsed = cls._parse_file(
                file=candidate,
                strptime_format=strptime_format,
                timezone=timezone,
                begin_timestamp=next_begin,
                valid_files=accepted,
                rejected_files=rejected,
            )
            if not parsed:
                continue
            next_begin += accepted[-1].duration

        if rejected:
            rejected_names = "\n\t".join(f.name for f in rejected)
            glc.logger.warning(
                f"The following files couldn't be parsed:\n\t{rejected_names}",
            )

        if not accepted:
            msg = f"No valid file found in {folder}"
            raise FileNotFoundError(msg)

        return cls.from_files(
            files=accepted,
            begin=begin,
            end=end,
            mode=mode,
            overlap=overlap,
            data_duration=data_duration,
            name=name,
            **kwargs,
        )

    @classmethod
    def _parse_file(
        cls: type[Self],
        file: Path,
        strptime_format: str,
        timezone: str | pytz.timezone | None,
        begin_timestamp: Timestamp,
        valid_files: list[TFile],
        rejected_files: list[Path],
    ) -> bool:
        if file.suffix.lower() not in cls.file_cls.supported_extensions:
            return False
        try:
            if strptime_format is None:
                f = cls.file_cls(file, begin=begin_timestamp, timezone=timezone)
            else:
                f = cls.file_cls(
                    file,
                    strptime_format=strptime_format,
                    timezone=timezone,
                )
            valid_files.append(f)
        except (ValueError, LibsndfileError):
            rejected_files.append(file)
            return False
        else:
            return True

1	"""``BaseDataset``: Base class for the Dataset objects.
2
3	Datasets are collections of Data, with methods
4	that simplify repeated operations on the data.
5	"""
6
7	from __future__ import annotations	1✔
8
9	import os	1✔
10	from abc import ABC, abstractmethod	1✔
11	from bisect import bisect	1✔
12	from pathlib import Path	1✔
13	from typing import TYPE_CHECKING, Literal, Self, TypeVar	1✔
14
15	from pandas import Timedelta, Timestamp, date_range	1✔
16	from soundfile import LibsndfileError	1✔
17	from tqdm import tqdm	1✔
18
19	from osekit.config import TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED	1✔
20	from osekit.config import global_logging_context as glc	1✔
21	from osekit.core_api.base_data import BaseData	1✔
22	from osekit.core_api.base_file import BaseFile	1✔
23	from osekit.core_api.event import Event	1✔
24	from osekit.core_api.json_serializer import deserialize_json, serialize_json	1✔
25	from osekit.utils.timestamp_utils import last_window_end	1✔
26
27	if TYPE_CHECKING:
28	import pytz
29
30	TData = TypeVar("TData", bound=BaseData)	1✔
31	TFile = TypeVar("TFile", bound=BaseFile)	1✔
32
33
34	class BaseDataset[TData: BaseData, TFile: BaseFile](Event, ABC):	1✔
35	"""Base class for Dataset objects.
36
37	Datasets are collections of Data, with methods
38	that simplify repeated operations on the data.
39	"""
40
41	file_cls: type[TFile]	1✔
42
43	def __init__(	1✔
44	self,
45	data: list[TData],
46	name: str \| None = None,
47	suffix: str = "",
48	folder: Path \| None = None,
49	) -> None:
50	"""Instantiate a Dataset object from the Data objects."""
51	self.data = data	1✔
52	self._name = name	1✔
53	self._has_default_name = name is None	1✔
54	self._suffix = suffix	1✔
55	self._folder = folder	1✔
56
57	def __str__(self) -> str:	1✔
58	"""Overwrite __str__."""
59	return self.name	1✔
60
61	def __eq__(self, other: BaseDataset) -> bool:	1✔
62	"""Overwrite __eq__."""
63	return sorted(self.data, key=lambda e: (e.begin, e.end)) == sorted(	1✔
64	other.data,
65	key=lambda e: (e.begin, e.end),
66	)
67
68	@property	1✔
69	def base_name(self) -> str:	1✔
70	"""Name of the dataset without suffix."""
71	return (	1✔
72	self.begin.strftime(TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED)
73	if self._name is None
74	else self._name
75	)
76
77	@base_name.setter	1✔
78	def base_name(self, name: str) -> None:	1✔
79	self._name = name	1✔
80
81	@property	1✔
82	def name(self) -> str:	1✔
83	"""Name of the dataset with suffix."""
84	return self.base_name if not self.suffix else f"{self.base_name}_{self.suffix}"	1✔
85
86	@name.setter	1✔
87	def name(self, name: str \| None) -> None:	1✔
88	self._name = name	1✔
89
90	@property	1✔
91	def suffix(self) -> str:	1✔
92	"""Suffix that is applied to the name of the ads.
93
94	This is used by the public API, for suffixing multiple core_api datasets
95	that are created simultaneously and share the same namewith their specific type,
96	e.g. ``_audio`` or ``_spectro``.
97	"""
98	return self._suffix	1✔
99
100	@suffix.setter	1✔
101	def suffix(self, suffix: str \| None) -> None:	1✔
102	self._suffix = suffix	1✔
103
104	@property	1✔
105	def has_default_name(self) -> bool:	1✔
106	"""Return ``True`` if the dataset has a default name, ``False`` if it has a given name."""
107	return self._has_default_name	1✔
108
109	@property	1✔
110	def begin(self) -> Timestamp:	1✔
111	"""Begin of the first data object."""
112	return min(data.begin for data in self.data)	1✔
113
114	@property	1✔
115	def end(self) -> Timestamp:	1✔
116	"""End of the last data object."""
117	return max(data.end for data in self.data)	1✔
118
119	@property	1✔
120	def files(self) -> set[TFile]:	1✔
121	"""All files referred to by the Dataset."""
122	return {file for data in self.data for file in data.files}	1✔
123
124	@property	1✔
125	def folder(self) -> Path:	1✔
126	"""Folder in which the dataset files are located or to be written."""
127	return (	1✔
128	self._folder
129	if self._folder is not None
130	else next(iter(file.path.parent for file in self.files), None)
131	)
132
133	@folder.setter	1✔
134	def folder(self, folder: Path) -> None:	1✔
135	"""Set the folder in which the dataset files might be written.
136
137	Parameters
138	----------
139	folder: Path
140	The folder in which the dataset files might be written.
141
142	"""
143	self._folder = folder	1✔
144
145	def move_files(self, folder: Path) -> None:	1✔
146	"""Move the dataset files to the destination folder.
147
148	Parameters
149	----------
150	folder: Path
151	Destination folder in which the dataset files will be moved.
152
153	"""
154	for file in tqdm(	1✔
155	self.files,
156	disable=os.getenv("DISABLE_TQDM", "False").lower() in ("true", "1", "t"),
157	):
158	file.move(folder)	1✔
159	self._folder = folder	1✔
160
161	@property	1✔
162	def data_duration(self) -> Timedelta:	1✔
163	"""Return the most frequent duration among the data of this dataset.
164
165	The duration is rounded to the nearest second.
166
167	"""
168	data_durations = [	1✔
169	Timedelta(data.duration).round(freq="1s") for data in self.data
170	]
171	return max(set(data_durations), key=data_durations.count)	1✔
172
173	def remove_empty_data(self, threshold: float = 0.0) -> None:	1✔
174	"""Remove data that has less than ``threshold`` % of non-empty duration.
175
176	Parameters
177	----------
178	threshold: float
179	Threshold percentage of emptiness duration under which the
180	data should be removed.
181	Must be in the ``[0.,1.]`` interval.
182
183	"""
184	if not 0.0 <= threshold <= 1.0:	1✔
185	msg = f"Threshold should be between 0 and 1. Got {threshold}"	1✔
186	raise ValueError(msg)	1✔
187	self.data = [data for data in self.data if data.populated_ratio > threshold]	1✔
188
189	def write(	1✔
190	self,
191	folder: Path,
192	first: int = 0,
193	last: int \| None = None,
194	*,
195	link: bool = False,
196	) -> None:
197	"""Write all data objects in the specified folder.
198
199	Parameters
200	----------
201	folder: Path
202	Folder in which to write the data.
203	link: bool
204	If ``True``, the Data will be bound to the written file.
205	Its items will be replaced with a single item, which will match the whole
206	new File.
207	first: int
208	Index of the first data object to write.
209	last: int \| None
210	Index after the last data object to write.
211
212	"""
213	last = len(self.data) if last is None else last	1✔
214	for data in tqdm(	1✔
215	self.data[first:last],
216	disable=os.getenv("DISABLE_TQDM", "False").lower() in ("true", "1", "t"),
217	):
218	data.write(folder=folder, link=link)	1✔
219
220	def to_dict(self) -> dict:	1✔
221	"""Serialize a ``BaseDataset`` to a dictionary.
222
223	Returns
224	-------
225	dict:
226	The serialized dictionary representing the ``BaseDataset``.
227
228	"""
229	return {	1✔
230	"data": {str(d): d.to_dict() for d in self.data},
231	"name": self._name,
232	"suffix": self.suffix,
233	"folder": str(self.folder),
234	}
235
236	@classmethod	1✔
237	def from_dict(cls, dictionary: dict) -> Self:	1✔
238	"""Deserialize a ``BaseDataset`` from a dictionary.
239
240	Parameters
241	----------
242	dictionary: dict
243	The serialized dictionary representing the ``BaseDataset``.
244
245	Returns
246	-------
247	AudioData
248	The deserialized ``BaseDataset``.
249
250	"""
251	data = cls._data_from_dict(dictionary["data"])	1✔
252	name = dictionary["name"]	1✔
253	suffix = dictionary["suffix"]	1✔
254	folder = Path(dictionary["folder"])	1✔
255	return cls(data=data, name=name, suffix=suffix, folder=folder)	1✔
256
257	@classmethod	1✔
258	@abstractmethod	1✔
259	def _data_from_dict(cls, dictionary: dict) -> list[TData]:	1✔
260	"""Return a list of Data from a serialized dictionary."""
261	...
262
263	def write_json(self, folder: Path) -> None:	1✔
264	"""Write a serialized ``BaseDataset`` to a JSON file."""
265	serialize_json(folder / f"{self.name}.json", self.to_dict())	1✔
266
267	@classmethod	1✔
268	def from_json(cls, file: Path) -> Self:	1✔
269	"""Deserialize a ``BaseDataset`` from a JSON file.
270
271	Parameters
272	----------
273	file: Path
274	Path to the serialized JSON file representing the ``BaseDataset``.
275
276	Returns
277	-------
278	BaseDataset
279	The deserialized ``BaseDataset``.
280
281	"""
UNCOV 282	return cls.from_dict(deserialize_json(file))	×
283
284	@classmethod	1✔
285	def from_files( # noqa: PLR0913	1✔
286	cls,
287	files: list[TFile],
288	begin: Timestamp \| None = None,
289	end: Timestamp \| None = None,
290	mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
291	data_duration: Timedelta \| None = None,
292	overlap: float = 0.0,
293	name: str \| None = None,
294	**kwargs, # noqa: ANN003
295	) -> Self:
296	"""Return a Dataset object from a list of Files.
297
298	Parameters
299	----------
300	files: list[TFile]
301	The list of files contained in the Dataset.
302	begin: Timestamp \| None
303	Begin of the first data object.
304	Defaulted to the begin of the first file.
305	end: Timestamp \| None
306	End of the last data object.
307	Defaulted to the end of the last file.
308	mode: Literal["files", "timedelta_total", "timedelta_file"]
309	Mode of creation of the dataset data from the original files.
310	``"files"``: one data will be created for each file.
311	``"timedelta_total"``: data objects of duration equal to ``data_duration`` will
312	be created from the ``begin`` timestamp to the ``end`` timestamp.
313	``"timedelta_file"``: data objects of duration equal to ``data_duration`` will
314	be created from the beginning of the first file that the ``begin`` timestamp
315	is into, until it would resume in a data beginning between two files.
316	Then, the next data object will be created from the
317	beginning of the next original file and so on.
318	data_duration: Timedelta \| None
319	Duration of the data objects.
320	If mode is set to ``"files"``, this parameter has no effect.
321	If provided, data will be evenly distributed between ``begin`` and ``end``.
322	Else, one data object will cover the whole time period.
323	overlap: float
324	Overlap percentage between consecutive data.
325	name: str\|None
326	Name of the dataset.
327	kwargs:
328	Keyword arguments to pass to the ``cls.data_from_files()`` method.
329
330	Returns
331	-------
332	Self:
333	The Dataset object.
334
335	"""
336	if mode == "files":	1✔
337	data = [cls._data_from_files([f], **kwargs) for f in files]	1✔
338	data = BaseData.remove_overlaps(data)	1✔
339	return cls(data=data, name=name)	1✔
340
341	if not begin:	1✔
342	begin = min(file.begin for file in files)	1✔
343	if not end:	1✔
344	end = max(file.end for file in files)	1✔
345	if data_duration:	1✔
346	data_base = (	1✔
347	cls._get_data_from_files_timedelta_total(
348	begin=begin,
349	end=end,
350	data_duration=data_duration,
351	files=files,
352	overlap=overlap,
353	**kwargs,
354	)
355	if mode == "timedelta_total"
356	else cls._get_data_from_files_timedelta_file(
357	begin=begin,
358	end=end,
359	data_duration=data_duration,
360	files=files,
361	overlap=overlap,
362	**kwargs,
363	)
364	)
365	else:
366	data_base = [	1✔
367	cls._data_from_files(files=files, begin=begin, end=end, **kwargs),
368	]
369	return cls(data_base, name=name)	1✔
370
@classmethod
@abstractmethod
def _data_from_files(
    cls,
    files: list[TFile],
    begin: Timestamp | None = None,
    end: Timestamp | None = None,
    name: str | None = None,
    **kwargs,  # noqa: ANN003
) -> TData:
    """Return a single Data object built from a list of Files.

    Abstract hook that concrete dataset classes must implement. The
    dataset factory methods of this class call it once per data object
    they create, forwarding their own ``**kwargs`` unchanged.

    Parameters
    ----------
    files: list[TFile]
        The files from which the data object is built.
    begin: Timestamp | None
        Begin timestamp of the data object.
        The behaviour when ``None`` is defined by the implementing class.
    end: Timestamp | None
        End timestamp of the data object.
        The behaviour when ``None`` is defined by the implementing class.
    name: str | None
        Name of the data object.
    kwargs:
        Additional keyword arguments understood by the implementing class.

    Returns
    -------
    TData:
        The data object.

    """
    ...
383
@classmethod
def _get_data_from_files_timedelta_total(
    cls,
    begin: Timestamp,
    end: Timestamp,
    data_duration: Timedelta,
    files: list[TFile],
    overlap: float = 0,
    **kwargs,  # noqa: ANN003
) -> list[TData]:
    """Return fixed-duration data objects spread from ``begin`` to ``end``.

    Windows start at ``begin`` and are spaced by
    ``data_duration * (1 - overlap)``; the last window starts strictly
    before ``end``. Each window lasts ``data_duration``.

    Parameters
    ----------
    begin: Timestamp
        Begin of the first data object.
    end: Timestamp
        Upper bound (excluded) of the data objects begin timestamps.
    data_duration: Timedelta
        Duration of each data object.
    files: list[TFile]
        Files from which the data objects are built.
        They are sorted by begin timestamp before use.
    overlap: float
        Overlap ratio between consecutive data objects, in ``[0, 1)``.
    kwargs:
        Keyword arguments forwarded to ``cls._data_from_files()``.

    Returns
    -------
    list[TData]:
        One data object per window, in chronological order.

    Raises
    ------
    ValueError
        If ``overlap`` is not in ``[0, 1)``.

    """
    overlap_is_valid = 0 <= overlap < 1
    if not overlap_is_valid:
        msg = f"Overlap ({overlap}) must be between 0 and 1."
        raise ValueError(msg)

    ordered_files = sorted(files, key=lambda f: f.begin)
    hop = data_duration * (1 - overlap)
    nb_files = len(ordered_files)

    window_begins = date_range(begin, end, freq=hop, inclusive="left")
    hide_progress = os.getenv("DISABLE_TQDM", "False").lower() in ("true", "1", "t")

    windows = []
    # Index of the first file that may still intersect the current window.
    # It only ever moves forward since windows are visited chronologically.
    lower = 0
    for window_begin in tqdm(window_begins, disable=hide_progress):
        window_end = Timestamp(window_begin + data_duration)

        # Drop the leading files that end before the window begins.
        while lower < nb_files and ordered_files[lower].end < window_begin:
            lower += 1

        # Extend the slice over every file that begins before the window ends.
        upper = lower
        while upper < nb_files and ordered_files[upper].begin < window_end:
            upper += 1

        window_data = cls._data_from_files(
            ordered_files[lower:upper],
            window_begin,
            window_end,
            **kwargs,
        )
        windows.append(window_data)

    return windows
429
@classmethod
def _get_data_from_files_timedelta_file(
    cls,
    begin: Timestamp,
    end: Timestamp,
    data_duration: Timedelta,
    files: list[TFile],
    overlap: float = 0,
    **kwargs,  # noqa: ANN003
) -> list[TData]:
    """Return fixed-duration data objects aligned on the files begin.

    The files are sorted by begin timestamp, then grouped in chunks:
    a chunk starts on a file and grows with the following files as long
    as the windows started in the chunk would reach the next file.
    Windows are then created from the begin of the chunk's first file,
    spaced by ``data_duration * (1 - overlap)``, with window begins strictly
    before the end of the chunk's last file. The next chunk starts on the
    first file that is not part of the previous chunk.

    Parameters
    ----------
    begin: Timestamp
        Timestamp selecting the first considered file: the last file
        that begins at or before ``begin`` (or the first file if none does).
    end: Timestamp
        Files beginning after ``end`` do not start a chunk.
    data_duration: Timedelta
        Duration of each data object.
    files: list[TFile]
        Files from which the data objects are built.
    overlap: float
        Overlap ratio between consecutive data objects, in ``[0, 1)``.
    kwargs:
        Keyword arguments forwarded to ``cls._data_from_files()``.

    Returns
    -------
    list[TData]:
        The data objects, chunk after chunk.

    Raises
    ------
    ValueError
        If ``overlap`` is not in ``[0, 1)``.

    """
    if not 0 <= overlap < 1:
        msg = f"Overlap ({overlap}) must be between 0 and 1."
        raise ValueError(msg)

    files = sorted(files, key=lambda file: file.begin)
    # Index of the last file beginning at or before ``begin`` (clamped to 0).
    first = max(0, bisect(files, begin, key=lambda f: f.begin) - 1)
    # Number of files beginning at or before ``end``.
    last = bisect(files, end, key=lambda f: f.begin)

    data_hop = data_duration * (1 - overlap)

    output = []
    files_chunk = []
    # ``start=first`` keeps ``idx`` an index into ``files`` rather than into
    # the ``files[first:last]`` slice: the look-ahead below slices ``files``
    # with it, and would otherwise restart from the wrong file when
    # ``first > 0``.
    for idx, file in tqdm(
        enumerate(files[first:last], start=first),
        disable=os.getenv("DISABLE_TQDM", "False").lower() in ("true", "1", "t"),
    ):
        # Files already absorbed by the previous chunk don't start a new one.
        if file in files_chunk:
            continue
        files_chunk = [file]

        for next_file in files[idx + 1 :]:
            # Presumably the end of the last window that would be created
            # within the current chunk (see ``last_window_end``).
            upper_data_limit = last_window_end(
                begin=file.begin,
                end=files_chunk[-1].end,
                window_hop=data_hop,
                window_duration=data_duration,
            )
            if upper_data_limit < next_file.begin:
                break
            files_chunk.append(next_file)

        # NOTE(review): the whole sorted file list is passed here, not only
        # ``files_chunk``; kept as in the original implementation.
        output.extend(
            cls._data_from_files(
                files,
                data_begin,
                data_begin + data_duration,
                **kwargs,
            )
            for data_begin in date_range(
                file.begin,
                files_chunk[-1].end,
                freq=data_hop,
                inclusive="left",
            )
        )

    return output
487
@classmethod
def from_folder(  # noqa: PLR0913
    cls: type[Self],
    folder: Path,
    strptime_format: str | None,
    begin: Timestamp | None = None,
    end: Timestamp | None = None,
    timezone: str | pytz.timezone | None = None,
    mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
    overlap: float = 0.0,
    data_duration: Timedelta | None = None,
    first_file_begin: Timestamp | None = None,
    name: str | None = None,
    **kwargs,  # noqa: ANN003
) -> Self:
    """Build a Dataset from the files found in a folder.

    The folder entries are visited in sorted order. Entries whose extension
    is not supported by ``cls.file_cls`` are ignored; entries that fail to
    be parsed are reported in a single warning log. The dataset is then
    created from the valid files with ``cls.from_files()``.

    Parameters
    ----------
    folder: Path
        The folder containing the files.
    strptime_format: str | None
        The strptime format used in the filenames.
        It should use valid strftime codes (https://strftime.org/).
        If None, the first valid file of the folder will start
        at ``first_file_begin``, and each following file will start
        at the end of the previous one.
    begin: Timestamp | None
        The begin of the dataset.
        Defaulted to the begin of the first file.
    end: Timestamp | None
        The end of the dataset.
        Defaulted to the end of the last file.
    timezone: str | pytz.timezone | None
        The timezone in which the files should be localized.
        If None, the file begin/end will be tz-naive.
        If different from a timezone parsed from the filename,
        the timestamps' timezone will be converted from the parsed
        timezone to the specified timezone.
    mode: Literal["files", "timedelta_total", "timedelta_file"]
        Mode of creation of the dataset data from the original files.
        ``"files"``: one data will be created for each file.
        ``"timedelta_total"``: data objects of duration equal to
        ``data_duration`` will be created from the ``begin`` timestamp
        to the ``end`` timestamp.
        ``"timedelta_file"``: data objects of duration equal to
        ``data_duration`` will be created from the beginning of the
        first file that the ``begin`` timestamp is into, until it would
        resume in a data beginning between two files. Then, the next
        data object will be created from the beginning of the next
        original file and so on.
    overlap: float
        Overlap percentage between consecutive data.
    data_duration: Timedelta | None
        Duration of the data objects.
        If mode is set to ``"files"``, this parameter has no effect.
        If provided, data will be evenly distributed between
        ``begin`` and ``end``.
        Else, one object will cover the whole time period.
    first_file_begin: Timestamp | None
        Timestamp of the first file being processed.
        Defaulted to ``2020-01-01 00:00:00``.
        Will be ignored if ``strptime_format`` is specified.
    name: str | None
        Name of the dataset.
    kwargs:
        Keyword arguments to pass to the ``cls.from_files()`` method.

    Returns
    -------
    Self:
        The dataset.

    Raises
    ------
    FileNotFoundError
        If the folder contains no valid file.

    """
    accepted = []
    unparsed = []
    # Begin timestamp handed to the next file; only meaningful when the
    # filenames carry no timestamp (``strptime_format`` is None).
    next_begin = first_file_begin or Timestamp("2020-01-01 00:00:00")
    hide_progress = os.getenv("DISABLE_TQDM", "False").lower() in ("true", "1", "t")

    for path in tqdm(sorted(folder.iterdir()), disable=hide_progress):
        parsed = cls._parse_file(
            file=path,
            strptime_format=strptime_format,
            timezone=timezone,
            begin_timestamp=next_begin,
            valid_files=accepted,
            rejected_files=unparsed,
        )
        if not parsed:
            continue
        # Chain the files: the next one begins where this one ends.
        next_begin += accepted[-1].duration

    if unparsed:
        unparsed_names = "\n\t".join(f.name for f in unparsed)
        glc.logger.warning(
            f"The following files couldn't be parsed:\n\t{unparsed_names}",
        )

    if not accepted:
        msg = f"No valid file found in {folder}"
        raise FileNotFoundError(msg)

    return cls.from_files(
        files=accepted,
        begin=begin,
        end=end,
        mode=mode,
        overlap=overlap,
        data_duration=data_duration,
        name=name,
        **kwargs,
    )
595
@classmethod
def _parse_file(
    cls: type[Self],
    file: Path,
    strptime_format: str | None,
    timezone: str | pytz.timezone | None,
    begin_timestamp: Timestamp,
    valid_files: list[TFile],
    rejected_files: list[Path],
) -> bool:
    """Try to instantiate a ``cls.file_cls`` from a path and record the outcome.

    Parameters
    ----------
    file: Path
        Path of the candidate file.
    strptime_format: str | None
        The strptime format passed to ``cls.file_cls``.
        If None, the file is instantiated with ``begin_timestamp``
        as its begin instead.
    timezone: str | pytz.timezone | None
        Timezone passed to ``cls.file_cls``.
    begin_timestamp: Timestamp
        Begin of the file, only used if ``strptime_format`` is None.
    valid_files: list[TFile]
        List to which the file object is appended on success.
    rejected_files: list[Path]
        List to which the path is appended if the instantiation
        raises a ``ValueError`` or a ``LibsndfileError``.

    Returns
    -------
    bool:
        True if the file has been appended to ``valid_files``.
        False if its extension is not supported (the path is then
        recorded in neither list) or if it has been rejected.

    """
    if file.suffix.lower() not in cls.file_cls.supported_extensions:
        return False

    # Only the constructor call is guarded: these are the errors caught
    # and turned into a rejection.
    try:
        if strptime_format is None:
            f = cls.file_cls(file, begin=begin_timestamp, timezone=timezone)
        else:
            f = cls.file_cls(
                file,
                strptime_format=strptime_format,
                timezone=timezone,
            )
    except (ValueError, LibsndfileError):
        rejected_files.append(file)
        return False

    valid_files.append(f)
    return True

Project-OSmOSE / OSEkit / 21402000858

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous