• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Project-OSmOSE / OSEkit / 19471577242

18 Nov 2025 03:30PM UTC coverage: 96.898% (+4.3%) from 92.572%
19471577242

Pull #281

github

web-flow
Merge 1b5cc878a into f45ff2fad
Pull Request #281: Job rework

567 of 572 new or added lines in 6 files covered. (99.13%)

22 existing lines in 3 files now uncovered.

3873 of 3997 relevant lines covered (96.9%)

0.97 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.13
/src/osekit/core_api/base_dataset.py
1
"""BaseDataset: Base class for the Dataset objects.
2

3
Datasets are collections of Data, with methods
4
that simplify repeated operations on the data.
5
"""
6

7
from __future__ import annotations
1✔
8

9
import os
1✔
10
from bisect import bisect
1✔
11
from pathlib import Path
1✔
12
from typing import TYPE_CHECKING, Generic, Literal, TypeVar
1✔
13

14
from pandas import Timedelta, Timestamp, date_range
1✔
15
from soundfile import LibsndfileError
1✔
16
from tqdm import tqdm
1✔
17

18
from osekit.config import TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED
1✔
19
from osekit.config import global_logging_context as glc
1✔
20
from osekit.core_api.base_data import BaseData
1✔
21
from osekit.core_api.base_file import BaseFile
1✔
22
from osekit.core_api.event import Event
1✔
23
from osekit.core_api.json_serializer import deserialize_json, serialize_json
1✔
24
from osekit.utils.timestamp_utils import last_window_end
1✔
25

26
if TYPE_CHECKING:
27
    import pytz
28

29
# Type variables binding a dataset to the Data/File subtypes it contains,
# so that derived datasets (e.g. audio datasets) keep precise typing.
TData = TypeVar("TData", bound=BaseData)  # Data type held in BaseDataset.data.
TFile = TypeVar("TFile", bound=BaseFile)  # File type referred to by the data.
31

32

33
class BaseDataset(Generic[TData, TFile], Event):
    """Base class for Dataset objects.

    Datasets are collections of Data, with methods
    that simplify repeated operations on the data.
    """

    def __init__(
        self,
        data: list[TData],
        name: str | None = None,
        suffix: str = "",
        folder: Path | None = None,
    ) -> None:
        """Instantiate a Dataset object from the Data objects.

        Parameters
        ----------
        data: list[TData]
            Data objects that compose the dataset.
        name: str | None
            Name of the dataset.
            If None, a default name derived from the dataset begin timestamp
            is used (see the base_name property).
        suffix: str
            Suffix appended to the dataset name (see the suffix property).
        folder: Path | None
            Folder in which the dataset files are located or to be written.
            If None, it is inferred from the dataset files.

        """
        self.data = data
        self._name = name
        # Remember whether an explicit name was given at instantiation time.
        self._has_default_name = name is None
        self._suffix = suffix
        self._folder = folder

    def __str__(self) -> str:
        """Overwrite __str__."""
        return self.name

    def __eq__(self, other: object) -> bool:
        """Overwrite __eq__.

        Two datasets are equal if they contain equal data, regardless of order.
        """
        if not isinstance(other, BaseDataset):
            # Delegate to the other operand's __eq__ instead of raising
            # AttributeError on other.data.
            return NotImplemented
        return sorted(self.data, key=lambda e: (e.begin, e.end)) == sorted(
            other.data,
            key=lambda e: (e.begin, e.end),
        )

    @property
    def base_name(self) -> str:
        """Name of the dataset without suffix.

        Defaults to the formatted begin timestamp if no name was provided.
        """
        return (
            self.begin.strftime(TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED)
            if self._name is None
            else self._name
        )

    @base_name.setter
    def base_name(self, name: str) -> None:
        self._name = name

    @property
    def name(self) -> str:
        """Name of the dataset with suffix."""
        return self.base_name if not self.suffix else f"{self.base_name}_{self.suffix}"

    @name.setter
    def name(self, name: str | None) -> None:
        # NOTE(review): this does not update _has_default_name — presumably
        # intentional (the flag reflects the name given at creation); confirm.
        self._name = name

    @property
    def suffix(self) -> str:
        """Suffix that is applied to the name of the dataset.

        This is used by the public API, for suffixing multiple core_api datasets
        that are created simultaneously and share the same name with their
        specific type, e.g. _audio or _spectro.
        """
        return self._suffix

    @suffix.setter
    def suffix(self, suffix: str | None) -> None:
        self._suffix = suffix

    @property
    def has_default_name(self) -> bool:
        """Return True if the dataset has a default name, False if it has a given name."""
        return self._has_default_name

    @property
    def begin(self) -> Timestamp:
        """Begin of the first data object."""
        return min(data.begin for data in self.data)

    @property
    def end(self) -> Timestamp:
        """End of the last data object."""
        return max(data.end for data in self.data)

    @property
    def files(self) -> set[TFile]:
        """All files referred to by the Dataset."""
        return {file for data in self.data for file in data.files}

    @property
    def folder(self) -> Path:
        """Folder in which the dataset files are located or to be written.

        Falls back to the parent folder of one of the dataset files
        (None if the dataset has no file).
        """
        return (
            self._folder
            if self._folder is not None
            else next(iter(file.path.parent for file in self.files), None)
        )

    @folder.setter
    def folder(self, folder: Path) -> None:
        """Set the folder in which the dataset files might be written.

        Parameters
        ----------
        folder: Path
            The folder in which the dataset files might be written.

        """
        self._folder = folder

    def move_files(self, folder: Path) -> None:
        """Move the dataset files to the destination folder.

        Parameters
        ----------
        folder: Path
            Destination folder in which the dataset files will be moved.

        """
        for file in tqdm(self.files, disable=os.environ.get("DISABLE_TQDM", "")):
            file.move(folder)
        self._folder = folder

    @property
    def data_duration(self) -> Timedelta:
        """Return the most frequent duration among durations of the data of this dataset, rounded to the nearest second."""
        data_durations = [
            Timedelta(data.duration).round(freq="1s") for data in self.data
        ]
        return max(set(data_durations), key=data_durations.count)

    def write(
        self,
        folder: Path,
        link: bool = False,
        first: int = 0,
        last: int | None = None,
    ) -> None:
        """Write all data objects in the specified folder.

        Parameters
        ----------
        folder: Path
            Folder in which to write the data.
        link: bool
            If True, the Data will be bound to the written file.
            Its items will be replaced with a single item, which will match the whole
            new File.
        first: int
            Index of the first data object to write.
        last: int | None
            Index after the last data object to write.

        """
        last = len(self.data) if last is None else last
        for data in tqdm(
            self.data[first:last],
            disable=os.environ.get("DISABLE_TQDM", ""),
        ):
            data.write(folder=folder, link=link)

    def to_dict(self) -> dict:
        """Serialize a BaseDataset to a dictionary.

        Returns
        -------
        dict:
            The serialized dictionary representing the BaseDataset.

        """
        return {
            "data": {str(d): d.to_dict() for d in self.data},
            "name": self._name,
            "suffix": self.suffix,
            "folder": str(self.folder),
        }

    @classmethod
    def from_dict(cls, dictionary: dict) -> BaseDataset:
        """Deserialize a BaseDataset from a dictionary.

        Parameters
        ----------
        dictionary: dict
            The serialized dictionary representing the BaseDataset.

        Returns
        -------
        BaseDataset
            The deserialized BaseDataset.

        """
        return cls(
            [BaseData.from_dict(d) for d in dictionary["data"].values()],
            name=dictionary["name"],
            suffix=dictionary["suffix"],
            folder=Path(dictionary["folder"]),
        )

    def write_json(self, folder: Path) -> None:
        """Write a serialized BaseDataset to a JSON file."""
        serialize_json(folder / f"{self.name}.json", self.to_dict())

    @classmethod
    def from_json(cls, file: Path) -> BaseDataset:
        """Deserialize a BaseDataset from a JSON file.

        Parameters
        ----------
        file: Path
            Path to the serialized JSON file representing the BaseDataset.

        Returns
        -------
        BaseDataset
            The deserialized BaseDataset.

        """
        return cls.from_dict(deserialize_json(file))

    @classmethod
    def from_files(  # noqa: PLR0913
        cls,
        files: list[TFile],
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
        data_duration: Timedelta | None = None,
        overlap: float = 0.0,
        name: str | None = None,
    ) -> BaseDataset:
        """Return a base BaseDataset object from a list of Files.

        Parameters
        ----------
        files: list[TFile]
            The list of files contained in the Dataset.
        begin: Timestamp | None
            Begin of the first data object.
            Defaulted to the begin of the first file.
        end: Timestamp | None
            End of the last data object.
            Defaulted to the end of the last file.
        mode: Literal["files", "timedelta_total", "timedelta_file"]
            Mode of creation of the dataset data from the original files.
            "files": one data will be created for each file.
            "timedelta_total": data objects of duration equal to data_duration will
            be created from the begin timestamp to the end timestamp.
            "timedelta_file": data objects of duration equal to data_duration will
            be created from the beginning of the first file that the begin timestamp is into, until it would resume
            in a data beginning between two files. Then, the next data object will be created from the
            beginning of the next original file and so on.
        data_duration: Timedelta | None
            Duration of the data objects.
            If mode is set to "files", this parameter has no effect.
            If provided, data will be evenly distributed between begin and end.
            Else, one data object will cover the whole time period.
        overlap: float
            Overlap percentage between consecutive data.
        name: str|None
            Name of the dataset.

        Returns
        -------
        BaseDataset[TItem, TFile]:
        The DataBase object.

        """
        if mode == "files":
            data_base = [BaseData.from_files([f]) for f in files]
            data_base = BaseData.remove_overlaps(data_base)
            return cls(data=data_base, name=name)

        if not begin:
            begin = min(file.begin for file in files)
        if not end:
            end = max(file.end for file in files)
        if data_duration:
            data_base = (
                cls._get_base_data_from_files_timedelta_total(
                    begin=begin,
                    end=end,
                    data_duration=data_duration,
                    files=files,
                    overlap=overlap,
                )
                if mode == "timedelta_total"
                else cls._get_base_data_from_files_timedelta_file(
                    begin=begin,
                    end=end,
                    data_duration=data_duration,
                    files=files,
                    overlap=overlap,
                )
            )
        else:
            # No duration given: one single data object spanning [begin, end].
            data_base = [BaseData.from_files(files, begin=begin, end=end)]
        return cls(data_base, name=name)

    @classmethod
    def _get_base_data_from_files_timedelta_total(
        cls,
        begin: Timestamp,
        end: Timestamp,
        data_duration: Timedelta,
        files: list[TFile],
        overlap: float = 0,
    ) -> list[BaseData]:
        """Create data objects of fixed duration evenly spread between begin and end.

        Raises
        ------
        ValueError
            If overlap is not within [0, 1).

        """
        if not 0 <= overlap < 1:
            msg = f"Overlap ({overlap}) must be between 0 and 1."
            raise ValueError(msg)

        active_file_index = 0
        output = []
        files = sorted(files, key=lambda f: f.begin)
        # Hop between consecutive data begins; equals data_duration when overlap is 0.
        freq = data_duration * (1 - overlap)

        for data_begin in tqdm(
            date_range(begin, end, freq=freq, inclusive="left"),
            disable=os.environ.get("DISABLE_TQDM", ""),
        ):
            data_end = Timestamp(data_begin + data_duration)
            # Advance past files that end before this data begins; since both
            # files and data begins are sorted, this sweep is O(n) overall.
            while (
                active_file_index < len(files)
                and files[active_file_index].end < data_begin
            ):
                active_file_index += 1
            last_active_file_index = active_file_index
            while (
                last_active_file_index < len(files)
                and files[last_active_file_index].begin < data_end
            ):
                last_active_file_index += 1
            output.append(
                BaseData.from_files(
                    files[active_file_index:last_active_file_index],
                    data_begin,
                    data_end,
                ),
            )

        return output

    @classmethod
    def _get_base_data_from_files_timedelta_file(
        cls,
        begin: Timestamp,
        end: Timestamp,
        data_duration: Timedelta,
        files: list[TFile],
        overlap: float = 0,
    ) -> list[BaseData]:
        """Create data objects of fixed duration aligned on the file beginnings.

        Raises
        ------
        ValueError
            If overlap is not within [0, 1).

        """
        if not 0 <= overlap < 1:
            msg = f"Overlap ({overlap}) must be between 0 and 1."
            raise ValueError(msg)

        files = sorted(files, key=lambda file: file.begin)
        # First file that might contain begin, and first file past end.
        first = max(0, bisect(files, begin, key=lambda f: f.begin) - 1)
        last = bisect(files, end, key=lambda f: f.begin)

        data_hop = data_duration * (1 - overlap)

        output = []
        files_chunk = []
        # start=first keeps idx aligned with the full files list, so that
        # files[idx + 1:] below really starts right after the current file
        # (enumerate over the bare slice would restart at 0 and revisit
        # earlier files whenever first > 0).
        for idx, file in tqdm(
            enumerate(files[first:last], start=first),
            disable=os.environ.get("DISABLE_TQDM", ""),
        ):
            if file in files_chunk:
                # Already absorbed in the previous chunk.
                continue
            files_chunk = [file]

            # Grow the chunk while the next file begins before the end of the
            # last full window that fits in the current chunk.
            for next_file in files[idx + 1 :]:
                upper_data_limit = last_window_end(
                    begin=file.begin,
                    end=files_chunk[-1].end,
                    window_hop=data_hop,
                    window_duration=data_duration,
                )
                if upper_data_limit < next_file.begin:
                    break
                files_chunk.append(next_file)

            # NOTE(review): the full files list is passed here; BaseData.from_files
            # presumably trims to [data_begin, data_end] — confirm files_chunk
            # would not suffice.
            output.extend(
                BaseData.from_files(files, data_begin, data_begin + data_duration)
                for data_begin in date_range(
                    file.begin,
                    files_chunk[-1].end,
                    freq=data_hop,
                    inclusive="left",
                )
            )

        return output

    @classmethod
    def from_folder(  # noqa: PLR0913
        cls,
        folder: Path,
        strptime_format: str,
        file_class: type[TFile] = BaseFile,
        supported_file_extensions: list[str] | None = None,
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        timezone: str | pytz.timezone | None = None,
        mode: Literal["files", "timedelta_total", "timedelta_file"] = "timedelta_total",
        overlap: float = 0.0,
        data_duration: Timedelta | None = None,
        name: str | None = None,
    ) -> BaseDataset:
        """Return a BaseDataset from a folder containing the base files.

        Parameters
        ----------
        folder: Path
            The folder containing the files.
        strptime_format: str
            The strptime format of the timestamps in the file names.
        file_class: type[Tfile]
            Derived type of BaseFile used to instantiate the dataset.
        supported_file_extensions: list[str]
            List of supported file extensions for parsing TFiles.
        begin: Timestamp | None
            The begin of the dataset.
            Defaulted to the begin of the first file.
        end: Timestamp | None
            The end of the dataset.
            Defaulted to the end of the last file.
        timezone: str | pytz.timezone | None
            The timezone in which the file should be localized.
            If None, the file begin/end will be tz-naive.
            If different from a timezone parsed from the filename, the timestamps'
            timezone will be converted from the parsed timezone
            to the specified timezone.
        mode: Literal["files", "timedelta_total", "timedelta_file"]
            Mode of creation of the dataset data from the original files.
            "files": one data will be created for each file.
            "timedelta_total": data objects of duration equal to data_duration will
            be created from the begin timestamp to the end timestamp.
            "timedelta_file": data objects of duration equal to data_duration will
            be created from the beginning of the first file that the begin timestamp is into, until it would resume
            in a data beginning between two files. Then, the next data object will be created from the
            beginning of the next original file and so on.
        overlap: float
            Overlap percentage between consecutive data.
        data_duration: Timedelta | None
            Duration of the data objects.
            If mode is set to "files", this parameter has no effect.
            If provided, data will be evenly distributed between begin and end.
            Else, one object will cover the whole time period.
        name: str|None
            Name of the dataset.

        Returns
        -------
        BaseDataset:
            The base dataset.

        Raises
        ------
        FileNotFoundError
            If no file in the folder could be parsed into a valid TFile.

        """
        if supported_file_extensions is None:
            supported_file_extensions = []
        valid_files = []
        rejected_files = []
        for file in tqdm(folder.iterdir(), disable=os.environ.get("DISABLE_TQDM", "")):
            if file.suffix.lower() not in supported_file_extensions:
                continue
            try:
                f = file_class(file, strptime_format=strptime_format, timezone=timezone)
                valid_files.append(f)
            except (ValueError, LibsndfileError):
                # Unparsable names or unreadable audio: collected and reported below.
                rejected_files.append(file)

        if rejected_files:
            rejected_names = "\n\t".join(f.name for f in rejected_files)
            glc.logger.warning(
                f"The following files couldn't be parsed:\n\t{rejected_names}",
            )

        if not valid_files:
            raise FileNotFoundError(f"No valid file found in {folder}.")

        # Use cls rather than BaseDataset so that subclasses build instances
        # of their own type.
        return cls.from_files(
            files=valid_files,
            begin=begin,
            end=end,
            mode=mode,
            overlap=overlap,
            data_duration=data_duration,
            name=name,
        )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc