
OpenCOMPES / sed / 12876831595

20 Jan 2025 10:55PM UTC coverage: 92.174% (+0.4%) from 91.801%
Pull Request #437: Upgrade to V1

Merge pull request #555 from OpenCOMPES/config_renaming

use user platformdir also for user config

2235 of 2372 new or added lines in 53 files covered. (94.22%)

4 existing lines in 1 file now uncovered.

7703 of 8357 relevant lines covered (92.17%)

0.92 hits per line

Source File: /src/sed/loader/sxp/loader.py (83.44% covered)
# pylint: disable=duplicate-code
"""
This module implements the SXP data loader.
This loader currently supports the SXP momentum microscope instrument.
The raw hdf5 data is combined and saved into buffer files and loaded as a dask dataframe.
The dataframe is an amalgamation of all h5 files for a combination of runs, where the NaNs are
automatically forward filled across different files.
This can then be saved as a parquet for out-of-sed processing and reread back to access other
sed functionality.
Most of the structure is identical to the FLASH loader.
"""
from __future__ import annotations

import time
from collections.abc import Sequence
from functools import reduce
from pathlib import Path

import dask.dataframe as dd
import h5py
import numpy as np
import pyarrow.parquet as pq
from joblib import delayed
from joblib import Parallel
from natsort import natsorted
from pandas import DataFrame
from pandas import MultiIndex
from pandas import Series

from sed.core import dfops
from sed.core.logging import set_verbosity
from sed.core.logging import setup_logging
from sed.loader.base.loader import BaseLoader
from sed.loader.utils import parse_h5_keys
from sed.loader.utils import split_dld_time_from_sector_id

# Configure logging
logger = setup_logging("sxp_loader")

class SXPLoader(BaseLoader):
    """
    The class generates multiindexed multidimensional pandas dataframes from the new SXP
    dataformat resolved by both macro and microbunches alongside electrons.
    Only the read_dataframe (inherited and implemented) method is accessed by other modules.

    Args:
        config (dict): Config dictionary.
        verbose (bool, optional): Option to print out diagnostic information.
    """

    __name__ = "sxp"

    supported_file_types = ["h5"]

    def __init__(self, config: dict, verbose: bool = True) -> None:
        super().__init__(config=config, verbose=verbose)

        set_verbosity(logger, self._verbose)

        self.multi_index = ["trainId", "pulseId", "electronId"]
        self.index_per_electron: MultiIndex = None
        self.index_per_pulse: MultiIndex = None
        self.failed_files_error: list[str] = []
        self.array_indices: list[list[slice]] = None
        self.raw_dir: str = None
        self.processed_dir: str = None

    @property
    def verbose(self) -> bool:
        """Accessor to the verbosity flag.

        Returns:
            bool: Verbosity flag.
        """
        return self._verbose

    @verbose.setter
    def verbose(self, verbose: bool):
        """Setter for the verbosity.

        Args:
            verbose (bool): Option to turn on verbose output. Sets loglevel to INFO.
        """
        self._verbose = verbose
        set_verbosity(logger, self._verbose)

    def _initialize_dirs(self):
        """
        Initializes the paths based on the configuration.

        Raises:
            ValueError: If required values are missing from the configuration.
            FileNotFoundError: If the raw data directories are not found.
        """
        # Parses to locate the raw beamtime directory from config file
        if (
            "paths" in self._config["core"]
            and self._config["core"]["paths"].get("raw", "")
            and self._config["core"]["paths"].get("processed", "")
        ):
            data_raw_dir = [
                Path(self._config["core"]["paths"].get("raw", "")),
            ]
            data_parquet_dir = Path(
                self._config["core"]["paths"].get("processed", ""),
            )

        else:
            try:
                beamtime_id = self._config["core"]["beamtime_id"]
                year = self._config["core"]["year"]
            except KeyError as exc:
                raise ValueError(
                    "The beamtime_id and year are required.",
                ) from exc

            beamtime_dir = Path(
                self._config["core"]["beamtime_dir"][self._config["core"]["beamline"]],
            )
            beamtime_dir = beamtime_dir.joinpath(f"{year}/{beamtime_id}/")

            if not beamtime_dir.joinpath("raw").is_dir():
                raise FileNotFoundError("Raw data directory not found.")

            data_raw_dir = [beamtime_dir.joinpath("raw")]

            parquet_path = "processed/parquet"
            data_parquet_dir = beamtime_dir.joinpath(parquet_path)

        data_parquet_dir.mkdir(parents=True, exist_ok=True)

        self.raw_dir = data_raw_dir
        self.processed_dir = data_parquet_dir

    def get_files_from_run_id(
1✔
137
        self,
138
        run_id: str,
139
        folders: str | Sequence[str] = None,
140
        extension: str = "h5",
141
        **kwds,
142
    ) -> list[str]:
143
        """Returns a list of filenames for a given run located in the specified directory
144
        for the specified data acquisition (daq).
145

146
        Args:
147
            run_id (str): The run identifier to locate.
148
            folders (str | Sequence[str], optional): The directory(ies) where the raw
149
                data is located. Defaults to config["core"]["base_folder"].
150
            extension (str, optional): The file extension. Defaults to "h5".
151
            kwds: Keyword arguments:
152
                - daq (str): The data acquisition identifier.
153

154
        Returns:
155
            list[str]: A list of path strings representing the collected file names.
156

157
        Raises:
158
            FileNotFoundError: If no files are found for the given run in the directory.
159
        """
160
        # Define the stream name prefixes based on the data acquisition identifier
161
        stream_name_prefixes = self._config["core"]["stream_name_prefixes"]
1✔
162
        stream_name_postfixes = self._config["core"].get("stream_name_postfixes", {})
1✔
163

164
        if isinstance(run_id, (int, np.integer)):
1✔
165
            run_id = str(run_id).zfill(4)
×
166

167
        if folders is None:
1✔
168
            folders = self._config["core"]["base_folder"]
×
169

170
        if isinstance(folders, str):
1✔
171
            folders = [folders]
×
172

173
        daq = kwds.pop("daq", self._config.get("dataframe", {}).get("daq"))
1✔
174

175
        if len(kwds) > 0:
1✔
NEW
176
            raise TypeError(
×
177
                f"get_files_from_run_id() got unexpected keyword arguments {kwds.keys()}.",
178
            )
179

180
        stream_name_postfix = stream_name_postfixes.get(daq, "")
1✔
181
        # Generate the file patterns to search for in the directory
182
        file_pattern = f"**/{stream_name_prefixes[daq]}{run_id}{stream_name_postfix}*." + extension
1✔
183

184
        files: list[Path] = []
1✔
185
        # Use pathlib to search for matching files in each directory
186
        for folder in folders:
1✔
187
            files.extend(
1✔
188
                natsorted(
189
                    Path(folder).glob(file_pattern),
190
                    key=lambda filename: str(filename).rsplit("_", maxsplit=1)[-1],
191
                ),
192
            )
193

194
        # Check if any files are found
195
        if not files:
1✔
196
            raise FileNotFoundError(
×
197
                f"No files found for run {run_id} in directory {str(folders)}",
198
            )
199

200
        # Return the list of found files
201
        return [str(file.resolve()) for file in files]
1✔
202

    @property
    def available_channels(self) -> list:
        """Returns the channel names that are available for use,
        excluding pulseId and trainId, defined by the json file"""
        available_channels = list(self._config["dataframe"]["channels"].keys())
        available_channels.remove("pulseId")
        available_channels.remove("trainId")
        return available_channels

    def get_channels(self, formats: str | list[str] = "", index: bool = False) -> list[str]:
        """
        Returns a list of channels associated with the specified format(s).

        Args:
            formats (str | list[str]): The desired format(s)
                ('per_pulse', 'per_electron', 'per_train', 'all').
            index (bool): If True, includes channels from the multi_index.

        Returns:
            List[str]: A list of channels with the specified format(s).
        """
        # If 'formats' is a single string, convert it to a list for uniform processing.
        if isinstance(formats, str):
            formats = [formats]

        # If 'formats' is a string "all", gather all possible formats.
        if formats == ["all"]:
            channels = self.get_channels(["per_pulse", "per_train", "per_electron"], index)
            return channels

        channels = []
        for format_ in formats:
            # Gather channels based on the specified format(s).
            channels.extend(
                key
                for key in self.available_channels
                if self._config["dataframe"]["channels"][key]["format"] == format_
                and key != "dldAux"
            )
            # Include 'dldAuxChannels' if the format is 'per_pulse'.
            if format_ == "per_pulse" and "dldAux" in self._config["dataframe"]["channels"]:
                channels.extend(
                    self._config["dataframe"]["channels"]["dldAux"]["dldAuxChannels"].keys(),
                )

        # Include channels from multi_index if 'index' is True.
        if index:
            channels.extend(self.multi_index)

        return channels

    def reset_multi_index(self) -> None:
        """Resets the index per pulse and electron"""
        self.index_per_electron = None
        self.index_per_pulse = None
        self.array_indices = None

    def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
        """
        Creates an index per electron using pulseId for usage with the electron
            resolved pandas DataFrame.

        Args:
            h5_file (h5py.File): The HDF5 file object.

        Notes:
            - This method relies on the 'pulseId' channel to determine
                the macrobunch IDs.
            - It creates a MultiIndex with trainId, pulseId, and electronId
                as the index levels.
        """

        # relative macrobunch IDs obtained from the trainId channel
        train_id, mab_array = self.create_numpy_array_per_channel(
            h5_file,
            "trainId",
        )
        # Internal microbunch IDs obtained from the pulseId channel
        train_id, mib_array = self.create_numpy_array_per_channel(
            h5_file,
            "pulseId",
        )

        # Chopping data into trains
        macrobunch_index = []
        microbunch_ids = []
        macrobunch_indices = []
        for i in train_id.index:
            # removing broken trailing hit copies
            num_trains = self._config["dataframe"].get("num_trains", 0)
            num_pulses = self._config["dataframe"].get("num_pulses", 0)
            if num_trains:
                try:
                    num_valid_hits = np.where(np.diff(mib_array[i].astype(np.int32)) < 0)[0][
                        num_trains - 1
                    ]
                    mab_array[i, num_valid_hits:] = 0
                    mib_array[i, num_valid_hits:] = 0
                except IndexError:
                    pass
            train_ends = np.where(np.diff(mib_array[i].astype(np.int32)) < -1)[0]
            indices = []
            index = 0
            for train, train_end in enumerate(train_ends):
                macrobunch_index.append(train_id[i] + np.uint(train))
                if num_pulses:
                    microbunch_ids.append(mib_array[i, index:train_end] % num_pulses)
                else:
                    microbunch_ids.append(mib_array[i, index:train_end])
                indices.append(slice(index, train_end))
                index = train_end + 1
            macrobunch_indices.append(indices)
        self.array_indices = macrobunch_indices
        # Create a series with the macrobunches as index and
        # microbunches as values
        macrobunches = (
            Series(
                (microbunch_ids[i] for i in range(len(macrobunch_index))),
                name="pulseId",
                index=macrobunch_index,
            )
            - self._config["dataframe"]["ubid_offset"]
        )

        # Explode dataframe to get all microbunch values per macrobunch,
        # remove NaN values and convert to type int
        microbunches = macrobunches.explode().dropna().astype(int)

        # Create temporary index values
        index_temp = MultiIndex.from_arrays(
            (microbunches.index, microbunches.values),
            names=["trainId", "pulseId"],
        )

        # Calculate the electron counts per pulseId; unique preserves the order of appearance
        electron_counts = index_temp.value_counts()[index_temp.unique()].values

        # Series object for indexing with electrons
        electrons = (
            Series(
                [np.arange(electron_counts[i]) for i in range(electron_counts.size)],
            )
            .explode()
            .astype(int)
        )

        # Create a pandas MultiIndex using the exploded datasets
        self.index_per_electron = MultiIndex.from_arrays(
            (microbunches.index, microbunches.values, electrons),
            names=self.multi_index,
        )

    def create_multi_index_per_pulse(
        self,
        train_id: Series,
        np_array: np.ndarray,
    ) -> None:
        """
        Creates an index per pulse using a pulse resolved channel's macrobunch ID, for usage with
        the pulse resolved pandas DataFrame.

        Args:
            train_id (Series): The train ID Series.
            np_array (np.ndarray): The numpy array containing the pulse resolved data.

        Notes:
            - This method creates a MultiIndex with trainId and pulseId as the index levels.
        """

        # Create a pandas MultiIndex, useful for comparing electron and
        # pulse resolved dataframes
        self.index_per_pulse = MultiIndex.from_product(
            (train_id, np.arange(0, np_array.shape[1])),
            names=["trainId", "pulseId"],
        )

    def create_numpy_array_per_channel(
        self,
        h5_file: h5py.File,
        channel: str,
    ) -> tuple[Series, np.ndarray]:
        """
        Returns a numpy array for a given channel name for a given file.

        Args:
            h5_file (h5py.File): The h5py file object.
            channel (str): The name of the channel.

        Returns:
            tuple[Series, np.ndarray]: A tuple containing the train ID Series and the numpy array
            for the channel's data.

        """
        # Get the data from the necessary h5 file and channel
        dataset = h5_file[self._config["dataframe"]["channels"][channel]["dataset_key"]]
        index = h5_file[self._config["dataframe"]["channels"][channel]["index_key"]]

        channel_dict = self._config["dataframe"]["channels"][channel]  # channel parameters

        train_id = Series(index, name="trainId")  # macrobunch

        # unpacks the data into np.ndarray
        np_array = dataset[()]
        if len(np_array.shape) == 2 and self._config["dataframe"]["channels"][channel].get(
            "max_hits",
            0,
        ):
            np_array = np_array[:, : self._config["dataframe"]["channels"][channel]["max_hits"]]

        # Use predefined axis and slice from the json file
        # to choose correct dimension for necessary channel
        if "slice" in channel_dict:
            np_array = np.take(
                np_array,
                channel_dict["slice"],
                axis=1,
            )

        if "scale" in channel_dict:
            np_array = np_array / float(channel_dict["scale"])

        return train_id, np_array

    def create_dataframe_per_electron(
        self,
        np_array: np.ndarray,
        channel: str,
    ) -> DataFrame:
        """
        Returns a pandas DataFrame for a given channel name of type [per electron].

        Args:
            np_array (np.ndarray): The numpy array containing the channel data.
            channel (str): The name of the channel.

        Returns:
            DataFrame: The pandas DataFrame for the channel's data.

        Notes:
            The microbunch resolved data is exploded and converted to a DataFrame. The MultiIndex
            is set, and the NaN values are dropped, alongside the pulseId = 0 (meaningless).

        """
        if self.array_indices is None or len(self.array_indices) != np_array.shape[0]:
            raise RuntimeError(
                "macrobunch_indices not set correctly, internal inconsistency detected.",
            )
        train_data = []
        for i, _ in enumerate(self.array_indices):
            for indices in self.array_indices[i]:
                train_data.append(np_array[i, indices])
        return (
            Series((train for train in train_data), name=channel)
            .explode()
            .dropna()
            .to_frame()
            .set_index(self.index_per_electron)
            .drop(
                index=np.arange(-self._config["dataframe"]["ubid_offset"], 0),
                level=1,
                errors="ignore",
            )
        )

    def create_dataframe_per_pulse(
        self,
        np_array: np.ndarray,
        train_id: Series,
        channel: str,
        channel_dict: dict,
    ) -> DataFrame:
        """
        Returns a pandas DataFrame for a given channel name of type [per pulse].

        Args:
            np_array (np.ndarray): The numpy array containing the channel data.
            train_id (Series): The train ID Series.
            channel (str): The name of the channel.
            channel_dict (dict): The dictionary containing channel parameters.

        Returns:
            DataFrame: The pandas DataFrame for the channel's data.

        Notes:
            - For auxiliary channels, the macrobunch resolved data is repeated 499 times to be
              compared to electron resolved data for each auxiliary channel. The data is then
              converted to a multicolumn DataFrame.
            - For all other pulse resolved channels, the macrobunch resolved data is exploded
              to a DataFrame and the MultiIndex is set.

        """

        # Special case for auxiliary channels
        if channel == "dldAux":
            # Checks the channel dictionary for correct slices and creates a multicolumn DataFrame
            data_frames = (
                Series(
                    (np_array[i, value] for i in train_id.index),
                    name=key,
                    index=train_id,
                ).to_frame()
                for key, value in channel_dict["dldAuxChannels"].items()
            )

            # Multiindex set and combined dataframe returned
            data = reduce(DataFrame.combine_first, data_frames)

        # For all other pulse resolved channels
        else:
            # Macrobunch resolved data is exploded to a DataFrame and the MultiIndex is set

            # Creates the index_per_pulse for the given channel
            self.create_multi_index_per_pulse(train_id, np_array)
            data = (
                Series((np_array[i] for i in train_id.index), name=channel)
                .explode()
                .to_frame()
                .set_index(self.index_per_pulse)
            )

        return data

    def create_dataframe_per_train(
        self,
        np_array: np.ndarray,
        train_id: Series,
        channel: str,
    ) -> DataFrame:
        """
        Returns a pandas DataFrame for a given channel name of type [per train].

        Args:
            np_array (np.ndarray): The numpy array containing the channel data.
            train_id (Series): The train ID Series.
            channel (str): The name of the channel.

        Returns:
            DataFrame: The pandas DataFrame for the channel's data.
        """
        return (
            Series((np_array[i] for i in train_id.index), name=channel)
            .to_frame()
            .set_index(train_id)
        )

    def create_dataframe_per_channel(
        self,
        file_path: Path,
        channel: str,
    ) -> Series | DataFrame:
        """
        Returns a pandas DataFrame for a given channel name from a given file.

        This method takes the path `file_path` to the HDF5 file and a channel name `channel`, and
        returns a pandas DataFrame containing the data for that channel from the file. The format
        of the DataFrame depends on the channel's format specified in the configuration.

        Args:
            file_path (Path): The path to the main HDF5 file.
            channel (str): The name of the channel.

        Returns:
            Series | DataFrame: A pandas Series or DataFrame representing the channel's data.

        Raises:
            ValueError: If the channel has an undefined format.

        """
        channel_dict = self._config["dataframe"]["channels"][channel]  # channel parameters
        main_daq = self._config["dataframe"]["daq"]
        channel_daq = self._config["dataframe"]["channels"][channel].get("daq", main_daq)
        # load file corresponding to daq
        h5_file = h5py.File(Path(str(file_path).replace(main_daq, channel_daq)))

        [train_id, np_array] = self.create_numpy_array_per_channel(
            h5_file,
            channel,
        )  # numpy Array created

        # If np_array is size zero, fill with NaNs
        if np_array.size == 0:
            # Fill the np_array with NaN values of the same shape as train_id
            np_array = np.full_like(train_id, np.nan, dtype=np.double)
            # Create a Series using np_array, with train_id as the index
            data = Series(
                (np_array[i] for i in train_id.index),
                name=channel,
                index=train_id,
            )

        # Electron resolved data is treated here
        if channel_dict["format"] == "per_electron":
            # If index_per_electron is None, create it for the given file
            if self.index_per_electron is None:
                self.create_multi_index_per_electron(h5_file)

            # Create a DataFrame for electron-resolved data
            data = self.create_dataframe_per_electron(
                np_array,
                channel,
            )

        # Pulse resolved data is treated here
        elif channel_dict["format"] == "per_pulse":
            # Create a DataFrame for pulse-resolved data
            data = self.create_dataframe_per_pulse(
                np_array,
                train_id,
                channel,
                channel_dict,
            )

        # Train resolved data is treated here
        elif channel_dict["format"] == "per_train":
            # Create a DataFrame for train-resolved data
            data = self.create_dataframe_per_train(np_array, train_id, channel)

        else:
            raise ValueError(
                channel
                + " has an undefined format. Available formats are \
                per_pulse, per_electron and per_train",
            )

        return data

    def concatenate_channels(
        self,
        file_path: Path,
    ) -> DataFrame:
        """
        Concatenates the channels from the provided HDF5 file into a pandas DataFrame.

        This method takes the path `file_path` to an HDF5 file and concatenates the channels
        present in the file into a single pandas DataFrame. The concatenation is performed based
        on the available channels specified in the configuration.

        Args:
            file_path (Path): The path to the main HDF5 file.

        Returns:
            DataFrame: A concatenated pandas DataFrame containing the channels.

        Raises:
            ValueError: If the group_name for any channel does not exist in the file.

        """
        # Check if the provided dataset_keys and index_keys actually exist in the file
        for channel in self._config["dataframe"]["channels"]:
            dataset_key = self._config["dataframe"]["channels"][channel]["dataset_key"]
            daq = self._config["dataframe"]["channels"][channel].get("daq", "DA03")
            # load file corresponding to daq
            h5_file = h5py.File(Path(str(file_path).replace("DA03", daq)))
            all_keys = parse_h5_keys(h5_file)  # Parses all channels present
            if dataset_key not in all_keys:
                raise ValueError(
                    f"The dataset_key for channel {channel} does not exist.",
                )
            index_key = self._config["dataframe"]["channels"][channel]["index_key"]
            if index_key not in all_keys:
                raise ValueError(
                    f"The index_key for channel {channel} does not exist.",
                )

        # Create a generator expression to generate data frames for each channel
        data_frames = (
            self.create_dataframe_per_channel(file_path, each) for each in self.available_channels
        )

        # Use the reduce function to join the data frames into a single DataFrame
        return reduce(
            lambda left, right: left.join(right, how="outer"),
            data_frames,
        )

    def create_dataframe_per_file(
        self,
        file_path: Path,
    ) -> DataFrame:
        """
        Create pandas DataFrames for the given file.

        This method loads an HDF5 file specified by `file_path` and constructs a pandas DataFrame
        from the datasets within the file. The order of datasets in the DataFrames is the opposite
        of the order specified by channel names.

        Args:
            file_path (Path): Path to the input HDF5 file.

        Returns:
            DataFrame: pandas DataFrame

        """
        # Loads h5 file and creates a dataframe
        self.reset_multi_index()  # Reset MultiIndexes for next file
        df = self.concatenate_channels(file_path)
        df = df.dropna(subset=self._config["dataframe"]["columns"].get("tof", "dldTimeSteps"))
        # correct the 3 bit shift which encodes the detector ID in the 8s time
        if self._config["dataframe"].get("split_sector_id_from_dld_time", False):
            df, _ = split_dld_time_from_sector_id(df, config=self._config)
        return df

    def create_buffer_file(self, h5_path: Path, parquet_path: Path) -> bool | Exception:
        """
        Converts an HDF5 file to Parquet format to create a buffer file.

        This method uses `create_dataframe_per_file` method to create dataframes from individual
        files within an HDF5 file. The resulting dataframe is then saved to a Parquet file.

        Args:
            h5_path (Path): Path to the input HDF5 file.
            parquet_path (Path): Path to the output Parquet file.

        Returns:
            bool | Exception: Collected exceptions if any.

        Raises:
            ValueError: If an error occurs during the conversion process.

        """
        try:
            (
                self.create_dataframe_per_file(h5_path)
                .reset_index(level=self.multi_index)
                .to_parquet(parquet_path, index=False)
            )
        except Exception as exc:  # pylint: disable=broad-except
            self.failed_files_error.append(f"{parquet_path}: {type(exc)} {exc}")
            return exc
        return None

    def buffer_file_handler(
        self,
        data_parquet_dir: Path,
        detector: str,
        force_recreate: bool,
    ) -> tuple[list[Path], list, list]:
        """
        Handles the conversion of buffer files (h5 to parquet) and returns the filenames.

        Args:
            data_parquet_dir (Path): Directory where the parquet files will be stored.
            detector (str): Detector name.
            force_recreate (bool): Forces recreation of buffer files

        Returns:
            tuple[list[Path], list, list]: Three lists, one for
            parquet file paths, one for metadata and one for schema.

        Raises:
            FileNotFoundError: If the conversion fails for any files or no data is available.
        """

        # Create the directory for buffer parquet files
        buffer_file_dir = data_parquet_dir.joinpath("buffer")
        buffer_file_dir.mkdir(parents=True, exist_ok=True)

        # Create two separate lists for h5 and parquet file paths
        h5_filenames = [Path(file) for file in self.files]
        parquet_filenames = [
            buffer_file_dir.joinpath(Path(file).stem + detector) for file in self.files
        ]
        existing_parquet_filenames = [file for file in parquet_filenames if file.exists()]

        # Raise a value error if no data is available after the conversion
        if len(h5_filenames) == 0:
            raise ValueError("No data available. Probably failed reading all h5 files")

        if not force_recreate:
            # Check if the available channels match the schema of the existing parquet files
            parquet_schemas = [pq.read_schema(file) for file in existing_parquet_filenames]
            config_schema = set(self.get_channels(formats="all", index=True))
            if self._config["dataframe"].get("split_sector_id_from_dld_time", False):
                config_schema.add(self._config["dataframe"]["columns"].get("sector_id", False))

            for i, schema in enumerate(parquet_schemas):
                schema_set = set(schema.names)
                if schema_set != config_schema:
                    missing_in_parquet = config_schema - schema_set
                    missing_in_config = schema_set - config_schema

                    missing_in_parquet_str = (
                        f"Missing in parquet: {missing_in_parquet}" if missing_in_parquet else ""
                    )
                    missing_in_config_str = (
                        f"Missing in config: {missing_in_config}" if missing_in_config else ""
                    )

                    raise ValueError(
                        "The available channels do not match the schema of file",
                        f"{existing_parquet_filenames[i]}",
                        f"{missing_in_parquet_str}",
                        f"{missing_in_config_str}",
                        "Please check the configuration file or set force_recreate to True.",
                    )

        # Choose files to read
        files_to_read = [
            (h5_path, parquet_path)
            for h5_path, parquet_path in zip(h5_filenames, parquet_filenames)
            if force_recreate or not parquet_path.exists()
        ]

        print(f"Reading files: {len(files_to_read)} new files of {len(h5_filenames)} total.")

        # Initialize the indices for create_buffer_file conversion
        self.reset_multi_index()

        # Convert the remaining h5 files to parquet in parallel if there are any
        if len(files_to_read) > 0:
            error = Parallel(n_jobs=len(files_to_read), verbose=10)(
                delayed(self.create_buffer_file)(h5_path, parquet_path)
                for h5_path, parquet_path in files_to_read
            )
            if any(error):
                raise RuntimeError(f"Conversion failed for some files. {error}")
        # for h5_path, parquet_path in files_to_read:
        #     self.create_buffer_file(h5_path, parquet_path)

        # Raise an error if the conversion failed for any files
        # TODO: merge this and the previous error trackings
        if self.failed_files_error:
            raise FileNotFoundError(
                "Conversion failed for the following files:\n" + "\n".join(self.failed_files_error),
            )

        print("All files converted successfully!")

        # read all parquet metadata and schema
        metadata = [pq.read_metadata(file) for file in parquet_filenames]
        schema = [pq.read_schema(file) for file in parquet_filenames]

        return parquet_filenames, metadata, schema

    def parquet_handler(
        self,
        data_parquet_dir: Path,
        detector: str = "",
        parquet_path: Path = None,
        converted: bool = False,
        load_parquet: bool = False,
        save_parquet: bool = False,
        force_recreate: bool = False,
    ) -> tuple[dd.DataFrame, dd.DataFrame]:
        """
        Handles loading and saving of parquet files based on the provided parameters.

        Args:
            data_parquet_dir (Path): Directory where the parquet files are located.
            detector (str, optional): Adds an identifier for parquets to distinguish multidetector
                systems.
            parquet_path (str, optional): Path to the combined parquet file.
            converted (bool, optional): True if data is augmented by adding additional columns
                externally and saved into converted folder.
            load_parquet (bool, optional): Loads the entire parquet into the dd dataframe.
            save_parquet (bool, optional): Saves the entire dataframe into a parquet.
            force_recreate (bool, optional): Forces recreation of buffer file.
        Returns:
            tuple[dd.DataFrame, dd.DataFrame]: A tuple containing two dataframes:
            - dataframe_electron: Dataframe containing the loaded/augmented electron data.
            - dataframe_pulse: Dataframe containing the loaded/augmented timed data.

        Raises:
            FileNotFoundError: If the requested parquet file is not found.

        """

        # Construct the parquet path if not provided
        if parquet_path is None:
            parquet_name = "_".join(str(run) for run in self.runs)
            parquet_dir = data_parquet_dir.joinpath("converted") if converted else data_parquet_dir

            parquet_path = parquet_dir.joinpath(
                "run_" + parquet_name + detector,
            ).with_suffix(".parquet")

        # Check if load_parquet is flagged and then load the file if it exists
        if load_parquet:
            try:
                dataframe = dd.read_parquet(parquet_path)
            except Exception as exc:
                raise FileNotFoundError(
                    "The final parquet for this run(s) does not exist yet. "
                    "If it is in another location, please provide the path as parquet_path.",
                ) from exc

        else:
            # Obtain the parquet filenames, metadata and schema from the method
            # which handles buffer file creation/reading
            filenames, metadata, _ = self.buffer_file_handler(
                data_parquet_dir,
                detector,
                force_recreate,
            )

            # Read all parquet files into one dataframe using dask
            dataframe = dd.read_parquet(filenames, calculate_divisions=True)

            # Channels to fill NaN values
            channels: list[str] = self.get_channels(["per_pulse", "per_train"])

            overlap = min(file.num_rows for file in metadata)

            print("Filling nan values...")
            dataframe = dfops.forward_fill_lazy(
                df=dataframe,
                columns=channels,
                before=overlap,
                iterations=self._config["dataframe"].get("forward_fill_iterations", 2),
            )
            # Remove the NaNs from per_electron channels
            dataframe_electron = dataframe.dropna(
                subset=self.get_channels(["per_electron"]),
            )
            dataframe_pulse = dataframe[
                self.multi_index + self.get_channels(["per_pulse", "per_train"])
            ]
            dataframe_pulse = dataframe_pulse[
                (dataframe_pulse["electronId"] == 0) | (np.isnan(dataframe_pulse["electronId"]))
            ]

        # Save the dataframe as parquet if requested
        if save_parquet:
            dataframe_electron.compute().reset_index(drop=True).to_parquet(parquet_path)
            print("Combined parquet file saved.")

        return dataframe_electron, dataframe_pulse

    def gather_metadata(self, metadata: dict = None) -> dict:
        """Dummy function returning empty metadata dictionary for now.

        Args:
            metadata (dict, optional): Manual meta data dictionary. Auto-generated
                meta data are added to it. Defaults to None.

        Returns:
            dict: Metadata dictionary
        """
        if metadata is None:
            metadata = {}

        return metadata

    def get_count_rate(
        self,
        fids: Sequence[int] = None,  # noqa: ARG002
        **kwds,  # noqa: ARG002
    ):
        return None, None

    def get_elapsed_time(self, fids=None, **kwds):  # noqa: ARG002
        return None

    def read_dataframe(
        self,
        files: str | Sequence[str] = None,
        folders: str | Sequence[str] = None,
        runs: str | Sequence[str] = None,
        ftype: str = "h5",
        metadata: dict = None,
        collect_metadata: bool = False,
        **kwds,
    ) -> tuple[dd.DataFrame, dd.DataFrame, dict]:
        """
        Read express data from the DAQ, generating a parquet in between.

        Args:
            files (str | Sequence[str], optional): File path(s) to process. Defaults to None.
            folders (str | Sequence[str], optional): Path to folder(s) where files are stored.
                Path has priority such that if it's specified, the specified files will be ignored.
                Defaults to None.
            runs (str | Sequence[str], optional): Run identifier(s). Corresponding files will
                be located in the location provided by ``folders``. Takes precedence over
                ``files`` and ``folders``. Defaults to None.
            ftype (str, optional): The file extension type. Defaults to "h5".
            metadata (dict, optional): Additional metadata. Defaults to None.
            collect_metadata (bool, optional): Whether to collect metadata. Defaults to False.
            **kwds: Keyword arguments passed to ``parquet_handler``.

        Returns:
            tuple[dd.DataFrame, dd.DataFrame, dict]: A tuple containing the concatenated DataFrame,
            timed DataFrame, and metadata.

        Raises:
            ValueError: If neither 'runs' nor 'files'/'data_raw_dir' is provided.
            FileNotFoundError: If the conversion fails for some files or no data is available.
        """
        t0 = time.time()

        self._initialize_dirs()

        # Prepare a list of names for the runs to read and parquets to write
        if runs is not None:
            files = []
            if isinstance(runs, (str, int)):
                runs = [runs]
            for run in runs:
                run_files = self.get_files_from_run_id(
                    run_id=run,
                    folders=[str(Path(folder).resolve()) for folder in self.raw_dir],
                    extension=ftype,
                    daq=self._config["dataframe"]["daq"],
                )
                files.extend(run_files)
            self.runs = list(runs)
            super().read_dataframe(files=files, ftype=ftype)

        else:
            # This call takes care of files and folders. As we have converted runs into files
            # already, they are just stored in the class by this call.
            super().read_dataframe(
                files=files,
                folders=folders,
                ftype=ftype,
                metadata=metadata,
            )

        df, df_timed = self.parquet_handler(Path(self.processed_dir), **kwds)

        if collect_metadata:
            metadata = self.gather_metadata(
                metadata=self.metadata,
            )
        else:
            metadata = self.metadata
        print(f"loading complete in {time.time() - t0: .2f} s")

        return df, df_timed, metadata


LOADER = SXPLoader
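
For orientation, here is a minimal usage sketch of the loader listed above; it is not part of the covered source file. The config keys mirror those the code reads (core.paths, core.stream_name_prefixes, dataframe.daq, dataframe.columns, dataframe.channels, dataframe.ubid_offset), but the concrete paths, stream-name prefix, run number, and channel entries are hypothetical placeholders. In practice the configuration would come from a full sed config file, and the loader is usually driven through sed's higher-level processing interface rather than instantiated directly.

from sed.loader.sxp.loader import SXPLoader

# Hypothetical configuration sketch: only the keys accessed by the loader above,
# filled with placeholder values. The dataset_key/index_key entries ("...") must
# point at real HDF5 paths in an actual beamtime configuration.
config = {
    "core": {
        "paths": {"raw": "/path/to/raw", "processed": "/path/to/processed"},
        "stream_name_prefixes": {"DA03": "RAW-R"},  # assumed prefix for daq "DA03"
    },
    "dataframe": {
        "daq": "DA03",
        "ubid_offset": 0,
        "columns": {"tof": "dldTimeSteps"},
        "channels": {
            "trainId": {"format": "per_electron", "dataset_key": "...", "index_key": "..."},
            "pulseId": {"format": "per_electron", "dataset_key": "...", "index_key": "..."},
            "dldTimeSteps": {"format": "per_electron", "dataset_key": "...", "index_key": "..."},
        },
    },
}

loader = SXPLoader(config=config, verbose=True)
# read_dataframe locates the h5 files for the run, converts them to buffer parquet
# files, and returns the electron-resolved and timed dask dataframes plus metadata.
df, df_timed, metadata = loader.read_dataframe(runs=["0123"], ftype="h5")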