SPF-OST / pytrnsys_process / build 12275970811

11 Dec 2024 11:55AM UTC · coverage: 95.245% (first build 12275970811)
Triggered by a push via GitHub (committer: web-flow):
Merge pull request #37 from SPF-OST/22-add-support-for-step-files
("22 add support for step files")

129 of 136 new or added lines in 9 files covered (94.85%)
641 of 673 relevant lines covered (95.25%)
1.9 hits per line
Source file: /pytrnsys_process/process_sim/process_sim.py (97.18% of relevant lines covered)
The two uncovered lines, both newly added, are the logger.error call inside process_sim's except block and the "Unsupported file extension" ValueError raised at the end of _read_file.

import pathlib as _pl
from collections import abc as _abc
from dataclasses import dataclass, field

import pandas as _pd

from pytrnsys_process import constants as const
from pytrnsys_process import file_type_detector as ftd
from pytrnsys_process import readers
from pytrnsys_process import settings as sett
from pytrnsys_process.logger import logger


@dataclass
class Simulation:
    """Class representing a TRNSYS simulation with its associated data.

    This class holds the simulation data organized in different time resolutions (monthly, hourly, timestep)
    along with the path to the simulation files.

    Attributes
    ----------
    path : pathlib.Path
        Path to the simulation folder containing the input files
    monthly : pandas.DataFrame
        Monthly aggregated simulation data. Each column represents a different variable
        and each row represents a month.
    hourly : pandas.DataFrame
        Hourly simulation data. Each column represents a different variable
        and each row represents an hour.
    step : pandas.DataFrame
        Simulation data at the smallest timestep resolution. Each column represents
        a different variable and each row represents a timestep.
    """

    path: _pl.Path
    monthly: _pd.DataFrame
    hourly: _pd.DataFrame
    step: _pd.DataFrame
    # TODO: Add results data here. Not sure yet, what this will look like # pylint: disable=fixme


def process_sim(
        sim_files: _abc.Sequence[_pl.Path], sim_folder: _pl.Path
) -> Simulation:
    simulation_data_collector = _SimulationDataCollector()
    for sim_file in sim_files:
        try:
            _process_file(
                simulation_data_collector,
                sim_file,
                _determine_file_type(sim_file),
            )
        except ValueError as e:
            logger.error(
                "Error reading file %s it will not be available for processing: %s",
                sim_file,
                str(e),
                exc_info=True,
            )

    return _merge_dataframes_into_simulation(
        simulation_data_collector, sim_folder
    )
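
As an illustration only, and not part of the file under coverage, here is a minimal sketch of how process_sim might be called. The folder layout, the file names, and the import path (derived from the source file location shown above) are assumptions:

import pathlib as pl

from pytrnsys_process.process_sim import process_sim as ps  # assumed module path

# Hypothetical simulation folder containing TRNSYS printer (.prt) and CSV outputs.
sim_folder = pl.Path("simulations/sim-1")
sim_files = sorted(sim_folder.rglob("*.prt")) + sorted(sim_folder.rglob("*.csv"))

sim = ps.process_sim(sim_files, sim_folder)
print(sim.monthly.shape, sim.hourly.shape, sim.step.shape)

Files that cannot be read are logged and skipped, so the call still returns a Simulation; resolutions for which no files were collected come back as empty data frames.
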
def handle_duplicate_columns(df: _pd.DataFrame) -> _pd.DataFrame:
    """
    Process duplicate columns in a DataFrame, ensuring they contain consistent data.

    This function checks for duplicate column names and verifies that:
    1. If one duplicate column has NaN values, the other(s) must also have NaN at the same indices
    2. All non-NaN values must be identical across duplicate columns

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame to process

    Returns
    -------
    pandas.DataFrame
        DataFrame with duplicate columns removed, keeping only the first occurrence

    Raises
    ------
    ValueError
        If duplicate columns have:
        - NaN values in one column while having actual values in another at the same index
        - Different non-NaN values at the same index

    https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
    """
    for col in df.columns[df.columns.duplicated(keep=False)]:
        duplicate_cols = df.iloc[:, df.columns == col]

        nan_mask = duplicate_cols.isna()
        value_mask = ~nan_mask
        if ((nan_mask.sum(axis=1) > 0) & (value_mask.sum(axis=1) > 0)).any():
            raise ValueError(
                f"Column '{col}' has NaN values in one column while having actual values in another"
            )

        if not duplicate_cols.apply(lambda x: x.nunique() <= 1, axis=1).all():
            raise ValueError(
                f"Column '{col}' has conflicting values at same indices"
            )

    df = df.iloc[:, ~df.columns.duplicated()].copy()
    return df
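
For illustration, again not part of the file, a small self-contained demonstration of the behaviour documented above: identical duplicate columns collapse to a single column, while conflicting duplicates raise ValueError. The import path and the column names are assumptions:

import pandas as pd

# Assumed import path, derived from the source file location.
from pytrnsys_process.process_sim.process_sim import handle_duplicate_columns

# Two identical "QSol" columns: the duplicate is dropped, "QAux" is kept as-is.
df_ok = pd.DataFrame(
    [[1.0, 1.0, 5.0], [2.0, 2.0, 6.0]], columns=["QSol", "QSol", "QAux"]
)
print(handle_duplicate_columns(df_ok).columns.tolist())  # ['QSol', 'QAux']

# Two "QSol" columns that disagree in the first row: rejected.
df_bad = pd.DataFrame([[1.0, 9.0], [2.0, 2.0]], columns=["QSol", "QSol"])
try:
    handle_duplicate_columns(df_bad)
except ValueError as exc:
    print(exc)  # Column 'QSol' has conflicting values at same indices
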
def _determine_file_type(sim_file: _pl.Path) -> const.FileType:
    """Determine the file type using name and content."""
    try:
        return ftd.get_file_type_using_file_name(sim_file)
    except ValueError:
        return ftd.get_file_type_using_file_content(sim_file)


@dataclass
class _SimulationDataCollector:
    hourly: list[_pd.DataFrame] = field(default_factory=list)
    monthly: list[_pd.DataFrame] = field(default_factory=list)
    step: list[_pd.DataFrame] = field(default_factory=list)


def _read_file(
        file_path: _pl.Path, file_type: const.FileType
) -> _pd.DataFrame:
    """
    Factory method to read data from a file using the appropriate reader.

    Parameters
    ----------
    file_path : pathlib.Path
        Path to the file to be read
    file_type : const.FileType
        Type of data in the file (MONTHLY, HOURLY, or TIMESTEP)

    Returns
    -------
    pandas.DataFrame
        Data read from the file

    Raises
    ------
    ValueError
        If file extension is not supported
    """
    extension = file_path.suffix.lower()
    if extension == ".prt":
        reader = readers.PrtReader()
        if file_type == const.FileType.MONTHLY:
            return reader.read_monthly(file_path)
        if file_type == const.FileType.HOURLY:
            return reader.read_hourly(file_path)
        if file_type == const.FileType.TIMESTEP:
            return reader.read_step(file_path)
    elif extension == ".csv":
        return readers.CsvReader().read_csv(file_path)

    raise ValueError(f"Unsupported file extension: {extension}")
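
_read_file dispatches on the file extension first and then on the detected FileType. For a one-off read outside of process_sim, the underlying readers can also be used directly; a sketch with a hypothetical file name, using the same reader classes and methods as above:

import pathlib as pl

from pytrnsys_process import readers

# Hypothetical monthly printer output file.
monthly_df = readers.PrtReader().read_monthly(pl.Path("ENERGY_BALANCE_MO.prt"))
print(monthly_df.columns.tolist())
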
def _process_file(
        simulation_data_collector: _SimulationDataCollector,
        file_path: _pl.Path,
        file_type: const.FileType,
) -> bool:
    if file_type == const.FileType.MONTHLY:
        simulation_data_collector.monthly.append(
            _read_file(file_path, const.FileType.MONTHLY)
        )
    elif file_type == const.FileType.HOURLY:
        simulation_data_collector.hourly.append(
            _read_file(file_path, const.FileType.HOURLY)
        )
    elif (
            file_type == const.FileType.TIMESTEP
            and sett.settings.reader.read_step_files
    ):
        simulation_data_collector.step.append(
            _read_file(file_path, const.FileType.TIMESTEP)
        )
    else:
        return False

    return True
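
Note that _process_file only collects TIMESTEP data when sett.settings.reader.read_step_files is enabled; this is the switch behind the "add support for step files" change referenced in the build header. A hedged sketch of turning it on before processing, assuming read_step_files is a plain writable boolean on the reader settings:

from pytrnsys_process import settings as sett

# Assumption: read_step_files is a writable boolean attribute.
sett.settings.reader.read_step_files = True  # include timestep (.prt) files

With the flag left off, TIMESTEP files are skipped and Simulation.step ends up as an empty DataFrame after merging.
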
def _merge_dataframes_into_simulation(
        simulation_data_collector: _SimulationDataCollector, sim_folder: _pl.Path
) -> Simulation:
    monthly_df = (
        handle_duplicate_columns(
            _pd.concat(simulation_data_collector.monthly, axis=1)
        )
        if simulation_data_collector.monthly
        else _pd.DataFrame()
    )
    hourly_df = (
        handle_duplicate_columns(
            _pd.concat(simulation_data_collector.hourly, axis=1)
        )
        if simulation_data_collector.hourly
        else _pd.DataFrame()
    )
    timestep_df = (
        handle_duplicate_columns(
            _pd.concat(simulation_data_collector.step, axis=1)
        )
        if simulation_data_collector.step
        else _pd.DataFrame()
    )
    return Simulation(sim_folder, monthly_df, hourly_df, timestep_df)