
SPF-OST / pytrnsys_process / build 13737128015

08 Mar 2025 11:31AM UTC. Coverage: 95.308% (-2.2%) from 97.522%.

Pull Request #77 (GitHub, ahobeost): "76 bug two types of step files one does not get read in leading to the simulation failing entirely"
Commit: black adjustments

31 of 38 new or added lines in 6 files covered (81.58%).
19 existing lines in 2 files now uncovered.
1097 of 1151 relevant lines covered (95.31%).
1.9 hits per line.

Source file: /pytrnsys_process/process/process_sim.py (96.55% covered)
import logging as _logging
import pathlib as _pl
from collections import abc as _abc
from dataclasses import dataclass, field

import pandas as _pd

from pytrnsys_process import config as conf
from pytrnsys_process import deck, log, read, util
from pytrnsys_process.process import data_structures as ds
from pytrnsys_process.process import file_type_detector as ftd

def process_sim(
    sim_files: _abc.Sequence[_pl.Path], sim_folder: _pl.Path
) -> ds.Simulation:
    # Collects one list of dataframes per file type; these are later
    # concatenated into a single dataframe each and saved on a Simulation object.
    simulation_data_collector = _SimulationDataCollector()

    sim_logger = log.get_simulation_logger(sim_folder)
    for sim_file in sim_files:
        try:
            _process_file(
                simulation_data_collector,
                sim_file,
                _determine_file_type(sim_file, sim_logger),
            )
        # The KeyError case is newly added in this pull request; both error
        # types are reported identically, so a single handler covers them.
        except (ValueError, KeyError) as e:
            sim_logger.error(
                "Error reading file %s; it will not be available for processing: %s",
                sim_file,
                str(e),
                exc_info=True,
            )

    return _merge_dataframes_into_simulation(
        simulation_data_collector, sim_folder
    )
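
# Example (illustrative, not part of the source file): a minimal sketch of
# calling process_sim. The folder layout, the glob pattern, and the attribute
# names on ds.Simulation are assumptions.
#
#     sim_folder = _pl.Path("results/sim-1")
#     sim_files = list((sim_folder / "temp").glob("*"))
#     simulation = process_sim(sim_files, sim_folder)
#     simulation.monthly.head()  # merged monthly data, if any was found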

def handle_duplicate_columns(df: _pd.DataFrame) -> _pd.DataFrame:
    """
    Process duplicate columns in a DataFrame, ensuring they contain consistent data.

    This function checks for duplicate column names and verifies that:
    1. if one duplicate column has NaN values, the other(s) must also have NaN at the same indices, and
    2. all non-NaN values must be identical across duplicate columns.

    Parameters
    ----------
    df: pandas.DataFrame
        Input DataFrame to process

    Returns
    -------
    df: pandas.DataFrame
        DataFrame with duplicate columns removed, keeping only the first occurrence

    Raises
    ------
    ValueError
        If duplicate columns have:
        1. NaN values in one column while having actual values in another at the same index, or
        2. different non-NaN values at the same index

    Note
    ----
    https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
    """
    for col in df.columns[df.columns.duplicated(keep=False)]:
        duplicate_cols = df.iloc[:, df.columns == col]

        nan_mask = duplicate_cols.isna()
        value_mask = ~nan_mask
        # A row where one duplicate holds a value while another holds NaN is ambiguous.
        if ((nan_mask.sum(axis=1) > 0) & (value_mask.sum(axis=1) > 0)).any():
            raise ValueError(
                f"Column '{col}' has NaN values in one column while having actual values in another"
            )

        # Row-wise, all duplicates must agree on a single non-NaN value.
        if not duplicate_cols.apply(lambda x: x.nunique() <= 1, axis=1).all():
            raise ValueError(
                f"Column '{col}' has conflicting values at same indices"
            )

    df = df.iloc[:, ~df.columns.duplicated()].copy()
    return df
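
# Example (illustrative values; the column name "QSnk60" is hypothetical):
#
#     ok = _pd.DataFrame([[1.0, 1.0], [2.0, 2.0]], columns=["QSnk60", "QSnk60"])
#     handle_duplicate_columns(ok)     # one "QSnk60" column is kept
#
#     clash = _pd.DataFrame([[1.0, 9.0]], columns=["QSnk60", "QSnk60"])
#     handle_duplicate_columns(clash)  # ValueError: conflicting values
#
#     gap = _pd.DataFrame([[1.0, float("nan")]], columns=["QSnk60", "QSnk60"])
#     handle_duplicate_columns(gap)    # ValueError: NaN next to an actual value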

def _determine_file_type(
    sim_file: _pl.Path, logger: _logging.Logger
) -> conf.FileType:
    """Determine the file type from the file name, falling back to the file content."""
    try:
        return ftd.get_file_type_using_file_name(sim_file, logger)
    except ValueError:
        return ftd.get_file_type_using_file_content(sim_file, logger)
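
# Example (hypothetical file name): name-based detection is tried first; only
# if it raises ValueError does the content-based detector decide.
#
#     _determine_file_type(_pl.Path("results/sim-1/temp/mfr_solar.prt"), logger)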

@dataclass
class _SimulationDataCollector:
    hourly: list[_pd.DataFrame] = field(default_factory=list)
    monthly: list[_pd.DataFrame] = field(default_factory=list)
    step: list[_pd.DataFrame] = field(default_factory=list)
    parsed_deck: _pd.DataFrame = field(default_factory=_pd.DataFrame)

def _read_file(file_path: _pl.Path, file_type: conf.FileType) -> _pd.DataFrame:
    """
    Read data from a file using the appropriate reader for its extension and file type.

    Parameters
    ----------
    file_path: pathlib.Path
        Path to the file to be read

    file_type: conf.FileType
        Type of data in the file (MONTHLY, HOURLY, TIMESTEP, or HYDRAULIC)

    Returns
    -------
    pandas.DataFrame
        Data read from the file

    Raises
    ------
    ValueError
        If the file extension is not supported
    """
    starting_year = conf.global_settings.reader.starting_year
    extension = file_path.suffix.lower()
    logger = log.get_simulation_logger(file_path.parents[1])
    if extension in [".prt", ".hr"]:
        reader = read.PrtReader()
        if file_type == conf.FileType.MONTHLY:
            return reader.read_monthly(
                file_path, logger=logger, starting_year=starting_year
            )
        if file_type == conf.FileType.HOURLY:
            return reader.read_hourly(
                file_path, logger=logger, starting_year=starting_year
            )
        if file_type == conf.FileType.TIMESTEP:
            return reader.read_step(
                file_path, starting_year=starting_year, skipfooter=23, header=1
            )
        if file_type == conf.FileType.HYDRAULIC:
            return reader.read_step(file_path, starting_year=starting_year)
    elif extension == ".csv":
        return read.CsvReader().read_csv(file_path)

    raise ValueError(f"Unsupported file extension: {extension}")
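
# Example (illustrative path): dispatching a monthly printer file. The file
# name is hypothetical; note that file_path.parents[1] above implies the file
# sits one directory below the simulation folder.
#
#     _read_file(
#         _pl.Path("results/sim-1/temp/ENERGY_BALANCE_MO.prt"),
#         conf.FileType.MONTHLY,
#     )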

def _process_file(
    simulation_data_collector: _SimulationDataCollector,
    file_path: _pl.Path,
    file_type: conf.FileType,
) -> bool:
    """Read the file into the matching collector bucket; return False if the file type is skipped."""
    if file_type == conf.FileType.MONTHLY:
        simulation_data_collector.monthly.append(
            _read_file(file_path, conf.FileType.MONTHLY)
        )
    elif file_type == conf.FileType.HOURLY:
        simulation_data_collector.hourly.append(
            _read_file(file_path, conf.FileType.HOURLY)
        )
    elif (
        file_type == conf.FileType.TIMESTEP
        and conf.global_settings.reader.read_step_files
    ):
        simulation_data_collector.step.append(
            _read_file(file_path, conf.FileType.TIMESTEP)
        )
    elif (
        file_type == conf.FileType.HYDRAULIC
        and conf.global_settings.reader.read_step_files
    ):
        simulation_data_collector.step.append(
            _read_file(file_path, conf.FileType.HYDRAULIC)
        )
    elif (
        file_type == conf.FileType.DECK
        and conf.global_settings.reader.read_deck_files
    ):
        simulation_data_collector.parsed_deck = _get_deck_as_df(file_path)
    else:
        return False
    return True

def _get_deck_as_df(
    file_path: _pl.Path,
) -> _pd.DataFrame:
    deck_file_as_string = util.get_file_content_as_string(file_path)
    parsed_deck: dict[str, float] = deck.parse_deck_for_constant_expressions(
        deck_file_as_string, log.get_simulation_logger(file_path.parent)
    )
    deck_as_df = _pd.DataFrame([parsed_deck])
    return deck_as_df

def _merge_dataframes_into_simulation(
    simulation_data_collector: _SimulationDataCollector, sim_folder: _pl.Path
) -> ds.Simulation:
    monthly_df = _get_df_without_duplicates(simulation_data_collector.monthly)
    hourly_df = _get_df_without_duplicates(simulation_data_collector.hourly)
    timestep_df = _get_df_without_duplicates(simulation_data_collector.step)
    parsed_deck = simulation_data_collector.parsed_deck

    return ds.Simulation(
        sim_folder.as_posix(), monthly_df, hourly_df, timestep_df, parsed_deck
    )

def _get_df_without_duplicates(
    dfs: _abc.Sequence[_pd.DataFrame],
) -> _pd.DataFrame:
    if len(dfs) > 0:
        return handle_duplicate_columns(_pd.concat(dfs, axis=1))

    return _pd.DataFrame()
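
# Example (illustrative values): frames are concatenated column-wise (axis=1),
# so two files sharing a time index become one wide frame before duplicate
# columns are reconciled by handle_duplicate_columns.
#
#     a = _pd.DataFrame({"T_in": [20.0, 21.0]})
#     b = _pd.DataFrame({"T_out": [55.0, 54.0]})
#     _get_df_without_duplicates([a, b])  # columns: ["T_in", "T_out"]
#     _get_df_without_duplicates([])      # empty DataFrame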