SPF-OST / pytrnsys_process / build 13560850870

27 Feb 2025 07:08AM UTC · coverage: 98.182% (+0.02%) from 98.165%
Triggered by a push via GitHub · committer: ahobeost · "CI adjustments"

7 of 8 new or added lines in 3 files covered (87.5%)
4 existing lines in 2 files now uncovered
1188 of 1210 relevant lines covered (98.18%)
1.95 hits per line

Source File: /pytrnsys_process/process_sim/process_sim.py (98.82% covered)
import logging as _logging
import pathlib as _pl
from collections import abc as _abc
from dataclasses import dataclass, field

import pandas as _pd

from pytrnsys_process import constants as const
from pytrnsys_process import data_structures as ds
from pytrnsys_process import file_type_detector as ftd
from pytrnsys_process import logger as log
from pytrnsys_process import readers
from pytrnsys_process import settings as sett
from pytrnsys_process import utils
from pytrnsys_process.deck import extractor
from pytrnsys_process.settings import settings


def process_sim(
    sim_files: _abc.Sequence[_pl.Path], sim_folder: _pl.Path
) -> ds.Simulation:
    # Collects a list of dataframes per file type; these are later
    # concatenated into one dataframe each and stored on the Simulation object.
    simulation_data_collector = _SimulationDataCollector()

    sim_logger = log.get_simulation_logger(sim_folder)
    for sim_file in sim_files:
        try:
            _process_file(
                simulation_data_collector,
                sim_file,
                _determine_file_type(sim_file, sim_logger),
            )
        except ValueError as e:
            sim_logger.error(
                "Error reading file %s; it will not be available for processing: %s",
                sim_file,
                str(e),
                exc_info=True,
            )

    return _merge_dataframes_into_simulation(
        simulation_data_collector, sim_folder
    )

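# A minimal usage sketch for process_sim (paths are hypothetical; assumes the
# folder contains TRNSYS printer/deck files that _determine_file_type can
# classify):
#
#     sim_folder = _pl.Path("results/sim-1")
#     sim_files = sorted(p for p in sim_folder.iterdir() if p.is_file())
#     simulation = process_sim(sim_files, sim_folder)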

def handle_duplicate_columns(df: _pd.DataFrame) -> _pd.DataFrame:
    """
    Process duplicate columns in a DataFrame, ensuring they contain consistent data.

    This function checks for duplicate column names and verifies that:
    1. If one duplicate column has NaN values, the other(s) must also have NaN at the same indices
    2. All non-NaN values must be identical across duplicate columns

    Parameters
    ----------
    df: pandas.DataFrame
        Input DataFrame to process

    Returns
    -------
    df: pandas.DataFrame
        DataFrame with duplicate columns removed, keeping only the first occurrence

    Raises
    ------
    ValueError
        If duplicate columns have:
        1. NaN values in one column while having actual values in another at the same index, or
        2. Different non-NaN values at the same index

    Notes
    -----
    https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
    """
    for col in df.columns[df.columns.duplicated(keep=False)]:
        duplicate_cols = df.iloc[:, df.columns == col]

        nan_mask = duplicate_cols.isna()
        value_mask = ~nan_mask
        if ((nan_mask.sum(axis=1) > 0) & (value_mask.sum(axis=1) > 0)).any():
            raise ValueError(
                f"Column '{col}' has NaN values in one column while having actual values in another"
            )

        if not duplicate_cols.apply(lambda x: x.nunique() <= 1, axis=1).all():
            raise ValueError(
                f"Column '{col}' has conflicting values at same indices"
            )

    df = df.iloc[:, ~df.columns.duplicated()].copy()
    return df

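# Worked example for handle_duplicate_columns ("QBoiler" is a made-up column
# name): identical duplicates collapse to a single column, while conflicting
# duplicates raise ValueError.
#
#     ok = _pd.DataFrame([[1.0, 1.0], [2.0, 2.0]], columns=["QBoiler", "QBoiler"])
#     handle_duplicate_columns(ok).columns.tolist()  # -> ["QBoiler"]
#
#     bad = _pd.DataFrame([[1.0, 9.0]], columns=["QBoiler", "QBoiler"])
#     handle_duplicate_columns(bad)  # raises ValueError: conflicting values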

def _determine_file_type(
    sim_file: _pl.Path, logger: _logging.Logger
) -> const.FileType:
    """Determine the file type from the file name, falling back to the file content."""
    try:
        return ftd.get_file_type_using_file_name(sim_file, logger)
    except ValueError:
        return ftd.get_file_type_using_file_content(sim_file, logger)

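# Sketch of the fallback order (file name hypothetical): a recognisable name
# pattern decides the type; if name-based detection raises ValueError, the
# detector sniffs the file's content instead.
#
#     file_type = _determine_file_type(
#         _pl.Path("results/sim-1/ENERGY_BALANCE_MO.prt"), sim_logger
#     )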

@dataclass
class _SimulationDataCollector:
    hourly: list[_pd.DataFrame] = field(default_factory=list)
    monthly: list[_pd.DataFrame] = field(default_factory=list)
    step: list[_pd.DataFrame] = field(default_factory=list)
    deck: _pd.DataFrame = field(default_factory=_pd.DataFrame)


def _read_file(
    file_path: _pl.Path, file_type: const.FileType
) -> _pd.DataFrame:
    """
    Read data from a file using the appropriate reader for its extension and file type.

    Parameters
    ----------
    file_path: pathlib.Path
        Path to the file to be read

    file_type: const.FileType
        Type of data in the file (MONTHLY, HOURLY, or TIMESTEP)

    Returns
    -------
    pandas.DataFrame
        Data read from the file

    Raises
    ------
    ValueError
        If the file extension is not supported
    """
    starting_year = settings.reader.starting_year
    extension = file_path.suffix.lower()
    if extension in [".prt", ".hr"]:
        reader = readers.PrtReader()
        if file_type == const.FileType.MONTHLY:
            return reader.read_monthly(file_path, starting_year)
        if file_type == const.FileType.HOURLY:
            return reader.read_hourly(file_path, starting_year)
        if file_type == const.FileType.TIMESTEP:
            return reader.read_step(file_path, starting_year)
    elif extension == ".csv":
        return readers.CsvReader().read_csv(file_path)

    raise ValueError(f"Unsupported file extension: {extension}")

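# Dispatch sketch (file names hypothetical): .prt/.hr files go through
# PrtReader with the configured starting year, .csv files through CsvReader,
# and any other extension raises ValueError.
#
#     monthly = _read_file(_pl.Path("out_mo.prt"), const.FileType.MONTHLY)
#     hourly = _read_file(_pl.Path("out.hr"), const.FileType.HOURLY)
#     table = _read_file(_pl.Path("out.csv"), const.FileType.TIMESTEP)
#     # for .csv the file_type argument is ignored: the extension decides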

def _process_file(
    simulation_data_collector: _SimulationDataCollector,
    file_path: _pl.Path,
    file_type: const.FileType,
) -> bool:
    if file_type == const.FileType.MONTHLY:
        simulation_data_collector.monthly.append(
            _read_file(file_path, const.FileType.MONTHLY)
        )
    elif file_type == const.FileType.HOURLY:
        simulation_data_collector.hourly.append(
            _read_file(file_path, const.FileType.HOURLY)
        )
    elif (
        file_type == const.FileType.TIMESTEP
        and sett.settings.reader.read_step_files
    ):
        simulation_data_collector.step.append(
            _read_file(file_path, const.FileType.TIMESTEP)
        )
    elif (
        file_type == const.FileType.DECK
        and sett.settings.reader.read_deck_files
    ):
        simulation_data_collector.deck = _get_deck_as_df(file_path)
    else:
        return False
    return True

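# Behaviour sketch: the return value signals whether the file was consumed.
# Step and deck files are only read when the corresponding reader settings
# are enabled (the settings fields are the ones used above; paths are
# hypothetical):
#
#     collector = _SimulationDataCollector()
#     _process_file(collector, _pl.Path("out.hr"), const.FileType.HOURLY)
#     # -> True: the dataframe was appended to collector.hourly
#
#     # With sett.settings.reader.read_step_files disabled, timestep files
#     # are skipped and the function reports False:
#     _process_file(collector, _pl.Path("out.prt"), const.FileType.TIMESTEP)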

def _get_deck_as_df(
    file_path: _pl.Path,
) -> _pd.DataFrame:
    deck_file_as_string = utils.get_file_content_as_string(file_path)
    deck: dict[str, float] = extractor.parse_deck_for_constant_expressions(
        deck_file_as_string, log.get_simulation_logger(file_path.parent)
    )
    deck_as_df = _pd.DataFrame([deck])
    return deck_as_df

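# Sketch: the deck's constant expressions are parsed into a dict and wrapped
# in a single-row DataFrame, one column per constant (file name hypothetical):
#
#     deck_df = _get_deck_as_df(_pl.Path("results/sim-1/sim-1.dck"))
#     deck_df.shape[0]  # -> 1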

def _merge_dataframes_into_simulation(
    simulation_data_collector: _SimulationDataCollector, sim_folder: _pl.Path
) -> ds.Simulation:
    monthly_df = _get_df_without_duplicates(simulation_data_collector.monthly)
    hourly_df = _get_df_without_duplicates(simulation_data_collector.hourly)
    timestep_df = _get_df_without_duplicates(simulation_data_collector.step)
    deck = simulation_data_collector.deck

    return ds.Simulation(
        sim_folder.as_posix(), monthly_df, hourly_df, timestep_df, deck
    )


def _get_df_without_duplicates(dfs: _abc.Sequence[_pd.DataFrame]) -> _pd.DataFrame:
    if len(dfs) > 0:
        return handle_duplicate_columns(_pd.concat(dfs, axis=1))

    return _pd.DataFrame()