SPF-OST / pytrnsys_process / build 11970144474

22 Nov 2024 09:39AM UTC · coverage: 94.037% (+1.1%) from 92.933%

Build 11970144474 · push · github · web-flow
Merge pull request #14 from SPF-OST/13-create-initial-pipeline-for-processing-one-simulation
added sim processing:

183 of 189 new or added lines in 8 files covered (96.83%).
1 existing line in 1 file is now uncovered.
410 of 436 relevant lines covered (94.04%).
1.87 hits per line.
Source File: /pytrnsys_process/process_sim/process_sim.py (96.1% covered)

import pathlib as _pl
from dataclasses import dataclass

import pandas as _pd

from pytrnsys_process import file_matcher as fm
from pytrnsys_process import readers, utils
from pytrnsys_process.logger import logger


# TODO: test whether overlapping columns are allowed if the values are the same  # pylint: disable=fixme


@dataclass
class Simulation:
    monthly: _pd.DataFrame
    hourly: _pd.DataFrame
    timestep: _pd.DataFrame
    # TODO: Add results data here. Not sure yet what this will look like  # pylint: disable=fixme


def handle_duplicate_columns(df: _pd.DataFrame) -> _pd.DataFrame:
    """
    Process duplicate columns in a DataFrame, ensuring they contain consistent data.

    This function checks for duplicate column names and verifies that:
    1. If one duplicate column has NaN values, the other(s) must also have NaN at the same indices
    2. All non-NaN values must be identical across duplicate columns

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame to process

    Returns
    -------
    pandas.DataFrame
        DataFrame with duplicate columns removed, keeping only the first occurrence

    Raises
    ------
    ValueError
        If duplicate columns have:
        - NaN values in one column while having actual values in another at the same index
        - Different non-NaN values at the same index

    https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
    """
    for col in df.columns[df.columns.duplicated(keep=False)]:
        duplicate_cols = df.iloc[:, df.columns == col]

        nan_mask = duplicate_cols.isna()
        value_mask = ~nan_mask
        if ((nan_mask.sum(axis=1) > 0) & (value_mask.sum(axis=1) > 0)).any():
            raise ValueError(
                f"Column '{col}' has NaN values in one column while having actual values in another"
            )

        if not duplicate_cols.apply(lambda x: x.nunique() <= 1, axis=1).all():
            raise ValueError(
                f"Column '{col}' has conflicting values at same indices"
            )

    df = df.iloc[:, ~df.columns.duplicated()].copy()
    return df
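
For illustration, a minimal usage sketch of handle_duplicate_columns (the column names and values are made up, and the import path is only inferred from the file path shown above): duplicates that agree are collapsed to a single column, while conflicting duplicates raise ValueError.

import pandas as pd

from pytrnsys_process.process_sim.process_sim import handle_duplicate_columns

# Two frames that both report a column named "QSnk60P" with identical values.
left = pd.DataFrame({"QSnk60P": [1.0, 2.0], "Tamb": [10.0, 11.0]})
right = pd.DataFrame({"QSnk60P": [1.0, 2.0]})
merged = pd.concat([left, right], axis=1)        # "QSnk60P" now appears twice

deduplicated = handle_duplicate_columns(merged)  # keeps the first "QSnk60P" only
assert list(deduplicated.columns) == ["QSnk60P", "Tamb"]

# Conflicting duplicates are rejected:
conflicting = pd.concat([left, pd.DataFrame({"QSnk60P": [1.0, 99.0]})], axis=1)
# handle_duplicate_columns(conflicting)          # would raise ValueError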

def process_sim_prt(
        sim_folder: _pl.Path,
) -> Simulation:
    sim_files = utils.get_files([sim_folder])
    prt_reader = readers.PrtReader()
    hourly = []
    monthly = []
    timestep = []

    for sim_file in sim_files:
        if fm.has_pattern(sim_file.name, fm.FileType.MONTHLY):
            monthly.append(prt_reader.read_monthly(sim_file))
        elif fm.has_pattern(sim_file.name, fm.FileType.HOURLY):
            hourly.append(prt_reader.read_hourly(sim_file))
        elif fm.has_pattern(sim_file.name, fm.FileType.TIMESTEP):
            timestep.append(prt_reader.read_step(sim_file))
        else:
            logger.warning("Unknown file type: %s", sim_file.name)  # uncovered in this build

    monthly_df = handle_duplicate_columns(_pd.concat(monthly, axis=1))
    hourly_df = handle_duplicate_columns(_pd.concat(hourly, axis=1))
    timestep_df = handle_duplicate_columns(_pd.concat(timestep, axis=1))
    return Simulation(monthly_df, hourly_df, timestep_df)
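
A hedged sketch of how process_sim_prt might be called from user code (the folder path is hypothetical and the import path is inferred from the file path above; the file naming rules themselves live in file_matcher):

import pathlib as _pl

from pytrnsys_process.process_sim import process_sim

# Hypothetical folder containing TRNSYS printer (.prt) result files whose names
# match the monthly/hourly/timestep patterns checked by fm.has_pattern above.
sim = process_sim.process_sim_prt(_pl.Path("results/sim-1"))

print(sim.monthly.shape)   # one concatenated, de-duplicated monthly DataFrame
print(sim.hourly.shape, sim.timestep.shape)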

def process_sim_using_file_content_prt(
        sim_folder: _pl.Path,
) -> Simulation:
    sim_files = utils.get_files([sim_folder])
    prt_reader = readers.PrtReader()
    hourly = []
    monthly = []
    step = []

    for sim_file in sim_files:
        file_type = fm.get_file_type_using_file_content(sim_file)
        if file_type == fm.FileType.MONTHLY:
            monthly.append(prt_reader.read_monthly(sim_file))
        elif file_type == fm.FileType.HOURLY:
            hourly.append(prt_reader.read_hourly(sim_file))
        elif file_type == fm.FileType.TIMESTEP:
            step.append(prt_reader.read_step(sim_file))
        else:
            logger.warning("Unknown file type: %s", sim_file.name)  # uncovered in this build

    monthly_df = handle_duplicate_columns(_pd.concat(monthly, axis=1))
    hourly_df = handle_duplicate_columns(_pd.concat(hourly, axis=1))
    timestep_df = handle_duplicate_columns(_pd.concat(step, axis=1))
    return Simulation(monthly_df, hourly_df, timestep_df)

def process_sim_csv(
        sim_folder: _pl.Path,
) -> Simulation:
    sim_files = utils.get_files([sim_folder], results_folder_name="converted")
    csv_reader = readers.CsvReader()
    hourly = []
    monthly = []
    timestep = []

    for sim_file in sim_files:
        if fm.has_pattern(sim_file.name, fm.FileType.MONTHLY):
            monthly.append(csv_reader.read_csv(sim_file))
        elif fm.has_pattern(sim_file.name, fm.FileType.HOURLY):
            hourly.append(csv_reader.read_csv(sim_file))
        elif fm.has_pattern(sim_file.name, fm.FileType.TIMESTEP):
            timestep.append(csv_reader.read_csv(sim_file))
        else:
            logger.warning("Unknown file type: %s", sim_file.name)  # uncovered in this build

    monthly_df = handle_duplicate_columns(_pd.concat(monthly, axis=1))
    hourly_df = handle_duplicate_columns(_pd.concat(hourly, axis=1))
    timestep_df = handle_duplicate_columns(_pd.concat(timestep, axis=1))

    return Simulation(monthly_df, hourly_df, timestep_df)
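
process_sim_csv is the only variant that passes results_folder_name="converted" to utils.get_files, so it presumably reads previously converted CSV files rather than raw .prt output. A hedged sketch under that assumption (the folder layout beyond the "converted" name is made up):

import pathlib as _pl

from pytrnsys_process.process_sim import process_sim

# Assumed layout: results/sim-1/converted/*.csv, where the CSV file names still
# follow the monthly/hourly/timestep patterns used by fm.has_pattern.
sim = process_sim.process_sim_csv(_pl.Path("results/sim-1"))

print(sim.monthly.head())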