
SPF-OST / pytrnsys_process / 12050239394

27 Nov 2024 12:24PM UTC coverage: 90.52% (-3.5%) from 94.037%
Pull Request #18: 15 create example script for per sim interaction
Commit by sebastian-swob (github):
added batch processing,
added initial example script on how to use the api,
disabled some step tests until requirements are clear

119 of 124 new or added lines in 5 files covered. (95.97%)

20 existing lines in 3 files now uncovered.

487 of 538 relevant lines covered (90.52%)

0.91 hits per line

Source File
/pytrnsys_process/process_sim/process_sim.py (74.36% covered)
import pathlib as _pl
from dataclasses import dataclass

import pandas as _pd

from pytrnsys_process import file_matcher as fm
from pytrnsys_process import readers, utils
from pytrnsys_process.logger import logger


# TODO test if overlapping columns are allowed if the values are the same # pylint: disable=fixme


@dataclass
class Simulation:
    path: _pl.Path
    monthly: _pd.DataFrame
    hourly: _pd.DataFrame
    timestep: _pd.DataFrame
    # TODO: Add results data here. Not sure yet what this will look like # pylint: disable=fixme


def handle_duplicate_columns(df: _pd.DataFrame) -> _pd.DataFrame:
    """
    Process duplicate columns in a DataFrame, ensuring they contain consistent data.

    This function checks for duplicate column names and verifies that:
    1. If one duplicate column has NaN values, the other(s) must also have NaN at the same indices
    2. All non-NaN values must be identical across duplicate columns

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame to process

    Returns
    -------
    pandas.DataFrame
        DataFrame with duplicate columns removed, keeping only the first occurrence

    Raises
    ------
    ValueError
        If duplicate columns have:
        - NaN values in one column while having actual values in another at the same index
        - Different non-NaN values at the same index

    https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns
    """
    for col in df.columns[df.columns.duplicated(keep=False)]:
        duplicate_cols = df.iloc[:, df.columns == col]

        nan_mask = duplicate_cols.isna()
        value_mask = ~nan_mask
        if ((nan_mask.sum(axis=1) > 0) & (value_mask.sum(axis=1) > 0)).any():
            raise ValueError(
                f"Column '{col}' has NaN values in one column while having actual values in another"
            )

        if not duplicate_cols.apply(lambda x: x.nunique() <= 1, axis=1).all():
            raise ValueError(
                f"Column '{col}' has conflicting values at same indices"
            )

    df = df.iloc[:, ~df.columns.duplicated()].copy()
    return df
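
# Illustration (not part of the module): expected behaviour of handle_duplicate_columns
# for a hypothetical duplicated column name "QSnk60P".
#
#   ok = _pd.DataFrame([[1.0, 1.0], [2.0, 2.0]], columns=["QSnk60P", "QSnk60P"])
#   handle_duplicate_columns(ok)    # one "QSnk60P" column is kept
#
#   bad = _pd.DataFrame([[1.0, 9.0]], columns=["QSnk60P", "QSnk60P"])
#   handle_duplicate_columns(bad)   # raises ValueError: conflicting values at same indices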


def process_sim_prt(
    sim_folder: _pl.Path,
) -> Simulation:
    sim_files = utils.get_files([sim_folder])
    prt_reader = readers.PrtReader()
    hourly = []
    monthly = []
    timestep = []

    for sim_file in sim_files:
        if fm.has_pattern(sim_file.name, fm.FileType.MONTHLY):
            monthly.append(prt_reader.read_monthly(sim_file))
        elif fm.has_pattern(sim_file.name, fm.FileType.HOURLY):
            hourly.append(prt_reader.read_hourly(sim_file))
        elif fm.has_pattern(sim_file.name, fm.FileType.TIMESTEP):
            timestep.append(prt_reader.read_step(sim_file))
        else:
            logger.warning("Unknown file type: %s", sim_file.name)

    monthly_df = (
        handle_duplicate_columns(_pd.concat(monthly, axis=1))
        if monthly
        else _pd.DataFrame()
    )
    hourly_df = (
        handle_duplicate_columns(_pd.concat(hourly, axis=1))
        if hourly
        else _pd.DataFrame()
    )
    timestep_df = (
        handle_duplicate_columns(_pd.concat(timestep, axis=1))
        if timestep
        else _pd.DataFrame()
    )
    return Simulation(sim_folder, monthly_df, hourly_df, timestep_df)


# NOTE from the coverage report: the body of this function is currently uncovered by tests.
def process_sim_using_file_content_prt(
    sim_folder: _pl.Path,
) -> Simulation:
    sim_files = utils.get_files([sim_folder])
    prt_reader = readers.PrtReader()
    hourly = []
    monthly = []
    step = []

    for sim_file in sim_files:
        file_type = fm.get_file_type_using_file_content(sim_file)
        if file_type == fm.FileType.MONTHLY:
            monthly.append(prt_reader.read_monthly(sim_file))
        elif file_type == fm.FileType.HOURLY:
            hourly.append(prt_reader.read_hourly(sim_file))
        elif file_type == fm.FileType.TIMESTEP:
            step.append(prt_reader.read_step(sim_file))
        else:
            logger.warning("Unknown file type: %s", sim_file.name)

    monthly_df = handle_duplicate_columns(_pd.concat(monthly, axis=1))
    hourly_df = handle_duplicate_columns(_pd.concat(hourly, axis=1))
    timestep_df = handle_duplicate_columns(_pd.concat(step, axis=1))
    return Simulation(sim_folder, monthly_df, hourly_df, timestep_df)


def process_sim_csv(
    sim_folder: _pl.Path,
) -> Simulation:
    sim_files = utils.get_files([sim_folder], results_folder_name="converted")
    csv_reader = readers.CsvReader()
    hourly = []
    monthly = []
    timestep = []

    for sim_file in sim_files:
        if fm.has_pattern(sim_file.name, fm.FileType.MONTHLY):
            monthly.append(csv_reader.read_csv(sim_file))
        elif fm.has_pattern(sim_file.name, fm.FileType.HOURLY):
            hourly.append(csv_reader.read_csv(sim_file))
        elif fm.has_pattern(sim_file.name, fm.FileType.TIMESTEP):
            timestep.append(csv_reader.read_csv(sim_file))
        else:
            logger.warning("Unknown file type: %s", sim_file.name)

    monthly_df = handle_duplicate_columns(_pd.concat(monthly, axis=1))
    hourly_df = handle_duplicate_columns(_pd.concat(hourly, axis=1))
    timestep_df = handle_duplicate_columns(_pd.concat(timestep, axis=1))

    return Simulation(sim_folder, monthly_df, hourly_df, timestep_df)
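
The pull request's stated goal is an example script for per-simulation interaction. The sketch below is a minimal usage example based only on the functions listed above; the import path is inferred from the file location, and the results folder path is made up for illustration.

    import pathlib as _pl

    from pytrnsys_process.process_sim import process_sim as ps

    # Point at one simulation's results folder (hypothetical path).
    sim_folder = _pl.Path("results/sim-1")

    # Read the printer (.prt) output files, dispatching on file-name patterns,
    # and bundle them into a Simulation object.
    sim = ps.process_sim_prt(sim_folder)

    # Monthly, hourly and timestep data are plain pandas DataFrames.
    print(sim.monthly.columns)
    print(sim.hourly.head())

    # process_sim_csv is analogous but reads CSV files via
    # utils.get_files(..., results_folder_name="converted").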