Ouranosinc / miranda / build 2116451580 (pending completion)

GitHub Pull Request #24: Add CMIP file structure, use pyessv controlled vocabularies, and major refactoring
Merge 96da2e4c6 into bf78f91b7

234 of 1073 new or added lines in 35 files covered. (21.81%)

13 existing lines in 4 files now uncovered.

728 of 3217 relevant lines covered (22.63%)

0.68 hits per line

Source File: /miranda/eccc/_summaries.py (15.24%)

######################################################################
# G. Rondeau-Genesse, Ouranos, 2019-09-27
#
# Description
#
# extract_daily_summaries finds all the CSV files of an ECCC daily weather
# station, then appends the data to a single pandas DataFrame.
#
# daily_summaries_to_netcdf takes that DataFrame and exports it to a netCDF
# file. When possible, the variables are converted to be compatible with the
# CF Conventions. For example, "Max Temp (°C)" is renamed "tasmax" and
# converted to K.
######################################################################
import json
import logging
from logging import config
from pathlib import Path
from typing import Generator, List, Tuple, Union

import numpy as np
import pandas as pd
import xarray as xr

from miranda.scripting import LOGGING_CONFIG

config.dictConfig(LOGGING_CONFIG)
__all__ = ["extract_daily_summaries", "daily_summaries_to_netcdf"]

# Controlled-vocabulary attributes for each output variable, keyed by CF variable name.
eccc_metadata = json.load(
    open(Path(__file__).parent / "eccc_obs_summary_cf_attrs.json")
)["variable_entry"]
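
# For orientation, a hypothetical sketch of a single "variable_entry" item in
# eccc_obs_summary_cf_attrs.json, inferred from the keys read further down
# (the real file may differ in names and values):
#
#   "tasmax": {
#       "original_field": "Max Temp (°C)",
#       "add_offset": 273.15,
#       "scale_factor": 1,
#       "standard_name": "air_temperature",
#       "long_name": "Daily Maximum Near-Surface Air Temperature",
#       "units": "K",
#       "grid_mapping": "regular_lon_lat",
#       "comments": "",
#       "frequency": "day",
#   }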


# Searches a location for the station data, then reads and assembles the data using pandas.
def extract_daily_summaries(
    path_station: Union[Path, str], rm_flags: bool = False, file_suffix: str = ".csv"
) -> dict:
    """Find and read the daily summary CSV files of an ECCC weather station.

    Parameters
    ----------
    path_station : Union[Path, str]
      PathLike or str to the station's folder containing the csv files.
    rm_flags : bool
      Removes the 'Flag' and 'Quality' columns of the ECCC files.
    file_suffix : str
      File suffixes used by the tabular data. Default: ".csv".

    Returns
    -------
    dict
      dict containing the station metadata, as well as the data stored within a pandas DataFrame.
    """

    # Find the CSV files
    if "*" not in file_suffix:
        file_suffix = f"*{file_suffix}"
    station_files = Path(path_station).rglob(file_suffix)

    # extract the .csv data
    station = _read_multiple_daily_summaries(station_files, rm_flags=rm_flags)

    return station
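
# Example of the intended call pattern (a minimal sketch; the folder path is hypothetical):
#
#   station = extract_daily_summaries("/data/eccc/daily/some_station", rm_flags=True)
#   station["name"], station["latitude"], station["data"].shape
#
# "station" then holds the metadata parsed from the CSV headers plus a "data"
# DataFrame with the concatenated daily records.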


# Uses xarray to transform the 'station' dict from extract_daily_summaries into a CF-compliant netCDF file.
def daily_summaries_to_netcdf(station: dict, path_output: Union[Path, str]) -> None:
    """Write the station data to a CF-compliant netCDF file.

    Parameters
    ----------
    station : dict
      dict created by using extract_daily_summaries.
    path_output : Union[Path, str]
      Folder in which the netCDF file will be written.

    Returns
    -------
    None
    """
    # first, transform the Date/Time to a 'days since 1950-01-01' format
    time = station["data"]["Date/Time"] - np.array(
        "1950-01-01T00:00", dtype="datetime64"
    )
    time = time.astype("timedelta64[s]").astype(float) / 86400
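
    # Worked example of the conversion above: "1950-01-03T00:00" minus the
    # 1950-01-01 epoch is a timedelta of 172800 s, and 172800 / 86400 = 2.0,
    # i.e. 2.0 days, matching the "days since 1950-01-01 00:00:00" units set
    # on the time coordinate below.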

    # we use expand_dims twice to 'add' longitude and latitude dimensions to the station data
    logging.info(
        "Reading data for station {} (ID: {}) now.".format(
            station["name"], station["ID"]
        )
    )

    ds = None

    # eccc_metadata was already narrowed to the "variable_entry" mapping at import time.
    variables = eccc_metadata
    for var in variables.keys():
        original_field = variables[var]["original_field"]
        add_offset = variables[var]["add_offset"]
        scale_factor = variables[var]["scale_factor"]

        da = xr.DataArray(
            np.expand_dims(
                np.expand_dims(
                    station["data"][original_field] * scale_factor + add_offset, axis=1
                ),
                axis=2,
            ),
            [
                ("time", time),
                ("lat", [station["latitude"]]),
                ("lon", [station["longitude"]]),
            ],
        )

        da.name = var
        for field in [
            "standard_name",
            "long_name",
            "units",
            "grid_mapping",
            "comments",
            "frequency",
        ]:
            da.attrs[field] = variables[var][field]

        # for the first variable, we simply create a dataset from it
        if ds is None:
            ds = da.to_dataset()
        else:
            ds[var] = da
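
    # At this point every requested variable has been reshaped from a series of
    # length N to an array of shape (N, 1, 1), so it carries the
    # (time, lat, lon) dimensions of a single-station grid, and "ds" holds one
    # such DataArray per controlled-vocabulary entry.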

    # add attributes to lon, lat, time, elevation, and the grid
    # TODO: There is probably a better CF Convention for point-based data
    da = xr.DataArray(np.full(len(time), np.nan), [("time", time)])
    da.name = "regular_lon_lat"
    da.attrs["grid_mapping_name"] = "lonlat"
    ds["regular_lon_lat"] = da

    da = xr.DataArray(
        np.expand_dims(np.expand_dims(station["elevation"], axis=1), axis=2),
        [("lat", [station["latitude"]]), ("lon", [station["longitude"]])],
    )
    da.name = "elevation"
    da.attrs["standard_name"] = "elevation"
    da.attrs["long_name"] = "elevation"
    da.attrs["units"] = "m"
    da.attrs["axis"] = "Z"
    ds["elevation"] = da
    ds = ds.set_coords("elevation")

    ds.lon.attrs["standard_name"] = "longitude"
    ds.lon.attrs["long_name"] = "longitude"
    ds.lon.attrs["units"] = "degrees_east"
    ds.lon.attrs["axis"] = "X"

    ds.lat.attrs["standard_name"] = "latitude"
    ds.lat.attrs["long_name"] = "latitude"
    ds.lat.attrs["units"] = "degrees_north"
    ds.lat.attrs["axis"] = "Y"

    ds.time.attrs["standard_name"] = "time"
    ds.time.attrs["long_name"] = "time"
    ds.time.attrs["units"] = "days since 1950-01-01 00:00:00"
    ds.time.attrs["axis"] = "T"
    ds.time.attrs["calendar"] = "gregorian"

    # add global attributes
    ds.attrs["Station Name"] = station["name"]
    ds.attrs["Province"] = station["province"]
    ds.attrs["Climate Identifier"] = station["ID"]
    ds.attrs["WMO Identifier"] = station["WMO_ID"]
    ds.attrs["TC Identifier"] = station["TC_ID"]
    ds.attrs["Institution"] = "Environment and Climate Change Canada"

    # save the data
    output_file = Path(path_output).joinpath("{}.nc".format(ds.attrs["Station Name"]))
    ds.to_netcdf(output_file)
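
# For reference, a sketch of what the resulting dataset contains (not verbatim
# ncdump output):
#   dimensions:   time, lat (size 1), lon (size 1)
#   coordinates:  time ("days since 1950-01-01 00:00:00"), lat, lon, elevation
#   data vars:    one array per controlled-vocabulary entry, plus the
#                 "regular_lon_lat" grid-mapping placeholder
#   global attrs: Station Name, Province, Climate Identifier, WMO Identifier,
#                 TC Identifier, Institution
# The file is written as "<Station Name>.nc" inside path_output.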


##########################################
# BELOW THIS POINT ARE UTILITY SCRIPTS
##########################################


# This calls _read_single_daily_summaries and appends the data into a single dict
def _read_multiple_daily_summaries(
    files: Union[List[Union[str, Path]], Generator[Path, None, None]],
    rm_flags: bool = False,
) -> dict:
    """Read and concatenate the daily summary files of a single station.

    Parameters
    ----------
    files : Union[List[Union[str, Path]], Generator[Path, None, None]]
      A list of all the files to append.
    rm_flags : bool
      Removes all the 'Flag' and 'Quality' columns of the ECCC files. Default: False.

    Returns
    -------
    dict
    """

    # Extract the data for each file
    station_meta = None
    datafull = None

    file_list = [Path(f) for f in files]
    file_list.sort()

    for f in file_list:
        station_meta, data = _read_single_daily_summaries(f)
        if datafull is None:
            datafull = data
        else:
            # NOTE: DataFrame.append was removed in pandas 2.0;
            # pd.concat([datafull, data], ignore_index=True) is the equivalent call there.
            datafull = datafull.append(data, ignore_index=True)

    # change the Date/Time column to a datetime64 type
    datafull["Date/Time"] = pd.to_datetime(datafull["Date/Time"])

    # if wanted, remove the quality and flag columns
    if rm_flags:
        index_quality = [
            i for i, s in enumerate(datafull.columns.values) if "Quality" in s
        ]
        datafull = datafull.drop(datafull.columns.values[index_quality], axis="columns")
        index_flag = [i for i, s in enumerate(datafull.columns.values) if "Flag" in s]
        datafull = datafull.drop(datafull.columns.values[index_flag], axis="columns")

    # combine everything in a single dict
    station = station_meta
    station["data"] = datafull

    return station


# This is the script that actually reads the CSV files.
# The metadata are saved in a dict, while the data is returned as a pandas DataFrame.
# FIXME: Climate Services Canada has changed the way they store metadata -- no longer in the CSV heading
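# For context, the legacy header layout this parser expects looks roughly like
# the hypothetical excerpt below (one quoted key/value pair per line, followed
# by the "Date/Time" row where the data table begins):
#
#   "Station Name","SOME STATION"
#   "Province","QUEBEC"
#   "Latitude","45.50"
#   "Longitude","-73.57"
#   "Elevation","36.0"
#   "Climate Identifier","1234567"
#   "WMO Identifier","12345"
#   "TC Identifier","XYZ"
#
#   "Date/Time","Year","Month","Day",...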
def _read_single_daily_summaries(file: Union[Path, str]) -> Tuple[dict, pd.DataFrame]:
    """Read a single daily summary CSV file.

    Parameters
    ----------
    file : Union[Path, str]
      Path to the CSV file to read.

    Returns
    -------
    Tuple[dict, pd.DataFrame]
    """
    file = Path(file)  # accept str as well; ".name" is used in the warning below

    # Read the whole file
    with open(file, encoding="utf-8-sig") as fi:
        lines = fi.readlines()

    # Find each element of the header
    search_header = [0] * 9
    search_header[0] = [i for i, s in enumerate(lines) if "Station Name" in s][0]
    search_header[1] = [i for i, s in enumerate(lines) if "Province" in s][0]
    search_header[2] = [i for i, s in enumerate(lines) if "Latitude" in s][0]
    search_header[3] = [i for i, s in enumerate(lines) if "Longitude" in s][0]
    search_header[4] = [i for i, s in enumerate(lines) if "Elevation" in s][0]
    search_header[5] = [i for i, s in enumerate(lines) if "Climate Identifier" in s][0]
    search_header[6] = [i for i, s in enumerate(lines) if "WMO Identifier" in s][0]
    search_header[7] = [i for i, s in enumerate(lines) if "TC Identifier" in s][0]
    search_header[8] = [i for i, s in enumerate(lines) if "Date/Time" in s][
        0
    ]  # This is where the data actually starts

    # Find the right header line for each field, then clean up the string.
    station_meta = {
        "name": lines[search_header[0]]
        .split(",")[1]
        .replace('"', "")
        .replace("\n", ""),
        "province": lines[search_header[1]]
        .split(",")[1]
        .replace('"', "")
        .replace("\n", ""),
        "latitude": float(
            lines[search_header[2]].split(",")[1].replace('"', "").replace("\n", "")
        ),
        "longitude": float(
            lines[search_header[3]].split(",")[1].replace('"', "").replace("\n", "")
        ),
        "elevation": float(
            lines[search_header[4]].split(",")[1].replace('"', "").replace("\n", "")
        ),
        "ID": lines[search_header[5]].split(",")[1].replace('"', "").replace("\n", ""),
        "WMO_ID": lines[search_header[6]]
        .split(",")[1]
        .replace('"', "")
        .replace("\n", ""),
        "TC_ID": lines[search_header[7]]
        .split(",")[1]
        .replace('"', "")
        .replace("\n", ""),
    }

    data = pd.read_csv(file, header=search_header[8] - 2)
    # Make sure that the data starts on January 1st
    if data.values[0, 2] != 1 or data.values[0, 3] != 1:
        logging.warning(
            "Data for file {} is not starting on January 1st. Make sure this is what you want!".format(
                file.name
            )
        )

    return station_meta, data
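
Putting the two public functions together, here is a minimal end-to-end sketch. The import path follows the module location shown above, and the folder paths are hypothetical placeholders:

    from pathlib import Path

    from miranda.eccc._summaries import (
        daily_summaries_to_netcdf,
        extract_daily_summaries,
    )

    # Hypothetical locations; any folder of ECCC daily summary CSV files will do.
    station_folder = Path("/data/eccc/daily/some_station")
    output_folder = Path("/data/netcdf")

    # Read every *.csv under the station folder and merge the records into one dict.
    station = extract_daily_summaries(station_folder, rm_flags=True)

    # Write the merged table to "<Station Name>.nc" inside the output folder.
    daily_summaries_to_netcdf(station, output_folder)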