Ouranosinc / miranda / build 2116451580 (pending completion)

GitHub Pull Request #24: Add CMIP file structure, use pyessv controlled vocabularies, and major refactoring
Merge 96da2e4c6 into bf78f91b7

234 of 1073 new or added lines in 35 files covered. (21.81%)

13 existing lines in 4 files now uncovered.

728 of 3217 relevant lines covered (22.63%)

0.68 hits per line

Source File: /miranda/eccc/_summaries.py (15.24%)

######################################################################
# G. Rondeau-Genesse, Ouranos, 2019-09-27
#
# Description
#
# extract_daily_summaries finds all the CSV files of an ECCC daily weather
# station, then appends the data to a single pandas DataFrame.
#
# daily_summaries_to_netcdf takes that DataFrame and exports it to a netCDF
# file. When possible, the variables are converted to be compatible with the
# CF Conventions. For example, "Max Temp (°C)" is renamed "tasmax" and
# converted to K.
######################################################################
import json
import logging
from logging import config
from pathlib import Path
from typing import Generator, List, Tuple, Union

import numpy as np
import pandas as pd
import xarray as xr

from miranda.scripting import LOGGING_CONFIG

config.dictConfig(LOGGING_CONFIG)
__all__ = ["extract_daily_summaries", "daily_summaries_to_netcdf"]

# Controlled-vocabulary attributes for each output variable, keyed by CF variable name.
eccc_metadata = json.load(
    open(Path(__file__).parent / "eccc_obs_summary_cf_attrs.json")
)["variable_entry"]
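
# For orientation, a hypothetical sketch of a single "variable_entry" item in
# eccc_obs_summary_cf_attrs.json, inferred from the keys read further down
# (the real file may differ in names and values):
#
#   "tasmax": {
#       "original_field": "Max Temp (°C)",
#       "add_offset": 273.15,
#       "scale_factor": 1,
#       "standard_name": "air_temperature",
#       "long_name": "Daily Maximum Near-Surface Air Temperature",
#       "units": "K",
#       "grid_mapping": "regular_lon_lat",
#       "comments": "",
#       "frequency": "day",
#   }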


# Searches a location for the station data, then reads and assembles the data using pandas.
def extract_daily_summaries(
    path_station: Union[Path, str], rm_flags: bool = False, file_suffix: str = ".csv"
) -> dict:
    """Find and read the daily summary CSV files of an ECCC weather station.

    Parameters
    ----------
    path_station : Union[Path, str]
      PathLike or str to the station's folder containing the csv files.
    rm_flags : bool
      Removes the 'Flag' and 'Quality' columns of the ECCC files.
    file_suffix : str
      File suffixes used by the tabular data. Default: ".csv".

    Returns
    -------
    dict
      dict containing the station metadata, as well as the data stored within a pandas DataFrame.
    """

    # Find the CSV files
    if "*" not in file_suffix:
        file_suffix = f"*{file_suffix}"
    station_files = Path(path_station).rglob(file_suffix)

    # extract the .csv data
    station = _read_multiple_daily_summaries(station_files, rm_flags=rm_flags)

    return station
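
# Example of the intended call pattern (a minimal sketch; the folder path is hypothetical):
#
#   station = extract_daily_summaries("/data/eccc/daily/some_station", rm_flags=True)
#   station["name"], station["latitude"], station["data"].shape
#
# "station" then holds the metadata parsed from the CSV headers plus a "data"
# DataFrame with the concatenated daily records.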


# Uses xarray to transform the 'station' dict from extract_daily_summaries into a CF-compliant netCDF file.
def daily_summaries_to_netcdf(station: dict, path_output: Union[Path, str]) -> None:
    """Write the station data to a CF-compliant netCDF file.

    Parameters
    ----------
    station : dict
      dict created by using extract_daily_summaries.
    path_output : Union[Path, str]
      Folder in which the netCDF file will be written.

    Returns
    -------
    None
    """
    # first, transform the Date/Time to a 'days since 1950-01-01' format
    time = station["data"]["Date/Time"] - np.array(
        "1950-01-01T00:00", dtype="datetime64"
    )
    time = time.astype("timedelta64[s]").astype(float) / 86400
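
    # Worked example of the conversion above: "1950-01-03T00:00" minus the
    # 1950-01-01 epoch is a timedelta of 172800 s, and 172800 / 86400 = 2.0,
    # i.e. 2.0 days, matching the "days since 1950-01-01 00:00:00" units set
    # on the time coordinate below.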

    # we use expand_dims twice to 'add' longitude and latitude dimensions to the station data
    logging.info(
        "Reading data for station {} (ID: {}) now.".format(
            station["name"], station["ID"]
        )
    )

    ds = None

    # eccc_metadata was already narrowed to the "variable_entry" mapping at import time.
    variables = eccc_metadata
    for var in variables.keys():
        original_field = variables[var]["original_field"]
        add_offset = variables[var]["add_offset"]
        scale_factor = variables[var]["scale_factor"]

        da = xr.DataArray(
            np.expand_dims(
                np.expand_dims(
                    station["data"][original_field] * scale_factor + add_offset, axis=1
                ),
                axis=2,
            ),
            [
                ("time", time),
                ("lat", [station["latitude"]]),
                ("lon", [station["longitude"]]),
            ],
        )

        da.name = var
        for field in [
            "standard_name",
            "long_name",
            "units",
            "grid_mapping",
            "comments",
            "frequency",
        ]:
            da.attrs[field] = variables[var][field]

        # for the first variable, we simply create a dataset from it
        if ds is None:
            ds = da.to_dataset()
        else:
            ds[var] = da
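
    # At this point every requested variable has been reshaped from a series of
    # length N to an array of shape (N, 1, 1), so it carries the
    # (time, lat, lon) dimensions of a single-station grid, and "ds" holds one
    # such DataArray per controlled-vocabulary entry.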

    # add attributes to lon, lat, time, elevation, and the grid
    # TODO: There is probably a better CF Convention for point-based data
    da = xr.DataArray(np.full(len(time), np.nan), [("time", time)])
    da.name = "regular_lon_lat"
    da.attrs["grid_mapping_name"] = "lonlat"
    ds["regular_lon_lat"] = da

    da = xr.DataArray(
        np.expand_dims(np.expand_dims(station["elevation"], axis=1), axis=2),
        [("lat", [station["latitude"]]), ("lon", [station["longitude"]])],
    )
    da.name = "elevation"
    da.attrs["standard_name"] = "elevation"
    da.attrs["long_name"] = "elevation"
    da.attrs["units"] = "m"
    da.attrs["axis"] = "Z"
    ds["elevation"] = da
    ds = ds.set_coords("elevation")

    ds.lon.attrs["standard_name"] = "longitude"
    ds.lon.attrs["long_name"] = "longitude"
    ds.lon.attrs["units"] = "degrees_east"
    ds.lon.attrs["axis"] = "X"

    ds.lat.attrs["standard_name"] = "latitude"
    ds.lat.attrs["long_name"] = "latitude"
    ds.lat.attrs["units"] = "degrees_north"
    ds.lat.attrs["axis"] = "Y"

    ds.time.attrs["standard_name"] = "time"
    ds.time.attrs["long_name"] = "time"
    ds.time.attrs["units"] = "days since 1950-01-01 00:00:00"
    ds.time.attrs["axis"] = "T"
    ds.time.attrs["calendar"] = "gregorian"

    # add global attributes
    ds.attrs["Station Name"] = station["name"]
    ds.attrs["Province"] = station["province"]
    ds.attrs["Climate Identifier"] = station["ID"]
    ds.attrs["WMO Identifier"] = station["WMO_ID"]
    ds.attrs["TC Identifier"] = station["TC_ID"]
    ds.attrs["Institution"] = "Environment and Climate Change Canada"

    # save the data
    output_file = Path(path_output).joinpath("{}.nc".format(ds.attrs["Station Name"]))
    ds.to_netcdf(output_file)
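
# For reference, a sketch of what the resulting dataset contains (not verbatim
# ncdump output):
#   dimensions:   time, lat (size 1), lon (size 1)
#   coordinates:  time ("days since 1950-01-01 00:00:00"), lat, lon, elevation
#   data vars:    one array per controlled-vocabulary entry, plus the
#                 "regular_lon_lat" grid-mapping placeholder
#   global attrs: Station Name, Province, Climate Identifier, WMO Identifier,
#                 TC Identifier, Institution
# The file is written as "<Station Name>.nc" inside path_output.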


##########################################
# BELOW THIS POINT ARE UTILITY SCRIPTS
##########################################


# This calls _read_single_daily_summaries and appends the data into a single dict
def _read_multiple_daily_summaries(
    files: Union[List[Union[str, Path]], Generator[Path, None, None]],
    rm_flags: bool = False,
) -> dict:
    """Read and concatenate the daily summary files of a single station.

    Parameters
    ----------
    files : Union[List[Union[str, Path]], Generator[Path, None, None]]
      A list of all the files to append.
    rm_flags : bool
      Removes all the 'Flag' and 'Quality' columns of the ECCC files. Default: False.

    Returns
    -------
    dict
    """

    # Extract the data for each file
    station_meta = None
    datafull = None

    file_list = [Path(f) for f in files]
    file_list.sort()

    for f in file_list:
        station_meta, data = _read_single_daily_summaries(f)
        if datafull is None:
            datafull = data
        else:
            # NOTE: DataFrame.append was removed in pandas 2.0;
            # pd.concat([datafull, data], ignore_index=True) is the equivalent call there.
            datafull = datafull.append(data, ignore_index=True)

    # change the Date/Time column to a datetime64 type
    datafull["Date/Time"] = pd.to_datetime(datafull["Date/Time"])

    # if wanted, remove the quality and flag columns
    if rm_flags:
        index_quality = [
            i for i, s in enumerate(datafull.columns.values) if "Quality" in s
        ]
        datafull = datafull.drop(datafull.columns.values[index_quality], axis="columns")
        index_flag = [i for i, s in enumerate(datafull.columns.values) if "Flag" in s]
        datafull = datafull.drop(datafull.columns.values[index_flag], axis="columns")

    # combine everything in a single dict
    station = station_meta
    station["data"] = datafull

    return station


# This is the script that actually reads the CSV files.
# The metadata are saved in a dict, while the data is returned as a pandas DataFrame.
# FIXME: Climate Services Canada has changed the way they store metadata -- no longer in the CSV heading
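# For context, the legacy header layout this parser expects looks roughly like
# the hypothetical excerpt below (one quoted key/value pair per line, followed
# by the "Date/Time" row where the data table begins):
#
#   "Station Name","SOME STATION"
#   "Province","QUEBEC"
#   "Latitude","45.50"
#   "Longitude","-73.57"
#   "Elevation","36.0"
#   "Climate Identifier","1234567"
#   "WMO Identifier","12345"
#   "TC Identifier","XYZ"
#
#   "Date/Time","Year","Month","Day",...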
def _read_single_daily_summaries(file: Union[Path, str]) -> Tuple[dict, pd.DataFrame]:
    """Read a single daily summary CSV file.

    Parameters
    ----------
    file : Union[Path, str]
      Path to the CSV file to read.

    Returns
    -------
    Tuple[dict, pd.DataFrame]
    """
    file = Path(file)  # accept str as well; ".name" is used in the warning below

    # Read the whole file
    with open(file, encoding="utf-8-sig") as fi:
        lines = fi.readlines()

    # Find each element of the header
    search_header = [0] * 9
    search_header[0] = [i for i, s in enumerate(lines) if "Station Name" in s][0]
    search_header[1] = [i for i, s in enumerate(lines) if "Province" in s][0]
    search_header[2] = [i for i, s in enumerate(lines) if "Latitude" in s][0]
    search_header[3] = [i for i, s in enumerate(lines) if "Longitude" in s][0]
    search_header[4] = [i for i, s in enumerate(lines) if "Elevation" in s][0]
    search_header[5] = [i for i, s in enumerate(lines) if "Climate Identifier" in s][0]
    search_header[6] = [i for i, s in enumerate(lines) if "WMO Identifier" in s][0]
    search_header[7] = [i for i, s in enumerate(lines) if "TC Identifier" in s][0]
    search_header[8] = [i for i, s in enumerate(lines) if "Date/Time" in s][
        0
    ]  # This is where the data actually starts

    # Find the right header line for each field, then clean up the string.
    station_meta = {
        "name": lines[search_header[0]]
        .split(",")[1]
        .replace('"', "")
        .replace("\n", ""),
        "province": lines[search_header[1]]
        .split(",")[1]
        .replace('"', "")
        .replace("\n", ""),
        "latitude": float(
            lines[search_header[2]].split(",")[1].replace('"', "").replace("\n", "")
        ),
        "longitude": float(
            lines[search_header[3]].split(",")[1].replace('"', "").replace("\n", "")
        ),
        "elevation": float(
            lines[search_header[4]].split(",")[1].replace('"', "").replace("\n", "")
        ),
        "ID": lines[search_header[5]].split(",")[1].replace('"', "").replace("\n", ""),
        "WMO_ID": lines[search_header[6]]
        .split(",")[1]
        .replace('"', "")
        .replace("\n", ""),
        "TC_ID": lines[search_header[7]]
        .split(",")[1]
        .replace('"', "")
        .replace("\n", ""),
    }

    data = pd.read_csv(file, header=search_header[8] - 2)
    # Make sure that the data starts on January 1st
    if data.values[0, 2] != 1 or data.values[0, 3] != 1:
        logging.warning(
            "Data for file {} is not starting on January 1st. Make sure this is what you want!".format(
                file.name
            )
        )

    return station_meta, data
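
Putting the two public functions together, here is a minimal end-to-end sketch. The import path follows the module location shown above, and the folder paths are hypothetical placeholders:

    from pathlib import Path

    from miranda.eccc._summaries import (
        daily_summaries_to_netcdf,
        extract_daily_summaries,
    )

    # Hypothetical locations; any folder of ECCC daily summary CSV files will do.
    station_folder = Path("/data/eccc/daily/some_station")
    output_folder = Path("/data/netcdf")

    # Read every *.csv under the station folder and merge the records into one dict.
    station = extract_daily_summaries(station_folder, rm_flags=True)

    # Write the merged table to "<Station Name>.nc" inside the output folder.
    daily_summaries_to_netcdf(station, output_folder)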