• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ouranosinc / miranda / 8347055167

19 Mar 2024 04:55PM UTC coverage: 19.019% (+1.7%) from 17.336%
8347055167

Pull #165

github

web-flow
Merge 42b957dc6 into 210fad570
Pull Request #165: Refactor ECCC functionality and create Preprocess module

252 of 1297 new or added lines in 30 files covered. (19.43%)

2 existing lines in 2 files now uncovered.

938 of 4932 relevant lines covered (19.02%)

0.76 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

11.65
/miranda/decode/_decoder.py
1
from __future__ import annotations
4✔
2

3
import logging
4✔
4
import multiprocessing as mp
4✔
5
import os
4✔
6
import re
4✔
7
import warnings
4✔
8
from functools import partial
4✔
9
from logging import config
4✔
10
from os import PathLike
4✔
11
from pathlib import Path
4✔
12
from types import GeneratorType
4✔
13

14
import netCDF4 as nc  # noqa
4✔
15
import pandas as pd
4✔
16
import schema
4✔
17
import xarray as xr
4✔
18
import zarr
4✔
19
from pandas._libs.tslibs import NaTType  # noqa
4✔
20

21
from miranda.convert.utils import date_parser, find_version_hash  # noqa
4✔
22
from miranda.scripting import LOGGING_CONFIG
4✔
23
from miranda.units import get_time_frequency
4✔
24
from miranda.vocabularies.esgf import VALIDATION_ENABLED
4✔
25

26
from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError
4✔
27

28
# Validator symbols are only importable when a pyessv-archive is installed;
# VALIDATION_ENABLED gates every use of them throughout this module.
if VALIDATION_ENABLED:
    from miranda.validators import FACETS_SCHEMA  # noqa
    from miranda.vocabularies.esgf import INSTITUTIONS, PROJECT_MODELS


# Apply the shared project logging configuration at import time.
config.dictConfig(LOGGING_CONFIG)

# Public API of this module.
__all__ = [
    "Decoder",
    "guess_project",
]
39

40

41
def guess_project(file: os.PathLike | str) -> str:
    """Guess the name of the project a dataset file belongs to.

    The file stem is split on underscores and each segment is compared
    against the registered model names of every known project.

    Parameters
    ----------
    file : str or os.PathLike
        Path whose stem encodes the model name.

    Returns
    -------
    str
        Name of the first project whose model list matches a segment.

    Raises
    ------
    DecoderError
        If no project matches, or if validation sources
        (pyessv-archive) are not available.
    """
    file_name = Path(file).stem

    potential_names = file_name.split("_")
    if VALIDATION_ENABLED:
        for project, models in PROJECT_MODELS.items():
            # Generator expression: no need to materialize a list for any().
            if any(model in potential_names for model in models):
                return project
        raise DecoderError(
            f"Unable to determine project from file name: '{file_name}'."
        )
    raise DecoderError("Project determination requires pyessv-archive source files.")
63

64

65
class Decoder:
    """Decodes dataset facets from file names and global attributes, per project."""

    project = None  # Project name selecting the decode_<project> method.
    guess = False  # When True, attempt to guess the project from file names.
    _file_facets = dict()  # NOTE(review): class-level mutable dict — shared across all instances.

    def __init__(self, project: str | None):
        # Instance-level project overrides the class default.
        self.project = project
72

73
    @staticmethod
    def _decoder(
        d: dict,
        fail_early: bool,
        proj: str,
        guess: bool,
        lock: mp.Lock,
        file: str | Path,
    ) -> None:
        """Decode facets for one file and store them in the shared dict `d`.

        Designed for use with ``functools.partial`` + ``multiprocessing.Pool``:
        all arguments except ``file`` are pre-bound by ``Decoder.decode``.

        Parameters
        ----------
        d : dict
            Manager-backed mapping updated in place with ``{file: facets}``.
        fail_early : bool
            Re-raise on project-guess failure and validate decoded facets.
        proj : str
            Project name; when None it is guessed or defaults to "converted".
        guess : bool
            Whether to attempt to guess the project from the file name.
        lock : mp.Lock
            Manager lock held for the whole decode of a single file.
        file : str or Path
            The file to decode.
        """
        if proj is None:
            if guess:
                try:
                    proj = guess_project(file)
                except DecoderError:
                    # NOTE(review): print rather than logging — consistent with the
                    # rest of this method but inconsistent with module-level logging.
                    print(
                        "Unable to determine 'activity': Signature for 'activity' must be set manually for file: "
                        f"{file}."
                    )
                    if fail_early:
                        raise
            else:
                proj = "converted"

        # Dispatch to e.g. Decoder.decode_cmip6 / decode_pcic_candcs_u6 by name.
        decode_function_name = f"decode_{proj.lower().replace('-','_')}"
        try:
            # NOTE(review): the lock spans the whole decode, so worker processes
            # effectively serialize here — presumably to guard shared I/O; confirm.
            with lock:
                _deciphered = getattr(Decoder, decode_function_name)(Path(file))
                if fail_early:
                    if VALIDATION_ENABLED:
                        FACETS_SCHEMA.validate(_deciphered)
                    else:
                        print(
                            "Validation requires pyessv-archive source files. Skipping validation checks."
                        )
                print(
                    f"Deciphered the following from {Path(file).name}:\n"
                    f"{_deciphered.items()}"
                )
                d[file] = _deciphered

        # Missing decode_* method or unimplemented project: report and re-raise.
        except (AttributeError, NotImplementedError):
            print(f"Unable to read data from {Path(file)}. Ensure pathname is correct.")
            raise
        # Schema validation failures are reported but not fatal.
        except schema.SchemaError as e:
            print(f"Decoded facets from {Path(file).name} are not valid: {e}")
118

119
    def decode(
        self,
        files: os.PathLike | str | list[str | os.PathLike] | GeneratorType,
        chunks: int | None = None,
        raise_error: bool = False,
    ) -> None:
        """Decode facets from file or list of files.

        Results are accumulated in ``self._file_facets``.

        Parameters
        ----------
        files : str or Path or list of str or Path or generator
            The files to decode.
        chunks : int, optional
            The chunk size used when processing files. Not to be confused with xarray chunks for dimensions.
        raise_error : bool
            Whether to raise an error if a file cannot be decoded.

        Raises
        ------
        ValueError
            If an empty list of files is supplied.
        """
        if isinstance(files, (str, os.PathLike)):
            files = [files]

        # FIX: previously an explicit `chunks` argument was silently ignored
        # when `files` was a generator (the generator branch was checked before
        # the user-supplied value). Honor `chunks` first.
        if chunks is not None:
            chunk_size = chunks
        elif isinstance(files, list):
            if len(files) >= 10:
                chunk_size = 10
            elif 1 <= len(files) < 10:
                chunk_size = len(files)
            else:
                raise ValueError("No file entries found.")
        elif isinstance(files, GeneratorType):
            # Unknown length: use a reasonable default.
            chunk_size = 10
        else:
            chunk_size = chunks

        if self.project is None:
            warnings.warn(
                "The decoder 'project' is not set; Decoding step will be much slower."
            )
        else:
            logging.info(f"Deciphering metadata with project = '{self.project}'")

        # Manager dict/lock let worker processes share results and serialize I/O.
        with mp.Manager() as manager:
            _file_facets = manager.dict()
            lock = manager.Lock()
            func = partial(
                self._decoder, _file_facets, raise_error, self.project, self.guess, lock
            )

            with mp.Pool() as pool:
                pool.imap(func, files, chunksize=chunk_size)
                pool.close()
                pool.join()

            # Copy results out before the manager shuts down.
            self._file_facets.update(_file_facets)
171

172
    def facets_table(self):
        """Return a tabular summary of decoded facets (not yet implemented)."""
        raise NotImplementedError()
174

175
    def file_facets(self) -> dict[os.PathLike, dict]:
        """Return the mapping of decoded file paths to their facet dictionaries."""
        return self._file_facets
177

178
    @classmethod
    def _from_dataset(cls, file: Path | str) -> tuple[str, str, dict]:
        """Read the primary variable, date segment, and global attributes of a dataset.

        Parameters
        ----------
        file : Path or str
            Path to a netCDF file (".nc"/".nc4") or a zarr store (".zarr").

        Returns
        -------
        tuple[str, str, dict]
            Primary variable name, the trailing date segment of the file
            name, and the dataset's global attributes.

        Raises
        ------
        DecoderError
            If the dataset cannot be opened or is neither netCDF nor zarr.
        """
        # FIX: `file` was used as a Path (.is_file()/.suffix/.name) but the
        # annotation allows str; normalize once up front.
        file = Path(file)
        file_name = file.stem

        try:
            variable_name = cls._decode_primary_variable(file)
        except DecoderError:
            logging.error(f"Unable to open dataset: {file.name}")
            raise

        # File names end with the date range, e.g. "..._19500101-20101231".
        datetimes = file_name.split("_")[-1]

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            with nc.Dataset(file, mode="r") as ds:
                data = {k: getattr(ds, k) for k in ds.ncattrs()}
        elif file.is_dir() and file.suffix == ".zarr":
            with zarr.open(file, mode="r") as ds:
                data = ds.attrs.asdict()
        else:
            raise DecoderError(f"Unable to read dataset: `{file.name}`.")
        return variable_name, datetimes, data
201

202
    @staticmethod
    def _decode_primary_variable(file: Path) -> str:
        """Attempt to find the primary variable of a netCDF file or zarr store.

        A variable is considered "primary" when its name does not start with a
        known coordinate name and also appears in the file's stem.

        Parameters
        ----------
        file : Path
            Path to a ".nc"/".nc4" file or a ".zarr" directory.

        Returns
        -------
        str
            The primary variable name.
            NOTE(review): implicitly returns None when no candidate matches,
            despite the ``-> str`` annotation — callers should be aware.

        Raises
        ------
        DecoderError
            If the underlying reader raises ValueError.
        NotImplementedError
            If the path is neither a netCDF file nor a zarr directory.
        """
        dimsvar_dict = dict()
        # Coordinate-like prefixes that disqualify a variable from being primary.
        coords = (
            "height",
            "lat",
            "latitude",
            "lev",
            "level",
            "lon",
            "longitude",
            "rlat",
            "rlon",
            "rotated_pole",
            "time",
        )
        try:
            if file.is_file() and file.suffix in [".nc", ".nc4"]:
                with nc.Dataset(file, mode="r") as ds:
                    for var_name, var_attrs in ds.variables.items():
                        dimsvar_dict[var_name] = {
                            k: var_attrs.getncattr(k) for k in var_attrs.ncattrs()
                        }
                # str.startswith accepts the tuple of coordinate prefixes.
                for k in dimsvar_dict.keys():
                    if not str(k).startswith(coords) and k in file.stem:
                        return str(k)

            elif file.is_dir() and file.suffix == ".zarr":
                with zarr.open(str(file), mode="r") as ds:
                    for k in ds.array_keys():
                        if not str(k).startswith(coords) and k in file.stem:
                            return str(k)
            else:
                raise NotImplementedError()
        except ValueError:
            raise DecoderError()
248

249
    @staticmethod
4✔
250
    def _decode_hour_of_day_info(
4✔
251
        file: PathLike | str,
252
    ) -> dict:
253
        """Decode hour of day information.
254

255
        Parameters
256
        ----------
257
        file : Path or str
258

259
        Returns
260
        -------
261
        dict
262
        """
263
        if isinstance(file, str):
×
264
            file = Path(file)
×
265

266
        if file.is_file() and file.suffix in [".nc", ".nc4"]:
×
267
            with nc.Dataset(file, mode="r") as ds:
×
268
                if "time" in ds.variables.keys():
×
269
                    hour = nc.num2date(
×
270
                        ds["time"][0], ds["time"].units, ds["time"].calendar
271
                    ).hour
272
                else:
273
                    hour = None
×
274
            return dict(hour_of_day=hour)
×
275

276
        elif file.is_dir() and file.suffix == ".zarr":
×
277
            warnings.warn("This is not currently implemented")
×
278

279
            # with zarr.open(str(file), mode="r") as ds:
280
            #     if "time" in ds.array_keys():
281
            #         pass
282

283
            return dict()
×
284

285
        else:
286
            raise NotImplementedError()
×
287

288
    @staticmethod
    def _decode_time_info(  # noqa: C901
        file: PathLike | str | list[str] | None = None,
        data: dict | None = None,
        term: str | None = None,
        *,
        field: str | None = None,
    ) -> str | NaTType:
        """Decode time information.

        Resolution order: an explicit ``term`` wins; otherwise metadata
        (``data``) and/or the file name are consulted; as a last resort the
        dataset itself is opened and its time axis analysed.

        Parameters
        ----------
        file : os.PathLike or str, optional
        data : dict, optional
        term : str
        field : {"timedelta", "frequency"}

        Returns
        -------
        str or NaTType

        Raises
        ------
        ValueError
            If nothing was passed to parse, or `field` is unsupported.
        NotImplementedError
            If `field` is neither "frequency" nor "timedelta".
        DecoderError
            If the frequency cannot be discerned at all.
        """
        if not file and not data and not term:
            raise ValueError("Nothing passed to parse time info from.")

        # Select the lookup table matching the requested output field.
        if field == "frequency":
            time_dictionary = TIME_UNITS_TO_FREQUENCY
        elif field == "timedelta":
            time_dictionary = TIME_UNITS_TO_TIMEDELTA
        else:
            raise NotImplementedError()

        if term:
            if term in ["fx", "fixed"]:
                if field == "timedelta":
                    return pd.NaT
                return "fx"
            # NOTE(review): this returns a Timedelta even when
            # field == "frequency" — looks like a latent bug; in practice
            # callers only pass `term` together with field="timedelta".
            return pd.to_timedelta(time_dictionary[term])

        if data and not file:
            potential_time = data.get("frequency")
            if not potential_time:
                if hasattr(data, "time"):
                    time_units = data["time"].units
                    potential_time = time_units.split()[0]
                else:
                    # NOTE(review): `file` is None in this branch, so
                    # Path(file) would raise TypeError before logging — bug.
                    logging.warning(
                        f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."
                    )
                    potential_time = "fx"
            if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                logging.warning(f"Found `{potential_time}`. Frequency is likely `fx`.")
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError()

            if field == "timedelta":
                if potential_time in ["fx", "fixed"]:
                    return pd.NaT
                return pd.to_timedelta(time_dictionary[potential_time])
            return time_dictionary[potential_time]

        if file and not data:
            # Try underscore-separated segments first, then dot-separated.
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [
                    segment
                    for segment in file_parts
                    if segment in time_dictionary.keys()
                ]
                if potential_times:
                    if potential_times[0] in ["fx", "fixed"]:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")
                    if field == "timedelta":
                        return pd.to_timedelta(time_dictionary[potential_times[0]])
                    return time_dictionary[potential_times[0]]

        if file and data:
            # Cross-check metadata frequency against the file-name segments.
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [
                    segment
                    for segment in file_parts
                    if segment in time_dictionary.keys()
                ]
                potential_time = data.get("frequency", "")
                if potential_time == "":
                    if hasattr(data, "time"):
                        time_units = data["time"].units
                        potential_time = time_units.split()[0]
                    else:
                        logging.warning(
                            f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."
                        )
                        potential_time = "fx"
                if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                    logging.warning(
                        f"Found `{potential_time}`. Frequency is likely `fx`."
                    )
                    if "fx" in file_parts or "fixed" in file_parts:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")

                if potential_time in potential_times:
                    return time_dictionary[potential_time]
                elif potential_times:
                    break

            # Metadata and file name disagree: open the dataset and measure.
            logging.warning(
                f"Frequency from metadata (`{potential_time}`) not found in filename (`{Path(file).name}`): "
                "Performing more rigorous frequency checks."
            )
            if Path(file).is_file() and Path(file).suffix in [".nc", ".nc4"]:
                engine = "netcdf4"
            elif Path(file).is_dir() and Path(file).suffix == ".zarr":
                engine = "zarr"
            else:
                raise DecoderError(
                    f"File is not valid netcdf or zarr: {Path(file).name}"
                )

            _ds = xr.open_dataset(
                file,
                engine=engine,
                drop_variables="time_bnds",
            )
            if not hasattr(_ds, "time"):
                logging.warning(
                    "Dataset does not contain time array. Assuming fixed variable."
                )
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError(f"Field `{field}` not supported.")
            else:
                _, found_freq = get_time_frequency(_ds.time)

            if found_freq in potential_times:
                logging.warning(
                    "Time frequency found in dataset on analysis was found in filename. "
                    f"Metadata for `{Path(file).name} is probably incorrect. "
                    f"Basing fields on `{found_freq}`."
                )
                return time_dictionary[found_freq]
            elif found_freq in ["month", "mon"]:
                # NOTE(review): if no month-like alias appears in the file name
                # this branch falls through and implicitly returns None.
                for f in ["Amon", "Omon", "monC", "monthly", "months", "mon"]:
                    if f in potential_times:
                        logging.warning(
                            "Month-like time frequency found in dataset on analysis was found in filename. "
                            f"Basing fields on `{f}`."
                        )
                        return time_dictionary[f]
            else:
                logging.warning(
                    "Time frequency found in dataset on analysis was not found in filename. "
                    f"Basing fields on `{found_freq}`."
                )
                return time_dictionary[found_freq]
        raise DecoderError(f"Time frequency indiscernible for file `{file}`.")
456

457
    @staticmethod
4✔
458
    def _decode_version(file: PathLike | str, data: dict) -> dict:
4✔
459
        """Decode version information.
460

461
        Parameters
462
        ----------
463
        file : os.PathLike or str
464
        data : dict
465

466
        Returns
467
        -------
468
        dict
469
        """
470
        version_info = dict()
×
471
        try:
×
472
            version_info["version"] = data["version"]
×
473
        except KeyError:
×
474
            possible_version = Path(file).parent
×
475
            if re.match(r"^[vV]\d+", possible_version.name):
×
476
                version_info["version"] = possible_version.name
×
477
            else:
478
                possible_version_signature = possible_version.glob(
×
479
                    f"{Path(file).stem}.v*"
480
                )
481
                for sig in possible_version_signature:
×
482
                    found_version = re.match(r"([vV]\d+)$", sig.suffix)
×
483
                    if found_version:
×
484
                        version_info["version"] = found_version.group()
×
485
                        version_info["sha256sum"] = sig.open().read()
×
486
                        break
×
487
                else:
488
                    version_info["version"] = "vNotFound"
×
489
        return version_info
×
490

491
    @classmethod
    def decode_converted(cls, file: PathLike | str) -> dict:
        """Decode converted data.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the converted dataset (netCDF or zarr).

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(data)
        # FIX: `del facets["history"]` raised KeyError for datasets without a
        # history attribute; drop it only when present.
        facets.pop("history", None)

        facets["date"] = date

        # Prefer explicit metadata; fall back to the file extension.
        file_format = data.get("output_format")
        if file_format:
            facets["format"] = file_format
        elif "format" in data:
            facets["format"] = data["format"]
        elif Path(file).suffix in [".nc", ".nc4"]:
            facets["format"] = "nc"
        elif Path(file).suffix in [".zarr"]:
            facets["format"] = "zarr"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            if "frequency" not in facets:
                # FIX: the decoded frequency was previously stored under
                # "timedelta", so the facets["frequency"] lookup below raised
                # an uncaught KeyError.
                facets["frequency"] = cls._decode_time_info(
                    data=data, file=file, field="frequency"
                )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:
            pass

        return facets
542

543
    @staticmethod
4✔
544
    def decode_eccc_obs(self, file: PathLike | str) -> dict:
4✔
545
        raise NotImplementedError()
×
546

547
    @staticmethod
4✔
548
    def decode_ahccd_obs(self, file: PathLike | str) -> dict:
4✔
549
        raise NotImplementedError()
×
550

551
    @staticmethod
4✔
552
    def decode_melcc_obs(self, file: PathLike | str) -> dict:
4✔
553
        raise NotImplementedError()
×
554

555
    @classmethod
4✔
556
    def decode_pcic_candcs_u6(cls, file: PathLike | str) -> dict:
4✔
557
        if "Derived" in Path(file).parents:
×
558
            raise NotImplementedError("Derived CanDCS-U6 variables are not supported.")
×
559

560
        facets = dict()
×
561
        try:
×
562
            variable, date, data = cls._from_dataset(file=file)
×
563
        except DecoderError:
×
564
            return facets
×
565

566
        facets["activity"] = data["activity_id"]
×
567
        facets["mip_era"] = data["project_id"]
×
568
        facets["bias_adjust_institution"] = "PCIC"
×
569
        facets["date"] = date
×
570
        facets["domain"] = data["domain"]
×
571
        facets["experiment"] = str(data["GCM__experiment_id"]).replace(",", "-")
×
572
        facets["format"] = "netcdf"
×
573
        facets["institution"] = data["GCM__institution_id"]
×
574
        facets["member"] = (
×
575
            f"r{data['GCM__realization_index']}"
576
            f"i{data['GCM__initialization_index']}"
577
            f"p{data['GCM__physics_index']}"
578
            f"f{data['GCM__forcing_index']}"
579
        )
580
        facets["processing_level"] = "biasadjusted"
×
581
        facets["bias_adjust_project"] = "CanDCS-U6"
×
582
        facets["source"] = data["GCM__source_id"]
×
583
        facets["type"] = "simulation"
×
584
        facets["variable"] = variable
×
585

586
        facets["version"] = f"v{data.get('GCM__data_specs_version')}"
×
587
        if facets["version"] is None:
×
588
            facets.update(find_version_hash(file=file))
×
589

590
        facets.update(cls._decode_hour_of_day_info(file=file))
×
591

592
        try:
×
593
            facets["frequency"] = cls._decode_time_info(
×
594
                data=data, file=file, field="frequency"
595
            )
596
            facets["timedelta"] = cls._decode_time_info(
×
597
                term=facets["frequency"], field="timedelta"
598
            )
599
            facets["date_start"] = date_parser(date)
×
600
            facets["date_end"] = date_parser(date, end_of_period=True)
×
601
        except DecoderError:
×
602
            pass
×
603

604
        return facets
×
605

606
    @classmethod
    def decode_cmip6(cls, file: PathLike | str) -> dict:
        """Extract facet metadata from a CMIP6 dataset file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset.

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: nothing to report.
            return dict()

        # Fixed values plus direct lookups of CMIP6 global attributes.
        facets = {
            "activity": data["activity_id"],
            "date": date,
            "domain": "global",
            "experiment": data["experiment_id"],
            "format": "netcdf",
            "grid_label": data["grid_label"],
            "institution": data["institution_id"],
            "member": data["variant_label"],
            "modeling_realm": data["realm"],
            "processing_level": "raw",
            "mip_era": data["mip_era"],
            "source": data["source_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        # Time facets are best effort: silently skipped when undecodable.
        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:
            pass

        return facets
644

645
    @classmethod
    def decode_cmip5(cls, file: PathLike | str) -> dict:
        """Extract facet metadata from a CMIP5 dataset file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset.

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: nothing to report.
            return dict()

        # Fixed values plus direct lookups of CMIP5 global attributes.
        facets = {
            "activity": "CMIP",
            "date": date,
            "domain": "global",
            "experiment": data["experiment_id"],
            "format": "netcdf",
            "institution": data["institute_id"],
            "member": data["parent_experiment_rip"],
            "modeling_realm": data["modeling_realm"],
            "processing_level": "raw",
            "mip_era": data["project_id"],
            "source": data["model_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        # Time facets are best effort: silently skipped when undecodable.
        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:
            pass

        return facets
682

683
    @classmethod
    def decode_cordex(cls, file: PathLike | str) -> dict:
        """Extract facet metadata from a CORDEX dataset file.

        Handles several generations of inconsistent CORDEX metadata:
        missing project ids (internal data), AWS-hosted domain mislabels,
        and driving-model strings with or without an institution prefix.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset.

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.

        Raises
        ------
        NotImplementedError
            When no recognizable domain name can be found.
        DecoderError
            When validation is enabled and the driving institution is invalid.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # FIXME: What to do about our internal data that breaks all established conventions?
        facets["activity"] = "CORDEX"

        if data.get("project_id") == "" or data.get("project_id") is None:
            facets["mip_era"] = "internal"
        elif data.get("project_id") == "CORDEX":
            facets["mip_era"] = "CMIP5"

        # Fixed-variable files carry the member string where the date belongs.
        if date == "r0i0p0":
            facets["date"] = "fx"
        else:
            facets["date"] = date

        domain = data.get("CORDEX_domain")
        if domain:
            facets["domain"] = domain.strip()
        else:
            domain = data.get("ouranos_domain_name")
            if domain:
                facets["domain"] = domain.strip()
            else:
                msg = f"File {Path(file).name} has a nonstandard domain name."
                logging.error(msg)
                raise NotImplementedError(msg)

        # CORDEX-NAM on AWS mis-attributes the domain (22/44 should be 22i/44i)
        aws_keys = data.get("intake_esm_dataset_key")
        if aws_keys:
            facets["domain"] = aws_keys.split(".")[3]

        # A regridded-domain token in the title (e.g. "NAM-22i") wins.
        title = data.get("title")
        if title:
            regridded_domain_found = re.search(r"\w{3}-\d{2}i", title)
            if regridded_domain_found:
                facets["domain"] = regridded_domain_found.group()

        # The logic here is awful, but the information is bad to begin with.
        driving_model = ""
        driving_institution = ""

        # Institution prefixes may span 1-3 hyphenated tokens of the model id.
        driving_institution_parts = str(data["driving_model_id"]).split("-")
        if VALIDATION_ENABLED:
            if driving_institution_parts[0] in INSTITUTIONS:
                driving_institution = driving_institution_parts[0]
            elif "-".join(driving_institution_parts[:2]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:2])
            elif "-".join(driving_institution_parts[:3]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:3])
        else:
            logging.warning(
                "CORDEX Metadata validation checks require PyESSV. "
                "Driving institution cannot be determined."
            )
            driving_model = data["driving_model_id"]

        # Known model families missing their institution prefix get it restored.
        if data["driving_model_id"].startswith("GFDL"):
            driving_institution = "NOAA-GFDL"
            driving_model = f"NOAA-GFDL-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("MPI-ESM"):
            driving_institution = "MPI-M"
            driving_model = f"MPI-M-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("HadGEM2"):
            driving_institution = "MOHC"
            driving_model = f"MOHC-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("CNRM-CM5"):
            driving_institution = "CNRM-CERFACS"
            driving_model = f"CNRM-CERFACS-{data['driving_model_id']}"

        elif VALIDATION_ENABLED and not driving_institution:
            raise DecoderError(
                "driving_institution (from driving_model_id: "
                f"`{data['driving_model_id']}`) is not valid."
            )

        facets["driving_institution"] = driving_institution.strip()
        if driving_model:
            facets["driving_model"] = driving_model.strip()
        else:
            facets["driving_model"] = str(data["driving_model_id"]).strip()

        facets["format"] = "netcdf"

        # Normalize Ouranos's abbreviated institute id.
        if data["institute_id"].strip() == "Our.":
            facets["institution"] = "Ouranos"
        else:
            facets["institution"] = data["institute_id"].strip()

        facets["processing_level"] = "raw"
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        # Time facets are best effort: silently skipped when undecodable.
        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:
            pass

        try:
            facets["experiment"] = data["experiment_id"].strip()
        except KeyError:
            facets["experiment"] = data["driving_experiment_name"].strip()

        try:
            # NOTE(review): this loop breaks after the first key unless its
            # value is "N/A", so "parent_experiment" is only consulted via
            # the KeyError fallback — confirm this is intended.
            for potential_member in ["parent_experiment_rip", "parent_experiment"]:
                facets["member"] = data.get(potential_member)
                if facets["member"] == "N/A":
                    raise KeyError()
                else:
                    break
            if facets["member"] is None:
                raise KeyError()
        except KeyError:
            facets["member"] = data["driving_model_ensemble_member"].strip()

        return facets
816

817
    @classmethod
    def decode_isimip_ft(cls, file: PathLike | str) -> dict:
        """Decode facets from an ISIMIP-FT dataset file.

        Parameters
        ----------
        file : str or os.PathLike
            Path to the dataset to examine.

        Returns
        -------
        dict
            The decoded facets; empty if the file's metadata cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # File could not be opened or lacks usable metadata.
            return facets

        facets["activity"] = "ISIMIP"
        facets["mip_era"] = data["project_id"]
        facets["date"] = date
        facets["domain"] = "global"
        facets["co2_forcing_id"] = data["co2_forcing_id"]
        facets["experiment"] = data["experiment_id"]
        facets["format"] = "netcdf"
        facets["impact_model"] = data["impact_model_id"]
        facets["institution"] = data["institute_id"]
        facets["member"] = data["driving_model_ensemble_member"]
        facets["modeling_realm"] = data["modeling_realm"]
        facets["social_forcing_id"] = data["social_forcing_id"]
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            # Pass `file` as well so frequency detection can fall back on the
            # file name, consistent with the other decoders in this class.
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:
            # Time information is optional; leave those facets unset.
            pass

        return facets
855

856
    @classmethod
    def decode_nex_gddp_cmip6(cls, file: PathLike | str) -> dict:
        """Decode facets from a NEX-GDDP-CMIP6 dataset file.

        Parameters
        ----------
        file : str or os.PathLike
            Path to the dataset to examine.

        Returns
        -------
        dict
            The decoded facets; empty if the file's metadata cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        scenario = data["scenario"]
        facets.update(
            {
                "experiment": scenario,
                # The historical run belongs to CMIP; all others to ScenarioMIP.
                "activity": "CMIP" if scenario == "historical" else "ScenarioMIP",
                "institution": data["cmip6_institution_id"],
                "member": data["variant_label"],
                "processing_level": "biasadjusted",
                "bias_adjust_project": "NEX-GDDP-CMIP6",
                "bias_adjust_institution": "NASA",
                "mip_era": "CMIP6",
                "source": data["cmip6_source_id"],
                "type": "simulation",
                "variable": variable,
            }
        )
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:
            # Time facets are best-effort; skip them when undecodable.
            pass

        return facets
893

894
    @classmethod
    def decode_espo_g6_r2(cls, file: PathLike | str) -> dict:
        """Decode facets from an ESPO-G6-R2 dataset file.

        Parameters
        ----------
        file : str or os.PathLike
            Path to the dataset to examine.

        Returns
        -------
        dict
            The decoded facets; empty if the file's metadata cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        # Fixed facets for this bias-adjustment project.
        facets["bias_adjust_project"] = "ESPO-G6-R2"
        facets["processing_level"] = "biasadjusted"
        facets["version"] = "1.0.0"
        facets["domain"] = "NAM"

        # These facets are carried verbatim in the dataset's `cat:` attributes.
        catalogue_fields = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        for field in catalogue_fields:
            facets[field] = data[f"cat:{field}"]
        facets["variable"] = variable
        # `_decode_version` is intentionally skipped; the version is pinned above.
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:
            # Time facets are best-effort; skip them when undecodable.
            pass

        return facets
934

935
    @classmethod
    def decode_espo_g6_e5l(cls, file: PathLike | str) -> dict:
        """Decode facets from an ESPO-G6-E5L dataset file.

        Parameters
        ----------
        file : str or os.PathLike
            Path to the dataset to examine.

        Returns
        -------
        dict
            The decoded facets; empty if the file's metadata cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        # Fixed facets for this bias-adjustment project.
        facets["bias_adjust_project"] = "ESPO-G6-E5L"
        facets["processing_level"] = "biasadjusted"
        facets["version"] = "1.0.0"
        facets["domain"] = "NAM"

        # These facets are carried verbatim in the dataset's `cat:` attributes.
        catalogue_fields = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        for field in catalogue_fields:
            facets[field] = data[f"cat:{field}"]
        facets["variable"] = variable
        # `_decode_version` is intentionally skipped; the version is pinned above.
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:
            # Time facets are best-effort; skip them when undecodable.
            pass

        return facets
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc