• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ouranosinc / miranda / 11667553268

04 Nov 2024 03:38PM UTC coverage: 18.382%. Remained the same
11667553268

Pull #201

github

web-flow
Merge ce5e6b9a0 into f270ec6c2
Pull Request #201: Bump tox-gh from 1.3.2 to 1.4.1

920 of 5005 relevant lines covered (18.38%)

1.36 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

11.93
/src/miranda/decode/_decoder.py
1
from __future__ import annotations
8✔
2

3
import logging
8✔
4
import multiprocessing as mp
8✔
5
import os
8✔
6
import re
8✔
7
import warnings
8✔
8
from functools import partial
8✔
9
from logging import config
8✔
10
from os import PathLike
8✔
11
from pathlib import Path
8✔
12
from types import GeneratorType
8✔
13
from typing import Optional
8✔
14

15
import netCDF4 as nc  # noqa
8✔
16
import pandas as pd
8✔
17
import schema
8✔
18
import xarray as xr
8✔
19
import zarr
8✔
20
from pandas._libs.tslibs import NaTType  # noqa
8✔
21

22
from miranda.convert.utils import date_parser, find_version_hash  # noqa
8✔
23
from miranda.cv import VALIDATION_ENABLED
8✔
24
from miranda.scripting import LOGGING_CONFIG
8✔
25
from miranda.units import get_time_frequency
8✔
26

27
from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError
8✔
28

29
if VALIDATION_ENABLED:
8✔
30
    from miranda.cv import INSTITUTIONS, PROJECT_MODELS
4✔
31
    from miranda.validators import FACETS_SCHEMA  # noqa
4✔
32

33

34
# Configure logging from the shared project configuration at import time.
config.dictConfig(LOGGING_CONFIG)

# Public API of this module.
__all__ = [
    "Decoder",
    "guess_project",
]
40

41

42
def guess_project(file: os.PathLike | str) -> str:
    """Guess the name of the project from a file name.

    The file stem is split on underscores and each segment is compared
    against the known model names of each registered project.

    Parameters
    ----------
    file : str or os.PathLike
        Path to the file whose name encodes the project/model facets.

    Returns
    -------
    str
        The name of the matching project.

    Raises
    ------
    DecoderError
        If no project matches, or if the controlled-vocabulary source
        files (pyessv-archive) are unavailable.
    """
    file_name = Path(file).stem

    potential_names = file_name.split("_")
    if VALIDATION_ENABLED:
        for project, models in PROJECT_MODELS.items():
            # Generator expression: `any` short-circuits without building a list.
            if any(model in potential_names for model in models):
                return project
        raise DecoderError(
            f"Unable to determine project from file name: '{file_name}'."
        )
    raise DecoderError("Project determination requires pyessv-archive source files.")
64

65

66
class Decoder:
    # Default project name; set per-instance in ``__init__``.
    project = None
    # Whether to attempt guessing the project from file names when unset.
    guess = False
    # Cache mapping file paths to their decoded facets.
    # NOTE(review): this is a class attribute, so it is shared across all
    # instances until an instance rebinds it — confirm that is intentional.
    _file_facets = dict()
70

71
    def __init__(self, project: str | None):
8✔
72
        self.project = project
×
73

74
    @staticmethod
    def _decoder(
        d: dict,
        fail_early: bool,
        proj: str | None,
        guess: bool,
        lock: mp.Lock,
        file: str | Path,
    ) -> None:
        """Decode the facets of one file and store them in the shared mapping.

        Parameters
        ----------
        d : dict
            Shared (manager) mapping of file -> decoded facets.
        fail_early : bool
            Re-raise on guessing failure and validate decoded facets strictly.
        proj : str, optional
            Project name; when None it may be guessed from the file name.
        guess : bool
            Whether to attempt guessing the project when `proj` is None.
        lock : multiprocessing lock
            Guards the decode/validation critical section.
        file : str or Path
            The file to decode.
        """
        if proj is None:
            if guess:
                try:
                    proj = guess_project(file)
                except DecoderError:
                    print(
                        "Unable to determine 'activity': Signature for 'activity' must be set manually for file: "
                        f"{file}."
                    )
                    if fail_early:
                        raise
                    # Bug fix: previously execution fell through with `proj`
                    # still None and crashed on `proj.lower()` below.
                    return
            else:
                proj = "converted"

        decode_function_name = f"decode_{proj.lower().replace('-', '_')}"
        try:
            with lock:
                # Dispatch to the project-specific decode_* classmethod.
                _deciphered = getattr(Decoder, decode_function_name)(Path(file))
                if fail_early:
                    if VALIDATION_ENABLED:
                        FACETS_SCHEMA.validate(_deciphered)
                    else:
                        print(
                            "Validation requires pyessv-archive source files. Skipping validation checks."
                        )
                print(
                    f"Deciphered the following from {Path(file).name}:\n"
                    f"{_deciphered.items()}"
                )
                d[file] = _deciphered

        except (AttributeError, NotImplementedError):
            # Unknown project decoder or unimplemented decoder: surface it.
            print(f"Unable to read data from {Path(file)}. Ensure pathname is correct.")
            raise
        except schema.SchemaError as e:
            print(f"Decoded facets from {Path(file).name} are not valid: {e}")
119

120
    def decode(
        self,
        files: os.PathLike | str | list[str | os.PathLike] | GeneratorType,
        chunks: int | None = None,
        raise_error: bool = False,
    ) -> None:
        """Decode facets from file or list of files.

        Results are accumulated into ``self._file_facets``.

        Parameters
        ----------
        files : str or Path or list of str or Path or generator
            The files to decode.
        chunks : int, optional
            The chunk size used when processing files. Not to be confused with xarray chunks for dimensions.
        raise_error : bool
            Whether to raise an error if a file cannot be decoded.
        """
        # Normalise a single path into a one-element list.
        if isinstance(files, (str, os.PathLike)):
            files = [files]

        # Pick a multiprocessing chunk size when none was requested.
        if chunks is None and isinstance(files, list):
            if len(files) >= 10:
                chunk_size = 10
            elif 1 <= len(files) < 10:
                chunk_size = len(files)
            else:
                raise ValueError("No file entries found.")
        elif isinstance(files, GeneratorType):
            # Generators have no length; fall back to a fixed chunk size.
            chunk_size = 10
        else:
            # NOTE(review): if `files` is some other iterable and `chunks`
            # is None, chunk_size ends up None here — confirm callers only
            # pass str/PathLike/list/generator.
            chunk_size = chunks

        if self.project is None:
            warnings.warn(
                "The decoder 'project' is not set; Decoding step will be much slower."
            )
        else:
            msg = f"Deciphering metadata with project = '{self.project}'"
            logging.info(msg)

        with mp.Manager() as manager:
            # Manager-backed dict/lock can be shared across worker processes.
            _file_facets = manager.dict()
            lock = manager.Lock()
            func = partial(
                self._decoder, _file_facets, raise_error, self.project, self.guess, lock
            )

            with mp.Pool() as pool:
                # imap dispatches the work; close/join wait for completion.
                pool.imap(func, files, chunksize=chunk_size)
                pool.close()
                pool.join()

            # Copy results out before the manager shuts down.
            self._file_facets.update(_file_facets)
173

174
    def facets_table(self):
8✔
175
        raise NotImplementedError()
×
176

177
    def file_facets(self) -> dict[os.PathLike, dict]:
8✔
178
        return self._file_facets
×
179

180
    @classmethod
    def _from_dataset(cls, file: Path | str) -> (str, str, dict):
        """Extract the primary variable, date segment, and global attributes.

        Parameters
        ----------
        file : Path or str
            Path to a netCDF file or zarr store.

        Returns
        -------
        tuple of (str, str, dict)
            The primary variable name, the trailing (underscore-delimited)
            date segment of the file name, and the dataset's global attributes.

        Raises
        ------
        DecoderError
            If the dataset cannot be opened or is neither netCDF nor zarr.
        """
        # Bug fix: `file` may be a str, but `.is_file()`, `.suffix`, and
        # `.name` below require a Path object.
        file = Path(file)
        file_name = file.stem

        try:
            variable_name = cls._decode_primary_variable(file)
        except DecoderError:
            msg = f"Unable to open dataset: {file.name}"
            logging.error(msg)
            raise

        # By convention, the final underscore-delimited segment is the date range.
        datetimes = file_name.split("_")[-1]

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            with nc.Dataset(file, mode="r") as ds:
                # Collect all global attributes.
                data = {k: getattr(ds, k) for k in ds.ncattrs()}
        elif file.is_dir() and file.suffix == ".zarr":
            with zarr.open(file, mode="r") as ds:
                data = ds.attrs.asdict()
        else:
            raise DecoderError(f"Unable to read dataset: `{file.name}`.")
        return variable_name, datetimes, data
204

205
    @staticmethod
    def _decode_primary_variable(file: Path) -> str:
        """Attempts to find the primary variable of a netCDF

        The primary variable is the first variable whose name does not
        start with a coordinate-like prefix and that also appears in the
        file name.

        Parameters
        ----------
        file: Path
            Path to a netCDF file or zarr store.

        Returns
        -------
        str
            The primary variable name.
            NOTE(review): if no candidate matches, the function falls
            through and implicitly returns None despite the annotation —
            callers should be prepared for that.
        """
        dimsvar_dict = dict()
        # Coordinate-like name prefixes that cannot be the primary variable.
        coords = (
            "height",
            "lat",
            "latitude",
            "lev",
            "level",
            "lon",
            "longitude",
            "rlat",
            "rlon",
            "rotated_pole",
            "time",
        )
        try:
            if file.is_file() and file.suffix in [".nc", ".nc4"]:
                with nc.Dataset(file, mode="r") as ds:
                    for var_name, var_attrs in ds.variables.items():
                        dimsvar_dict[var_name] = {
                            k: var_attrs.getncattr(k) for k in var_attrs.ncattrs()
                        }
                for k in dimsvar_dict.keys():
                    # `startswith` with a tuple matches any of the prefixes.
                    if not str(k).startswith(coords) and k in file.stem:
                        return str(k)

            elif file.is_dir() and file.suffix == ".zarr":
                with zarr.open(str(file), mode="r") as ds:
                    for k in ds.array_keys():
                        if not str(k).startswith(coords) and k in file.stem:
                            return str(k)
            else:
                raise NotImplementedError()
        except ValueError:
            # Unreadable/corrupt dataset: translate into the module's error type.
            raise DecoderError()
251

252
    @staticmethod
    def _decode_hour_of_day_info(
        file: PathLike | str,
    ) -> dict:
        """Decode hour of day information.

        Reads the first timestamp of the ``time`` variable (when present)
        and reports its hour component.

        Parameters
        ----------
        file : Path or str
            Path to a netCDF file or zarr store.

        Returns
        -------
        dict
            ``{"hour_of_day": <int or None>}`` for netCDF inputs; an empty
            dict for zarr stores (not implemented).
        """
        path = Path(file) if isinstance(file, str) else file

        if path.is_file() and path.suffix in [".nc", ".nc4"]:
            hour = None
            with nc.Dataset(path, mode="r") as ds:
                if "time" in ds.variables.keys():
                    first_stamp = nc.num2date(
                        ds["time"][0], ds["time"].units, ds["time"].calendar
                    )
                    hour = first_stamp.hour
            return dict(hour_of_day=hour)

        if path.is_dir() and path.suffix == ".zarr":
            warnings.warn("This is not currently implemented")

            # with zarr.open(str(file), mode="r") as ds:
            #     if "time" in ds.array_keys():
            #         pass

            return dict()

        raise NotImplementedError()
290

291
    @staticmethod
    def _decode_time_info(  # noqa: C901
        file: PathLike | str | list[str] | None = None,
        data: dict | None = None,
        term: str | None = None,
        *,
        field: str | None = None,
    ) -> str | NaTType:
        """Decode time information.

        Resolution order: an explicit `term` wins; otherwise metadata
        (`data`) and/or the file name are consulted; as a last resort the
        dataset itself is opened and its time axis inspected.

        Parameters
        ----------
        file : os.PathLike or str, optional
        data : dict, optional
        term : str
        field : {"timedelta", "frequency"}

        Returns
        -------
        str or NaTType
        """
        if not file and not data and not term:
            raise ValueError("Nothing passed to parse time info from.")

        # Select the lookup table matching the requested output field.
        if field == "frequency":
            time_dictionary = TIME_UNITS_TO_FREQUENCY
        elif field == "timedelta":
            time_dictionary = TIME_UNITS_TO_TIMEDELTA
        else:
            raise NotImplementedError()

        # Case 1: an explicit term short-circuits all other sources.
        if term:
            if term in ["fx", "fixed"]:
                if field == "timedelta":
                    return pd.NaT
                return "fx"
            return pd.to_timedelta(time_dictionary[term])

        # Case 2: metadata only.
        if data and not file:
            potential_time = data.get("frequency")
            if not potential_time:
                if hasattr(data, "time"):
                    time_units = data["time"].units
                    potential_time = time_units.split()[0]
                else:
                    # NOTE(review): `file` is falsy in this branch, so
                    # Path(file) raises here before the warning is logged —
                    # looks like a bug to fix.
                    msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."

                    logging.warning(msg)
                    potential_time = "fx"
            if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                msg = f"Found `{potential_time}`. Frequency is likely `fx`."
                logging.warning(msg)
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError()

            if field == "timedelta":
                if potential_time in ["fx", "fixed"]:
                    return pd.NaT
                return pd.to_timedelta(time_dictionary[potential_time])
            return time_dictionary[potential_time]

        # Case 3: file name only — look for a known time token in the stem.
        if file and not data:
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [
                    segment
                    for segment in file_parts
                    if segment in time_dictionary.keys()
                ]
                if potential_times:
                    if potential_times[0] in ["fx", "fixed"]:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")
                    if field == "timedelta":
                        return pd.to_timedelta(time_dictionary[potential_times[0]])
                    return time_dictionary[potential_times[0]]

        # Case 4: both metadata and file name — cross-check them.
        if file and data:
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [
                    segment
                    for segment in file_parts
                    if segment in time_dictionary.keys()
                ]
                potential_time = data.get("frequency", "")
                if potential_time == "":
                    if hasattr(data, "time"):
                        time_units = data["time"].units
                        potential_time = time_units.split()[0]
                    else:
                        msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."

                        logging.warning(msg)
                        potential_time = "fx"
                if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                    msg = f"Found `{potential_time}`. Frequency is likely `fx`."

                    logging.warning(msg)
                    if "fx" in file_parts or "fixed" in file_parts:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")

                # Metadata agrees with the file name: trust it.
                if potential_time in potential_times:
                    return time_dictionary[potential_time]
                elif potential_times:
                    break

            # Disagreement: open the dataset and derive frequency from the
            # time axis itself.
            msg = (
                f"Frequency from metadata (`{potential_time}`) not found in filename (`{Path(file).name}`): "
                "Performing more rigorous frequency checks."
            )
            logging.warning(msg)
            if Path(file).is_file() and Path(file).suffix in [".nc", ".nc4"]:
                engine = "netcdf4"
            elif Path(file).is_dir() and Path(file).suffix == ".zarr":
                engine = "zarr"
            else:
                raise DecoderError(
                    f"File is not valid netcdf or zarr: {Path(file).name}"
                )

            _ds = xr.open_dataset(
                file,
                engine=engine,
                drop_variables="time_bnds",
            )
            if not hasattr(_ds, "time"):
                logging.warning(
                    "Dataset does not contain time array. Assuming fixed variable."
                )
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError(f"Field `{field}` not supported.")
            else:
                _, found_freq = get_time_frequency(_ds.time)

            if found_freq in potential_times:
                msg = (
                    "Time frequency found in dataset on analysis was found in filename. "
                    f"Metadata for `{Path(file).name} is probably incorrect. "
                    f"Basing fields on `{found_freq}`."
                )
                logging.warning(msg)
                return time_dictionary[found_freq]
            elif found_freq in ["month", "mon"]:
                # Month-like frequencies can appear under several aliases.
                for f in ["Amon", "Omon", "monC", "monthly", "months", "mon"]:
                    if f in potential_times:
                        msg = f"Month-like time frequency found in dataset on analysis was found in filename. Basing fields on `{f}`."
                        logging.warning(msg)
                        return time_dictionary[f]
            else:
                msg = (
                    "Time frequency found in dataset on analysis was not found in filename. "
                    f"Basing fields on `{found_freq}`."
                )
                logging.warning(msg)
                return time_dictionary[found_freq]
        raise DecoderError(f"Time frequency indiscernible for file `{file}`.")
461

462
    @staticmethod
8✔
463
    def _decode_version(file: PathLike | str, data: dict) -> dict:
8✔
464
        """Decode version information.
465

466
        Parameters
467
        ----------
468
        file : os.PathLike or str
469
        data : dict
470

471
        Returns
472
        -------
473
        dict
474
        """
475
        version_info = dict()
×
476
        try:
×
477
            version_info["version"] = data["version"]
×
478
        except KeyError:
×
479
            possible_version = Path(file).parent
×
480
            if re.match(r"^[vV]\d+", possible_version.name):
×
481
                version_info["version"] = possible_version.name
×
482
            else:
483
                possible_version_signature = possible_version.glob(
×
484
                    f"{Path(file).stem}.v*"
485
                )
486
                for sig in possible_version_signature:
×
487
                    found_version = re.match(r"([vV]\d+)$", sig.suffix)
×
488
                    if found_version:
×
489
                        version_info["version"] = found_version.group()
×
490
                        version_info["sha256sum"] = sig.open().read()
×
491
                        break
×
492
                else:
493
                    version_info["version"] = "vNotFound"
×
494
        return version_info
×
495

496
    @classmethod
    def decode_converted(cls, file: PathLike | str) -> dict:
        """Decode converted data.

        Parameters
        ----------
        file : os.PathLike or str
            Path to a converted netCDF file or zarr store.

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(data)
        # Bug fix: `del facets["history"]` raised KeyError when the dataset
        # had no history attribute.
        facets.pop("history", None)

        facets["date"] = date

        file_format = data.get("output_format")
        if file_format:
            facets["format"] = file_format
        elif "format" in data:
            facets["format"] = data["format"]
        elif Path(file).suffix in [".nc", ".nc4"]:
            facets["format"] = "nc"
        elif Path(file).suffix in [".zarr"]:
            facets["format"] = "zarr"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            if "frequency" not in facets:
                # Bug fix: the result was previously stored under
                # "timedelta", so the facets["frequency"] lookup below
                # always raised an uncaught KeyError in this case.
                facets["frequency"] = cls._decode_time_info(
                    data=data, file=file, field="frequency"
                )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
547

548
    @staticmethod
8✔
549
    def decode_eccc_obs(self, file: PathLike | str) -> dict:
8✔
550
        raise NotImplementedError()
×
551

552
    @staticmethod
8✔
553
    def decode_ahccd_obs(self, file: PathLike | str) -> dict:
8✔
554
        raise NotImplementedError()
×
555

556
    @staticmethod
8✔
557
    def decode_melcc_obs(self, file: PathLike | str) -> dict:
8✔
558
        raise NotImplementedError()
×
559

560
    @classmethod
    def decode_pcic_candcs_u6(cls, file: PathLike | str) -> dict:
        """Decode PCIC CanDCS-U6 bias-adjusted dataset facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset.

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.

        Raises
        ------
        NotImplementedError
            If the file sits under a "Derived" directory.
        """
        # Bug fix: `Path.parents` yields Path objects, so membership of the
        # string "Derived" never matched; compare against the path parts.
        if "Derived" in Path(file).parts:
            raise NotImplementedError("Derived CanDCS-U6 variables are not supported.")

        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets["activity"] = data["activity_id"]
        facets["mip_era"] = data["project_id"]
        facets["bias_adjust_institution"] = "PCIC"
        facets["date"] = date
        facets["domain"] = data["domain"]
        facets["experiment"] = str(data["GCM__experiment_id"]).replace(",", "-")
        facets["format"] = "netcdf"
        facets["institution"] = data["GCM__institution_id"]
        facets["member"] = (
            f"r{data['GCM__realization_index']}"
            f"i{data['GCM__initialization_index']}"
            f"p{data['GCM__physics_index']}"
            f"f{data['GCM__forcing_index']}"
        )
        facets["processing_level"] = "biasadjusted"
        facets["bias_adjust_project"] = "CanDCS-U6"
        facets["source"] = data["GCM__source_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        # Bug fix: the f-string always produced a str (e.g. "vNone"), so the
        # old `facets["version"] is None` check could never trigger; test the
        # raw metadata value instead.
        specs_version = data.get("GCM__data_specs_version")
        if specs_version is None:
            facets.update(find_version_hash(file=file))
        else:
            facets["version"] = f"v{specs_version}"

        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
610

611
    @classmethod
    def decode_cmip6(cls, file: PathLike | str) -> dict:
        """Decode facets from a CMIP6 dataset.

        Returns an empty dict when the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(
            {
                "activity": data["activity_id"],
                "date": date,
                "domain": "global",
                "experiment": data["experiment_id"],
                "format": "netcdf",
                "grid_label": data["grid_label"],
                "institution": data["institution_id"],
                "member": data["variant_label"],
                "modeling_realm": data["realm"],
                "processing_level": "raw",
                "mip_era": data["mip_era"],
                "source": data["source_id"],
                "type": "simulation",
                "variable": variable,
            }
        )
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
649

650
    @classmethod
    def decode_cmip5(cls, file: PathLike | str) -> dict:
        """Decode facets from a CMIP5 dataset.

        Returns an empty dict when the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(
            {
                "activity": "CMIP",
                "date": date,
                "domain": "global",
                "experiment": data["experiment_id"],
                "format": "netcdf",
                "institution": data["institute_id"],
                "member": data["parent_experiment_rip"],
                "modeling_realm": data["modeling_realm"],
                "processing_level": "raw",
                "mip_era": data["project_id"],
                "source": data["model_id"],
                "type": "simulation",
                "variable": variable,
            }
        )
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
687

688
    @classmethod
    def decode_cordex(cls, file: PathLike | str) -> dict:
        """Decode facets from a CORDEX dataset.

        Handles several nonstandard metadata conventions (internal Ouranos
        data, AWS-hosted CORDEX-NAM, regridded domains). Returns an empty
        dict when the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # FIXME: What to do about our internal data that breaks all established conventions?
        facets["activity"] = "CORDEX"

        # A missing project_id marks internal (nonstandard) data.
        if data.get("project_id") == "" or data.get("project_id") is None:
            facets["mip_era"] = "internal"
        elif data.get("project_id") == "CORDEX":
            facets["mip_era"] = "CMIP5"

        # An ensemble-member token in the date slot indicates a fixed field.
        if date == "r0i0p0":
            facets["date"] = "fx"
        else:
            facets["date"] = date

        domain = data.get("CORDEX_domain")
        if domain:
            facets["domain"] = domain.strip()
        else:
            domain = data.get("ouranos_domain_name")
            if domain:
                facets["domain"] = domain.strip()
            else:
                msg = f"File {Path(file).name} has a nonstandard domain name."
                logging.error(msg)
                raise NotImplementedError(msg)

        # CORDEX-NAM on AWS mis-attributes the domain (22/44 should be 22i/44i)
        aws_keys = data.get("intake_esm_dataset_key")
        if aws_keys:
            facets["domain"] = aws_keys.split(".")[3]

        # A regridded-domain marker in the title (e.g. "NAM-22i") overrides.
        title = data.get("title")
        if title:
            regridded_domain_found = re.search(r"\w{3}-\d{2}i", title)
            if regridded_domain_found:
                facets["domain"] = regridded_domain_found.group()

        # The logic here is awful, but the information is bad to begin with.
        driving_model = ""
        driving_institution = ""

        # Try progressively longer hyphenated prefixes as the institution name.
        driving_institution_parts = str(data["driving_model_id"]).split("-")
        if VALIDATION_ENABLED:
            if driving_institution_parts[0] in INSTITUTIONS:
                driving_institution = driving_institution_parts[0]
            elif "-".join(driving_institution_parts[:2]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:2])
            elif "-".join(driving_institution_parts[:3]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:3])
        else:
            logging.warning(
                "CORDEX Metadata validation checks require PyESSV. "
                "Driving institution cannot be determined."
            )
            driving_model = data["driving_model_id"]

        # Known models whose IDs omit the institution prefix.
        if data["driving_model_id"].startswith("GFDL"):
            driving_institution = "NOAA-GFDL"
            driving_model = f"NOAA-GFDL-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("MPI-ESM"):
            driving_institution = "MPI-M"
            driving_model = f"MPI-M-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("HadGEM2"):
            driving_institution = "MOHC"
            driving_model = f"MOHC-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("CNRM-CM5"):
            driving_institution = "CNRM-CERFACS"
            driving_model = f"CNRM-CERFACS-{data['driving_model_id']}"

        elif VALIDATION_ENABLED and not driving_institution:
            raise DecoderError(
                "driving_institution (from driving_model_id: "
                f"`{data['driving_model_id']}`) is not valid."
            )

        facets["driving_institution"] = driving_institution.strip()
        if driving_model:
            facets["driving_model"] = driving_model.strip()
        else:
            facets["driving_model"] = str(data["driving_model_id"]).strip()

        facets["format"] = "netcdf"

        # Internal data uses an abbreviated institute name.
        if data["institute_id"].strip() == "Our.":
            facets["institution"] = "Ouranos"
        else:
            facets["institution"] = data["institute_id"].strip()

        facets["processing_level"] = "raw"
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        try:
            facets["experiment"] = data["experiment_id"].strip()
        except KeyError:
            facets["experiment"] = data["driving_experiment_name"].strip()

        # Fall back through several member keys; "N/A" or None trigger the
        # driving_model_ensemble_member fallback via the KeyError path.
        try:
            for potential_member in ["parent_experiment_rip", "parent_experiment"]:
                facets["member"] = data.get(potential_member)
                if facets["member"] == "N/A":
                    raise KeyError()
                else:
                    break
            if facets["member"] is None:
                raise KeyError()
        except KeyError:
            facets["member"] = data["driving_model_ensemble_member"].strip()

        return facets
821

822
    @classmethod
    def decode_isimip_ft(cls, file: PathLike | str) -> dict:
        """Decode facets from an ISIMIP-FT dataset file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to be examined.

        Returns
        -------
        dict
            Facet metadata; empty if the dataset could not be opened or parsed.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets["activity"] = "ISIMIP"
        facets["mip_era"] = data["project_id"]
        facets["date"] = date
        facets["domain"] = "global"
        facets["co2_forcing_id"] = data["co2_forcing_id"]
        facets["experiment"] = data["experiment_id"]
        facets["format"] = "netcdf"
        facets["impact_model"] = data["impact_model_id"]
        facets["institution"] = data["institute_id"]
        facets["member"] = data["driving_model_ensemble_member"]
        facets["modeling_realm"] = data["modeling_realm"]
        facets["social_forcing_id"] = data["social_forcing_id"]
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            # Pass `file` as well, for consistency with the sibling decoders
            # (which all supply it alongside `data` when resolving frequency).
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time metadata is best-effort; return the facets gathered so far.
            pass

        return facets
860

861
    @classmethod
    def decode_nex_gddp_cmip6(cls, file: PathLike | str) -> dict:
        """Decode facets from a NEX-GDDP-CMIP6 dataset file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to be examined.

        Returns
        -------
        dict
            Facet metadata; empty if the dataset could not be opened or parsed.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable or unrecognized dataset: report no facets at all.
            return {}

        experiment = data["scenario"]
        facets = {
            "experiment": experiment,
            # Historical runs belong to CMIP proper; projections to ScenarioMIP.
            "activity": "CMIP" if experiment == "historical" else "ScenarioMIP",
            "institution": data["cmip6_institution_id"],
            "member": data["variant_label"],
            "processing_level": "biasadjusted",
            "bias_adjust_project": "NEX-GDDP-CMIP6",
            "bias_adjust_institution": "NASA",
            "mip_era": "CMIP6",
            "source": data["cmip6_source_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time metadata is best-effort; return the facets gathered so far.
            pass

        return facets
898

899
    @classmethod
    def decode_espo_g6_r2(cls, file: PathLike | str) -> dict:
        """Decode facets from an ESPO-G6-R2 dataset file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to be examined.

        Returns
        -------
        dict
            Facet metadata; empty if the dataset could not be opened or parsed.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        # Facets that are constant for this bias-adjustment project.
        facets.update(
            bias_adjust_project="ESPO-G6-R2",
            processing_level="biasadjusted",
            version="1.0.0",
            domain="NAM",
        )
        # Mirror the remaining facets from the `cat:`-prefixed attributes.
        mirrored = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        facets.update({name: data[f"cat:{name}"] for name in mirrored})
        facets["variable"] = variable
        # facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time metadata is best-effort; return the facets gathered so far.
            pass

        return facets
939

940
    @classmethod
    def decode_espo_g6_e5l(cls, file: PathLike | str) -> dict:
        """Decode facets from an ESPO-G6-E5L dataset file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to be examined.

        Returns
        -------
        dict
            Facet metadata; empty if the dataset could not be opened or parsed.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable or unrecognized dataset: report no facets at all.
            return {}

        facets = {
            "bias_adjust_project": "ESPO-G6-E5L",
            "processing_level": "biasadjusted",
            "version": "1.0.0",
            "domain": "NAM",
        }
        # The remaining facets come straight from the `cat:`-prefixed attributes.
        for key in (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        ):
            facets[key] = data[f"cat:{key}"]
        facets["variable"] = variable
        # Version decoding is commented out upstream; a fixed version string is used.
        # facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time metadata is best-effort; return the facets gathered so far.
            pass

        return facets
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc