• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ouranosinc / miranda / 15450069722

04 Jun 2025 06:37PM UTC coverage: 17.413%. First build
15450069722

Pull #241

github

web-flow
Merge 6ad70da15 into fc9f3677e
Pull Request #241: Testing Data and Distributed Testing

118 of 199 new or added lines in 7 files covered. (59.3%)

1097 of 6300 relevant lines covered (17.41%)

1.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

11.78
/src/miranda/decode/_decoder.py
1
from __future__ import annotations
7✔
2

3
import logging
7✔
4
import multiprocessing as mp
7✔
5
import os
7✔
6
import re
7✔
7
import warnings
7✔
8
from functools import partial
7✔
9
from logging import config
7✔
10
from os import PathLike
7✔
11
from pathlib import Path
7✔
12
from types import GeneratorType
7✔
13

14
import netCDF4 as nc  # noqa
7✔
15
import pandas as pd
7✔
16
import schema
7✔
17
import xarray as xr
7✔
18
import zarr
7✔
19
from pandas._libs.tslibs import NaTType  # noqa
7✔
20

21
from miranda.convert.utils import date_parser, find_version_hash  # noqa
7✔
22
from miranda.cv import VALIDATION_ENABLED
7✔
23
from miranda.scripting import LOGGING_CONFIG
7✔
24
from miranda.units import check_time_frequency
7✔
25

26
from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError
7✔
27

28
if VALIDATION_ENABLED:
7✔
29
    from miranda.cv import INSTITUTIONS, PROJECT_MODELS
3✔
30
    from miranda.validators import FACETS_SCHEMA  # noqa
3✔
31

32

33
config.dictConfig(LOGGING_CONFIG)
7✔
34

35
__all__ = [
7✔
36
    "Decoder",
37
    "guess_project",
38
]
39

40

41
def guess_project(file: os.PathLike | str) -> str:
    """Guess the name of the project from a file name.

    Parameters
    ----------
    file : str or os.PathLike
        The file whose name segments are matched against known model names.

    Returns
    -------
    str
        The name of the matching project.

    Raises
    ------
    DecoderError
        If no project can be determined, or validation sources are unavailable.
    """
    file_name = Path(file).stem

    # File names are underscore-delimited; one segment may be a known model name.
    potential_names = file_name.split("_")
    if VALIDATION_ENABLED:
        for project, models in PROJECT_MODELS.items():
            # Generator form avoids building an intermediate list (ruff C419).
            if any(model in potential_names for model in models):
                return project
        raise DecoderError(
            f"Unable to determine project from file name: '{file_name}'."
        )
    raise DecoderError("Project determination requires pyessv-archive source files.")
63

64

65
class Decoder:
    """Decodes metadata facets from netCDF/Zarr dataset files."""

    # Project name used to select the matching `decode_<project>` method;
    # None means unknown (may be guessed per-file when `guess` is True).
    project = None
    # Whether workers should attempt to guess the project from file names.
    guess = False
    # Mapping of {file path: decoded facets}.
    # NOTE(review): declared at class level, so it is shared across instances
    # unless rebound — confirm this sharing is intended.
    _file_facets = dict()

    def __init__(self, project: str | None):
        # Per-instance project overrides the class-level default above.
        self.project = project
72

73
    @staticmethod
    def _decoder(
        d: dict,
        fail_early: bool,
        proj: str,
        guess: bool,
        lock: mp.Lock,
        file: str | Path,
    ) -> None:
        """Decode facets for a single file and store them in the shared mapping.

        Parameters
        ----------
        d : dict
            Shared (manager) mapping collecting {file: facets}.
        fail_early : bool
            Whether to re-raise guessing failures and validate decoded facets.
        proj : str
            The project name; when None it is guessed or defaults to "converted".
        guess : bool
            Whether to attempt guessing the project from the file name.
        lock : mp.Lock
            Lock serializing dataset access across worker processes.
        file : str or Path
            The file to decode.
        """
        if proj is None:
            if guess:
                try:
                    proj = guess_project(file)
                except DecoderError:
                    print(
                        "Unable to determine 'activity': Signature for 'activity' must be set manually for file: "
                        f"{file}."
                    )
                    if fail_early:
                        raise
                    # FIX: previously fell through with `proj` still None and
                    # crashed below on `proj.lower()`; skip this file instead.
                    return
            else:
                proj = "converted"

        # Dispatch to e.g. Decoder.decode_cmip6 / decode_pcic_candcs_u6.
        decode_function_name = f"decode_{proj.lower().replace('-', '_')}"
        try:
            with lock:
                _deciphered = getattr(Decoder, decode_function_name)(Path(file))
                if fail_early:
                    if VALIDATION_ENABLED:
                        FACETS_SCHEMA.validate(_deciphered)
                    else:
                        print(
                            "Validation requires pyessv-archive source files. Skipping validation checks."
                        )
                print(
                    f"Deciphered the following from {Path(file).name}:\n"
                    f"{_deciphered.items()}"
                )
                d[file] = _deciphered

        except (AttributeError, NotImplementedError):
            # AttributeError: no decode_<project> method; NotImplementedError:
            # the project's decoder is a stub.
            print(f"Unable to read data from {Path(file)}. Ensure pathname is correct.")
            raise
        except schema.SchemaError as e:
            # Validation failures are reported but do not abort the batch.
            print(f"Decoded facets from {Path(file).name} are not valid: {e}")
118

119
    def decode(
        self,
        files: os.PathLike | str | list[str | os.PathLike] | GeneratorType,
        chunks: int | None = None,
        raise_error: bool = False,
    ) -> None:
        """Decode facets from file or list of files.

        Parameters
        ----------
        files : str or Path or list of str or Path or generator
            The files to decode.
        chunks : int, optional
            The chunk size used when processing files. Not to be confused with xarray chunks for dimensions.
        raise_error : bool
            Whether to raise an error if a file cannot be decoded.
        """
        # Normalize a single path to a one-element list.
        if isinstance(files, (str, os.PathLike)):
            files = [files]

        # Choose a multiprocessing chunksize: cap at 10, use the list length
        # for small batches, default to 10 for generators of unknown length.
        if chunks is None and isinstance(files, list):
            if len(files) >= 10:
                chunk_size = 10
            elif 1 <= len(files) < 10:
                chunk_size = len(files)
            else:
                raise ValueError("No file entries found.")
        elif isinstance(files, GeneratorType):
            chunk_size = 10
        else:
            chunk_size = chunks

        if self.project is None:
            # Without a project, each worker must guess it per file.
            warnings.warn(
                "The decoder 'project' is not set; Decoding step will be much slower."
            )
        else:
            msg = f"Deciphering metadata with project = '{self.project}'"
            logging.info(msg)

        with mp.Manager() as manager:
            # Manager-backed dict/lock are shareable across worker processes.
            _file_facets = manager.dict()
            lock = manager.Lock()
            func = partial(
                self._decoder, _file_facets, raise_error, self.project, self.guess, lock
            )

            with mp.Pool() as pool:
                # NOTE(review): the imap result iterator is never consumed, so
                # exceptions raised in workers are not re-raised here even when
                # raise_error is True — confirm this is intended.
                pool.imap(func, files, chunksize=chunk_size)
                pool.close()
                pool.join()

            # Copy results out before the manager (and its proxy dict) shuts down.
            self._file_facets.update(_file_facets)
172

173
    def facets_table(self):
        """Return a table of the decoded facets (not yet implemented)."""
        raise NotImplementedError()
175

176
    def file_facets(self) -> dict[os.PathLike, dict]:
7✔
177
        return self._file_facets
×
178

179
    @classmethod
    def _from_dataset(cls, file: Path | str) -> tuple[str, str, dict]:
        """Extract the primary variable name, date segment, and global attributes.

        Parameters
        ----------
        file : Path or str
            The netCDF (".nc"/".nc4") file or Zarr (".zarr") store to read.

        Returns
        -------
        tuple[str, str, dict]
            (variable name, trailing underscore-delimited name segment,
            global attributes).

        Raises
        ------
        DecoderError
            If the dataset cannot be opened or is neither netCDF nor Zarr.
        """
        # FIX: the signature advertises `str`, but the body calls Path-only
        # methods (`is_file`, `.name`); coerce up front.
        file = Path(file)
        file_name = file.stem

        try:
            variable_name = cls._decode_primary_variable(file)
        except DecoderError:
            msg = f"Unable to open dataset: {file.name}"
            logging.error(msg)
            raise

        # By convention the last underscore-delimited segment holds the dates.
        datetimes = file_name.split("_")[-1]

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            with nc.Dataset(file, mode="r") as ds:
                data = dict()
                for k in ds.ncattrs():
                    data[k] = getattr(ds, k)
        elif file.is_dir() and file.suffix == ".zarr":
            with zarr.open(file, mode="r") as ds:
                data = ds.attrs.asdict()
        else:
            raise DecoderError(f"Unable to read dataset: `{file.name}`.")
        return variable_name, datetimes, data
203

204
    @staticmethod
    def _decode_primary_variable(file: Path) -> str:
        """Attempts to find the primary variable of a netCDF

        Parameters
        ----------
        file: Path

        Returns
        -------
        str
        """
        dimsvar_dict = dict()
        # Coordinate-like names that can never be the primary data variable.
        coords = (
            "height",
            "lat",
            "latitude",
            "lev",
            "level",
            "lon",
            "longitude",
            "rlat",
            "rlon",
            "rotated_pole",
            "time",
        )
        try:
            if file.is_file() and file.suffix in [".nc", ".nc4"]:
                with nc.Dataset(file, mode="r") as ds:
                    for var_name, var_attrs in ds.variables.items():
                        dimsvar_dict[var_name] = {
                            k: var_attrs.getncattr(k) for k in var_attrs.ncattrs()
                        }
                # A variable qualifies when its name is not coordinate-like and
                # also appears in the file name itself.
                for k in dimsvar_dict.keys():
                    if not str(k).startswith(coords) and k in file.stem:
                        return str(k)

            elif file.is_dir() and file.suffix == ".zarr":
                with zarr.open(str(file), mode="r") as ds:
                    for k in ds.array_keys():
                        if not str(k).startswith(coords) and k in file.stem:
                            return str(k)
            else:
                raise NotImplementedError()
        except ValueError:
            raise DecoderError()
        # NOTE(review): if no candidate matches, control falls off the end and
        # returns None despite the declared `-> str` — confirm callers handle it.
250

251
    @staticmethod
    def _decode_hour_of_day_info(
        file: PathLike | str,
    ) -> dict:
        """Decode hour of day information.

        Reads the first timestep of the "time" variable and reports its hour
        (netCDF only; Zarr support is not implemented).

        Parameters
        ----------
        file : Path or str

        Returns
        -------
        dict
        """
        if isinstance(file, str):
            file = Path(file)

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            with nc.Dataset(file, mode="r") as ds:
                if "time" in ds.variables.keys():
                    # Decode the first timestep with the dataset's own units
                    # and calendar, then take its hour.
                    hour = nc.num2date(
                        ds["time"][0], ds["time"].units, ds["time"].calendar
                    ).hour
                else:
                    # No time axis: likely a fixed field.
                    hour = None
            return dict(hour_of_day=hour)

        elif file.is_dir() and file.suffix == ".zarr":
            warnings.warn("This is not currently implemented")

            # with zarr.open(str(file), mode="r") as ds:
            #     if "time" in ds.array_keys():
            #         pass

            # Empty dict so callers' `facets.update(...)` is a no-op for Zarr.
            return dict()

        else:
            raise NotImplementedError()
289

290
    @staticmethod
    def _decode_time_info(  # noqa: C901
        file: PathLike | str | list[str] | None = None,
        data: dict | None = None,
        term: str | None = None,
        *,
        field: str | None = None,
    ) -> str | NaTType:
        """Decode time information.

        Resolution order: an explicit `term`, then metadata `data`, then the
        file name, then (file + data together) a full frequency analysis of
        the time axis via `check_time_frequency`.

        Parameters
        ----------
        file : os.PathLike or str, optional
        data : dict, optional
        term : str
        field : {"timedelta", "frequency"}

        Returns
        -------
        str or NaTType
        """
        if not file and not data and not term:
            raise ValueError("Nothing passed to parse time info from.")

        # Pick the lookup table matching the requested output field.
        if field == "frequency":
            time_dictionary = TIME_UNITS_TO_FREQUENCY
        elif field == "timedelta":
            time_dictionary = TIME_UNITS_TO_TIMEDELTA
        else:
            raise NotImplementedError()

        if term:
            if term in ["fx", "fixed"]:
                if field == "timedelta":
                    return pd.NaT
                return "fx"
            # NOTE(review): this converts to a Timedelta even when
            # field == "frequency" — confirm callers only pass `term`
            # with field="timedelta".
            return pd.to_timedelta(time_dictionary[term])

        # Metadata only: read "frequency", else the time variable's units.
        if data and not file:
            potential_time = data.get("frequency")
            if not potential_time:
                if hasattr(data, "time"):
                    time_units = data["time"].units
                    potential_time = time_units.split()[0]
                else:
                    # NOTE(review): `file` is None in this branch, so
                    # Path(file) raises TypeError here — confirm and fix.
                    msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."

                    logging.warning(msg)
                    potential_time = "fx"
            if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                msg = f"Found `{potential_time}`. Frequency is likely `fx`."
                logging.warning(msg)
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError()

            if field == "timedelta":
                if potential_time in ["fx", "fixed"]:
                    return pd.NaT
                return pd.to_timedelta(time_dictionary[potential_time])
            return time_dictionary[potential_time]

        # File name only: look for a known frequency token among the
        # underscore- or dot-delimited name segments.
        if file and not data:
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [
                    segment
                    for segment in file_parts
                    if segment in time_dictionary.keys()
                ]
                if potential_times:
                    if potential_times[0] in ["fx", "fixed"]:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")
                    if field == "timedelta":
                        return pd.to_timedelta(time_dictionary[potential_times[0]])
                    return time_dictionary[potential_times[0]]

        # Both file and metadata: cross-check the metadata frequency against
        # the filename; fall back to analysing the time axis on disagreement.
        if file and data:
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [
                    segment
                    for segment in file_parts
                    if segment in time_dictionary.keys()
                ]
                potential_time = data.get("frequency", "")
                if potential_time == "":
                    if hasattr(data, "time"):
                        time_units = data["time"].units
                        potential_time = time_units.split()[0]
                    else:
                        msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."

                        logging.warning(msg)
                        potential_time = "fx"
                if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                    msg = f"Found `{potential_time}`. Frequency is likely `fx`."

                    logging.warning(msg)
                    # Only conclude "fx" if the filename corroborates it.
                    if "fx" in file_parts or "fixed" in file_parts:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")

                if potential_time in potential_times:
                    # Metadata and filename agree.
                    return time_dictionary[potential_time]
                elif potential_times:
                    # Filename has a frequency token but it disagrees with
                    # metadata: escalate to the rigorous check below.
                    break

            msg = (
                f"Frequency from metadata (`{potential_time}`) not found in filename (`{Path(file).name}`): "
                "Performing more rigorous frequency checks."
            )
            logging.warning(msg)
            if Path(file).is_file() and Path(file).suffix in [".nc", ".nc4"]:
                engine = "h5netcdf"
            elif Path(file).is_dir() and Path(file).suffix == ".zarr":
                engine = "zarr"
            else:
                raise DecoderError(
                    f"File is not valid netcdf or zarr: {Path(file).name}"
                )

            _ds = xr.open_dataset(
                file,
                engine=engine,
                drop_variables="time_bnds",
            )
            if not hasattr(_ds, "time"):
                logging.warning(
                    "Dataset does not contain time array. Assuming fixed variable."
                )
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError(f"Field `{field}` not supported.")
            else:
                # Infer the actual step from the time coordinate values.
                _, found_freq = check_time_frequency(_ds.time)

            if found_freq in potential_times:
                msg = (
                    "Time frequency found in dataset on analysis was found in filename. "
                    f"Metadata for `{Path(file).name} is probably incorrect. "
                    f"Basing fields on `{found_freq}`."
                )
                logging.warning(msg)
                return time_dictionary[found_freq]
            elif found_freq in ["month", "mon"]:
                # Monthly data uses several aliases; match any month-like token
                # present in the filename.
                for f in ["Amon", "Omon", "monC", "monthly", "months", "mon"]:
                    if f in potential_times:
                        msg = f"Month-like time frequency found in dataset on analysis was found in filename. Basing fields on `{f}`."
                        logging.warning(msg)
                        return time_dictionary[f]
            else:
                msg = (
                    "Time frequency found in dataset on analysis was not found in filename. "
                    f"Basing fields on `{found_freq}`."
                )
                logging.warning(msg)
                return time_dictionary[found_freq]
        raise DecoderError(f"Time frequency indiscernible for file `{file}`.")
460

461
    @staticmethod
7✔
462
    def _decode_version(file: PathLike | str, data: dict) -> dict:
7✔
463
        """Decode version information.
464

465
        Parameters
466
        ----------
467
        file : os.PathLike or str
468
        data : dict
469

470
        Returns
471
        -------
472
        dict
473
        """
474
        version_info = dict()
×
475
        try:
×
476
            version_info["version"] = data["version"]
×
477
        except KeyError:
×
478
            possible_version = Path(file).parent
×
479
            if re.match(r"^[vV]\d+", possible_version.name):
×
480
                version_info["version"] = possible_version.name
×
481
            else:
482
                possible_version_signature = possible_version.glob(
×
483
                    f"{Path(file).stem}.v*"
484
                )
485
                for sig in possible_version_signature:
×
486
                    found_version = re.match(r"([vV]\d+)$", sig.suffix)
×
487
                    if found_version:
×
488
                        version_info["version"] = found_version.group()
×
489
                        version_info["sha256sum"] = sig.open().read()
×
490
                        break
×
491
                else:
492
                    version_info["version"] = "vNotFound"
×
493
        return version_info
×
494

495
    @classmethod
    def decode_converted(cls, file: PathLike | str) -> dict:
        """Decode converted data.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        # All global attributes become facets, except provenance noise.
        facets.update(data)
        # FIX: a hard `del` raised KeyError when "history" was absent.
        facets.pop("history", None)

        facets["date"] = date

        file_format = data.get("output_format")
        if file_format:
            facets["format"] = file_format
        elif "format" in data:
            facets["format"] = data["format"]
        elif Path(file).suffix in [".nc", ".nc4"]:
            facets["format"] = "nc"
        elif Path(file).suffix in [".zarr"]:
            facets["format"] = "zarr"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            if "frequency" not in facets:
                # FIX: the decoded frequency was previously stored under
                # "timedelta", leaving "frequency" unset and causing an
                # uncaught KeyError on the lookup just below.
                facets["frequency"] = cls._decode_time_info(
                    data=data, file=file, field="frequency"
                )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
546

547
    @staticmethod
7✔
548
    def decode_eccc_obs(self, file: PathLike | str) -> dict:
7✔
549
        raise NotImplementedError()
×
550

551
    @staticmethod
7✔
552
    def decode_ahccd_obs(self, file: PathLike | str) -> dict:
7✔
553
        raise NotImplementedError()
×
554

555
    @staticmethod
7✔
556
    def decode_melcc_obs(self, file: PathLike | str) -> dict:
7✔
557
        raise NotImplementedError()
×
558

559
    @classmethod
    def decode_pcic_candcs_u6(cls, file: PathLike | str) -> dict:
        """Decode facets from a PCIC CanDCS-U6 bias-adjusted dataset.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.

        Raises
        ------
        NotImplementedError
            If the file path contains a "Derived" component.
        """
        # FIX: `Path(file).parents` yields Path objects, so membership of the
        # string "Derived" was always False; test the path components instead.
        if "Derived" in Path(file).parts:
            raise NotImplementedError("Derived CanDCS-U6 variables are not supported.")

        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets["activity"] = data["activity_id"]
        facets["mip_era"] = data["project_id"]
        facets["bias_adjust_institution"] = "PCIC"
        facets["date"] = date
        facets["domain"] = data["domain"]
        facets["experiment"] = str(data["GCM__experiment_id"]).replace(",", "-")
        facets["format"] = "netcdf"
        facets["institution"] = data["GCM__institution_id"]
        # CMIP-style realization/initialization/physics/forcing member string.
        facets["member"] = (
            f"r{data['GCM__realization_index']}"
            f"i{data['GCM__initialization_index']}"
            f"p{data['GCM__physics_index']}"
            f"f{data['GCM__forcing_index']}"
        )
        facets["processing_level"] = "biasadjusted"
        facets["bias_adjust_project"] = "CanDCS-U6"
        facets["source"] = data["GCM__source_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        # FIX: the f-string previously produced "vNone" when the attribute was
        # missing, so the `is None` check never triggered the hash fallback.
        data_specs_version = data.get("GCM__data_specs_version")
        if data_specs_version is not None:
            facets["version"] = f"v{data_specs_version}"
        else:
            facets.update(find_version_hash(file=file))

        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
609

610
    @classmethod
    def decode_cmip6(cls, file: PathLike | str) -> dict:
        """Decode facets from a CMIP6 dataset file.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        # Facets copied straight from the dataset's global attributes.
        facets.update(
            {
                "activity": data["activity_id"],
                "experiment": data["experiment_id"],
                "grid_label": data["grid_label"],
                "institution": data["institution_id"],
                "member": data["variant_label"],
                "modeling_realm": data["realm"],
                "mip_era": data["mip_era"],
                "source": data["source_id"],
            }
        )

        # Facets fixed for this project, plus values decoded above.
        facets["date"] = date
        facets["domain"] = "global"
        facets["format"] = "netcdf"
        facets["processing_level"] = "raw"
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
648

649
    @classmethod
    def decode_cmip5(cls, file: PathLike | str) -> dict:
        """Decode facets from a CMIP5 dataset file.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        # Facets fixed for this project, plus values decoded above.
        facets["activity"] = "CMIP"
        facets["date"] = date
        facets["domain"] = "global"

        # Facets copied straight from the dataset's global attributes.
        facets.update(
            {
                "experiment": data["experiment_id"],
                "institution": data["institute_id"],
                "member": data["parent_experiment_rip"],
                "modeling_realm": data["modeling_realm"],
                "mip_era": data["project_id"],
                "source": data["model_id"],
            }
        )

        facets["format"] = "netcdf"
        facets["processing_level"] = "raw"
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
686

687
    @classmethod
    def decode_cordex(cls, file: PathLike | str) -> dict:
        """Decode facets from a CORDEX dataset file.

        Handles several metadata dialects (official CORDEX, AWS CORDEX-NAM,
        and internal Ouranos data) with heuristics for domain, driving
        institution/model, experiment, and ensemble member.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # FIXME: What to do about our internal data that breaks all established conventions?
        facets["activity"] = "CORDEX"

        # Missing/blank project_id marks internal data; "CORDEX" implies CMIP5.
        if data.get("project_id") == "" or data.get("project_id") is None:
            facets["mip_era"] = "internal"
        elif data.get("project_id") == "CORDEX":
            facets["mip_era"] = "CMIP5"

        # "r0i0p0" in the date slot denotes a fixed (time-invariant) field.
        if date == "r0i0p0":
            facets["date"] = "fx"
        else:
            facets["date"] = date

        domain = data.get("CORDEX_domain")
        if domain:
            facets["domain"] = domain.strip()
        else:
            # Fall back to the Ouranos-internal domain attribute.
            domain = data.get("ouranos_domain_name")
            if domain:
                facets["domain"] = domain.strip()
            else:
                msg = f"File {Path(file).name} has a nonstandard domain name."
                logging.error(msg)
                raise NotImplementedError(msg)

        # CORDEX-NAM on AWS mis-attributes the domain (22/44 should be 22i/44i)
        aws_keys = data.get("intake_esm_dataset_key")
        if aws_keys:
            facets["domain"] = aws_keys.split(".")[3]

        # A title like "...NAM-22i..." marks a regridded domain; prefer it.
        title = data.get("title")
        if title:
            regridded_domain_found = re.search(r"\w{3}-\d{2}i", title)
            if regridded_domain_found:
                facets["domain"] = regridded_domain_found.group()

        # The logic here is awful, but the information is bad to begin with.
        driving_model = ""
        driving_institution = ""

        # driving_model_id is "<institution>-<model>"; the institution part may
        # itself contain one to three hyphen-separated tokens.
        driving_institution_parts = str(data["driving_model_id"]).split("-")
        if VALIDATION_ENABLED:
            if driving_institution_parts[0] in INSTITUTIONS:
                driving_institution = driving_institution_parts[0]
            elif "-".join(driving_institution_parts[:2]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:2])
            elif "-".join(driving_institution_parts[:3]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:3])
        else:
            logging.warning(
                "CORDEX Metadata validation checks require PyESSV. "
                "Driving institution cannot be determined."
            )
            driving_model = data["driving_model_id"]

        # Known models whose driving_model_id omits the institution prefix.
        if data["driving_model_id"].startswith("GFDL"):
            driving_institution = "NOAA-GFDL"
            driving_model = f"NOAA-GFDL-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("MPI-ESM"):
            driving_institution = "MPI-M"
            driving_model = f"MPI-M-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("HadGEM2"):
            driving_institution = "MOHC"
            driving_model = f"MOHC-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("CNRM-CM5"):
            driving_institution = "CNRM-CERFACS"
            driving_model = f"CNRM-CERFACS-{data['driving_model_id']}"

        elif VALIDATION_ENABLED and not driving_institution:
            raise DecoderError(
                "driving_institution (from driving_model_id: "
                f"`{data['driving_model_id']}`) is not valid."
            )

        facets["driving_institution"] = driving_institution.strip()
        if driving_model:
            facets["driving_model"] = driving_model.strip()
        else:
            facets["driving_model"] = str(data["driving_model_id"]).strip()

        facets["format"] = "netcdf"

        # Normalize the internal "Our." abbreviation for Ouranos.
        if data["institute_id"].strip() == "Our.":
            facets["institution"] = "Ouranos"
        else:
            facets["institution"] = data["institute_id"].strip()

        facets["processing_level"] = "raw"
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        try:
            facets["experiment"] = data["experiment_id"].strip()
        except KeyError:
            facets["experiment"] = data["driving_experiment_name"].strip()

        # Member: first usable value among the parent-experiment attributes;
        # "N/A" or a missing value falls back to driving_model_ensemble_member.
        try:
            for potential_member in ["parent_experiment_rip", "parent_experiment"]:
                facets["member"] = data.get(potential_member)
                if facets["member"] == "N/A":
                    raise KeyError()
                else:
                    break
            if facets["member"] is None:
                raise KeyError()
        except KeyError:
            facets["member"] = data["driving_model_ensemble_member"].strip()

        return facets
820

821
    @classmethod
    def decode_isimip_ft(cls, file: PathLike | str) -> dict:
        """Decode facets from an ISIMIP-FT dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to examine.

        Returns
        -------
        dict
            The facets found. Empty if the dataset could not be opened
            or its metadata could not be read.
        """
        facets = {}
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: return no facets rather than failing.
            return facets

        facets["activity"] = "ISIMIP"
        facets["mip_era"] = data["project_id"]
        facets["date"] = date
        facets["domain"] = "global"
        facets["co2_forcing_id"] = data["co2_forcing_id"]
        facets["experiment"] = data["experiment_id"]
        facets["format"] = "netcdf"
        facets["impact_model"] = data["impact_model_id"]
        facets["institution"] = data["institute_id"]
        facets["member"] = data["driving_model_ensemble_member"]
        facets["modeling_realm"] = data["modeling_realm"]
        facets["social_forcing_id"] = data["social_forcing_id"]
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            # Pass `file` as well so the filename can act as a frequency
            # fallback, consistent with the other decoders in this class.
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time metadata is best-effort: leave those facets unset on failure.
            pass

        return facets
859

860
    @classmethod
    def decode_nex_gddp_cmip6(cls, file: PathLike | str) -> dict:
        """Decode facets from a NEX-GDDP-CMIP6 dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to examine.

        Returns
        -------
        dict
            The facets found. Empty if the dataset could not be opened
            or its metadata could not be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: return no facets rather than failing.
            return {}

        experiment = data["scenario"]
        facets = {
            "experiment": experiment,
            # Historical runs belong to CMIP proper; projections to ScenarioMIP.
            "activity": "CMIP" if experiment == "historical" else "ScenarioMIP",
            "institution": data["cmip6_institution_id"],
            "member": data["variant_label"],
            "processing_level": "biasadjusted",
            "bias_adjust_project": "NEX-GDDP-CMIP6",
            "bias_adjust_institution": "NASA",
            "mip_era": "CMIP6",
            "source": data["cmip6_source_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time metadata is best-effort: leave those facets unset on failure.
            pass

        return facets
897

898
    @classmethod
    def decode_espo_g6_r2(cls, file: PathLike | str) -> dict:
        """Decode facets from an ESPO-G6-R2 dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to examine.

        Returns
        -------
        dict
            The facets found. Empty if the dataset could not be opened
            or its metadata could not be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: return no facets rather than failing.
            return {}

        # Fixed facets for this bias-adjustment project.
        facets = {
            "bias_adjust_project": "ESPO-G6-R2",
            "processing_level": "biasadjusted",
            "version": "1.0.0",
            "domain": "NAM",
        }
        # These facets are copied verbatim from the "cat:"-prefixed attributes.
        copied_facets = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        for name in copied_facets:
            facets[name] = data[f"cat:{name}"]
        facets["variable"] = variable
        # NOTE: version decoding from attributes is deliberately skipped here;
        # the fixed version "1.0.0" set above is used instead.
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time metadata is best-effort: leave those facets unset on failure.
            pass

        return facets
938

939
    @classmethod
    def decode_espo_g6_e5l(cls, file: PathLike | str) -> dict:
        """Decode facets from an ESPO-G6-E5L dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to examine.

        Returns
        -------
        dict
            The facets found. Empty if the dataset could not be opened
            or its metadata could not be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: return no facets rather than failing.
            return {}

        # Fixed facets for this bias-adjustment project.
        facets = {
            "bias_adjust_project": "ESPO-G6-E5L",
            "processing_level": "biasadjusted",
            "version": "1.0.0",
            "domain": "NAM",
        }
        # These facets are copied verbatim from the "cat:"-prefixed attributes.
        copied_facets = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        for name in copied_facets:
            facets[name] = data[f"cat:{name}"]
        facets["variable"] = variable
        # NOTE: version decoding from attributes is deliberately skipped here;
        # the fixed version "1.0.0" set above is used instead.
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time metadata is best-effort: leave those facets unset on failure.
            pass

        return facets
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc