• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ouranosinc / miranda / 21532860452

30 Jan 2026 10:28PM UTC coverage: 18.107%. First build
21532860452

Pull #326

github

web-flow
Merge faa65def3 into a27b83e7e
Pull Request #326: Support Python 3.14

8 of 40 new or added lines in 2 files covered. (20.0%)

1224 of 6760 relevant lines covered (18.11%)

1.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

11.5
/src/miranda/decode/_decoder.py
1
from __future__ import annotations
10✔
2
import importlib.util as ilu
10✔
3
import logging
10✔
4
import multiprocessing as mp
10✔
5
import os
10✔
6
import re
10✔
7
import warnings
10✔
8
from functools import partial
10✔
9
from os import PathLike
10✔
10
from pathlib import Path
10✔
11
from types import GeneratorType
10✔
12

13
import h5netcdf
10✔
14
import pandas as pd
10✔
15
import schema
10✔
16
import xarray as xr
10✔
17
import zarr
10✔
18
from pandas._libs.tslibs import NaTType  # noqa
10✔
19

20
from miranda.convert.utils import date_parser, find_version_hash  # noqa
10✔
21
from miranda.cv import VALIDATION_ENABLED
10✔
22
from miranda.units import check_time_frequency
10✔
23

24
from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError
10✔
25

26

27
HAS_NETCDF4 = bool(ilu.find_spec("netCDF4"))
10✔
28

29

30
if VALIDATION_ENABLED:
10✔
31
    from miranda.cv import INSTITUTIONS, PROJECT_MODELS
10✔
32
    from miranda.validate import FACETS_SCHEMA  # noqa
10✔
33

34

35
logger = logging.getLogger("miranda.decode.decoder")
10✔
36

37
__all__ = [
10✔
38
    "Decoder",
39
    "guess_project",
40
]
41

42

43
def guess_project(file: os.PathLike | str) -> str:
10✔
44
    """
45
    Guess the name of the project
46

47
    Parameters
48
    ----------
49
    file : str or os.PathLike
50

51
    Returns
52
    -------
53
    str
54
    """
55
    file_name = Path(file).stem
×
56

57
    potential_names = file_name.split("_")
×
58
    if VALIDATION_ENABLED:
×
59
        for project, models in PROJECT_MODELS.items():
×
60
            if any([model in potential_names for model in models]):
×
61
                return project
×
62
        raise DecoderError(f"Unable to determine project from file name: '{file_name}'.")
×
63
    raise DecoderError("Project determination requires pyessv-archive source files.")
×
64

65

66
class Decoder:
10✔
67
    project = None
10✔
68
    guess = False
10✔
69
    _file_facets = dict()
10✔
70

71
    def __init__(self, project: str | None):
10✔
72
        self.project = project
×
73

74
    @staticmethod
10✔
75
    def _decoder(
10✔
76
        d: dict,
77
        fail_early: bool,
78
        proj: str,
79
        guess: bool,
80
        lock: mp.Lock,
81
        file: str | Path,
82
    ) -> None:
83
        if proj is None:
×
84
            if guess:
×
85
                try:
×
86
                    proj = guess_project(file)
×
87
                except DecoderError:
×
88
                    print(f"Unable to determine 'activity': Signature for 'activity' must be set manually for file: {file}.")
×
89
                    if fail_early:
×
90
                        raise
×
91
            else:
92
                proj = "converted"
×
93

94
        decode_function_name = f"decode_{proj.lower().replace('-', '_')}"
×
95
        try:
×
96
            with lock:
×
97
                _deciphered = getattr(Decoder, decode_function_name)(Path(file))
×
98
                if fail_early:
×
99
                    if VALIDATION_ENABLED:
×
100
                        FACETS_SCHEMA.validate(_deciphered)
×
101
                    else:
102
                        print("Validation requires pyessv-archive source files. Skipping validation checks.")
×
103
                print(f"Deciphered the following from {Path(file).name}:\n{_deciphered.items()}")
×
104
                d[file] = _deciphered
×
105

106
        except (AttributeError, NotImplementedError):
×
107
            print(f"Unable to read data from {Path(file)}. Ensure pathname is correct.")
×
108
            raise
×
109
        except schema.SchemaError as e:
×
110
            print(f"Decoded facets from {Path(file).name} are not valid: {e}")
×
111

112
    def decode(
10✔
113
        self,
114
        files: os.PathLike | str | list[str | os.PathLike] | GeneratorType,
115
        chunks: int | None = None,
116
        raise_error: bool = False,
117
    ) -> None:
118
        """
119
        Decode facets from file or list of files.
120

121
        Parameters
122
        ----------
123
        files : str or Path or list of str or Path or generator
124
            The files to decode.
125
        chunks : int, optional
126
            The chunk size used when processing files. Not to be confused with xarray chunks for dimensions.
127
        raise_error : bool
128
            Whether to raise an error if a file cannot be decoded.
129
        """
130
        if isinstance(files, (str, os.PathLike)):
×
131
            files = [files]
×
132

133
        if chunks is None and isinstance(files, list):
×
134
            if len(files) >= 10:
×
135
                chunk_size = 10
×
136
            elif 1 <= len(files) < 10:
×
137
                chunk_size = len(files)
×
138
            else:
139
                raise ValueError("No file entries found.")
×
140
        elif isinstance(files, GeneratorType):
×
141
            chunk_size = 10
×
142
        else:
143
            chunk_size = chunks
×
144

145
        if self.project is None:
×
146
            warnings.warn("The decoder 'project' is not set; Decoding step will be much slower.", stacklevel=2)
×
147
        else:
148
            msg = f"Deciphering metadata with project = '{self.project}'"
×
149
            logger.info(msg)
×
150

151
        with mp.Manager() as manager:
×
152
            _file_facets = manager.dict()
×
153
            lock = manager.Lock()
×
154
            func = partial(self._decoder, _file_facets, raise_error, self.project, self.guess, lock)
×
155

156
            with mp.Pool() as pool:
×
157
                pool.imap(func, files, chunksize=chunk_size)
×
158
                pool.close()
×
159
                pool.join()
×
160

161
            self._file_facets.update(_file_facets)
×
162

163
    def facets_table(self):
10✔
164
        raise NotImplementedError()
×
165

166
    def file_facets(self) -> dict[os.PathLike, dict]:
10✔
167
        return self._file_facets
×
168

169
    @classmethod
10✔
170
    def _from_dataset(cls, file: Path | str) -> tuple[str, str, dict]:
10✔
NEW
171
        file_path = Path(file)
×
172

173
        try:
×
NEW
174
            variable_name = str(cls._decode_primary_variable(file_path))
×
175
        except DecoderError:
×
NEW
176
            msg = f"Unable to open dataset: {file_path.name}"
×
177
            logger.error(msg)
×
178
            raise
×
179

NEW
180
        datetimes = file_path.name.split("_")[-1]
×
NEW
181
        data = {}
×
182

NEW
183
        if file_path.is_file() and file_path.suffix in [".nc", ".nc4"]:
×
NEW
184
            if HAS_NETCDF4:
×
NEW
185
                import netCDF4
×
186

NEW
187
                with netCDF4.Dataset(file, mode="r") as ds:
×
188
                    for k in ds.ncattrs():
×
189
                        data[k] = getattr(ds, k)
×
190
            else:
191
                with h5netcdf.File(file, mode="r") as ds:
×
192
                    for k in ds.attrs:
×
193
                        data[k] = ds.attrs[k]
×
NEW
194
        elif file_path.is_dir() and file_path.suffix == ".zarr":
×
NEW
195
            with zarr.open_group(store=file_path, mode="r") as ds:
×
NEW
196
                data.update(ds.attrs.asdict())
×
197
        else:
NEW
198
            raise DecoderError(f"Unable to read dataset: `{file_path.name}`.")
×
199

200
        return variable_name, datetimes, data
×
201

202
    @staticmethod
10✔
203
    def _decode_primary_variable(file: Path) -> str | None:
10✔
204
        """
205
        Attempts to find the primary variable of a netCDF
206

207
        Parameters
208
        ----------
209
        file: Path
210

211
        Returns
212
        -------
213
        str
214
        """
NEW
215
        dimsvar_dict = {}
×
216
        coords = (
×
217
            "height",
218
            "lat",
219
            "latitude",
220
            "lev",
221
            "level",
222
            "lon",
223
            "longitude",
224
            "rlat",
225
            "rlon",
226
            "rotated_pole",
227
            "time",
228
        )
229

230
        try:
×
231
            if file.is_file() and file.suffix in [".nc", ".nc4"]:
×
NEW
232
                if HAS_NETCDF4:
×
NEW
233
                    import netCDF4
×
234

NEW
235
                    with netCDF4.Dataset(file, mode="r") as ds:
×
236
                        for var_name, var_attrs in ds.variables.items():
×
237
                            dimsvar_dict[var_name] = {k: var_attrs.getncattr(k) for k in var_attrs.ncattrs()}
×
238
                else:
239
                    with h5netcdf.File(file, mode="r") as ds:
×
240
                        for var_name, var_attrs in ds.variables.items():
×
241
                            dimsvar_dict[var_name] = {k: var_attrs.attrs[k] for k in var_attrs.attrs}
×
242
                for k in dimsvar_dict.keys():
×
243
                    if not str(k).startswith(coords) and k in file.stem:
×
244
                        return str(k)
×
245

246
            elif file.is_dir() and file.suffix == ".zarr":
×
NEW
247
                with zarr.open_group(file, mode="r") as ds:
×
248
                    for k in ds.array_keys():
×
249
                        if not str(k).startswith(coords) and k in file.stem:
×
250
                            return str(k)
×
251
            else:
252
                msg = "File format is not supported."
×
253
                raise NotImplementedError(msg)
×
254
        except ValueError as err:
×
255
            msg = f"Unable to open dataset: {file.name}"
×
256
            raise DecoderError(msg) from err
×
257

258
    @staticmethod
10✔
259
    def _decode_hour_of_day_info(
10✔
260
        file: PathLike | str,
261
    ) -> dict:
262
        """
263
        Decode hour of day information.
264

265
        Parameters
266
        ----------
267
        file : Path or str
268

269
        Returns
270
        -------
271
        dict
272
        """
NEW
273
        file_path = Path(file)
×
274

NEW
275
        if file_path.is_file() and file_path.suffix in [".nc", ".nc4"]:
×
NEW
276
            if HAS_NETCDF4:
×
NEW
277
                import netCDF4
×
278

NEW
279
                with netCDF4.Dataset(file, mode="r") as ds:
×
280
                    if "time" in ds.variables.keys():
×
NEW
281
                        hour = netCDF4.num2date(ds["time"][0], ds["time"].units, ds["time"].calendar).hour
×
282
                    else:
283
                        hour = None
×
284
                return dict(hour_of_day=hour)
×
285
            else:
286
                warnings.warn("This is not currently implemented for h5netcdf. Install netCDF4.", stacklevel=2)
×
287
                return dict()
×
288

NEW
289
        elif file_path.is_dir() and file_path.suffix == ".zarr":
×
290
            warnings.warn("This is not currently implemented for zarr. Install netCDF4.", stacklevel=2)
×
291
            return dict()
×
292

293
        else:
294
            raise NotImplementedError()
×
295

296
    @staticmethod
10✔
297
    def _decode_time_info(  # noqa: C901
10✔
298
        file: PathLike | str | list[str] | None = None,
299
        data: dict | None = None,
300
        term: str | None = None,
301
        *,
302
        field: str | None = None,
303
    ) -> str | pd.Timedelta | NaTType:
304
        """
305
        Decode time information.
306

307
        Parameters
308
        ----------
309
        file : os.PathLike or str, optional
310
        data : dict, optional
311
        term : str
312
        field : {"timedelta", "frequency"}
313

314
        Returns
315
        -------
316
        str or pd.Timedelta or NaTType
317
        """
318
        if not file and not data and not term:
×
319
            raise ValueError("Nothing passed to parse time info from.")
×
320

321
        if field == "frequency":
×
322
            time_dictionary = TIME_UNITS_TO_FREQUENCY
×
323
        elif field == "timedelta":
×
324
            time_dictionary = TIME_UNITS_TO_TIMEDELTA
×
325
        else:
326
            raise NotImplementedError()
×
327

328
        if term:
×
329
            if term in ["fx", "fixed"]:
×
330
                if field == "timedelta":
×
331
                    return pd.NaT
×
332
                return "fx"
×
333
            return pd.to_timedelta(time_dictionary[term])
×
334

335
        if data and not file:
×
336
            potential_time = data.get("frequency")
×
337
            if not potential_time:
×
338
                if hasattr(data, "time"):
×
339
                    time_units = data["time"].units
×
340
                    potential_time = time_units.split()[0]
×
341
                else:
342
                    msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."
×
343

344
                    logger.warning(msg)
×
345
                    potential_time = "fx"
×
346
            if potential_time in ["ymon", "yseas", "fixed", "fx"]:
×
347
                msg = f"Found `{potential_time}`. Frequency is likely `fx`."
×
348
                logger.warning(msg)
×
349
                if field == "frequency":
×
350
                    return "fx"
×
351
                if field == "timedelta":
×
352
                    return pd.NaT
×
353
                raise ValueError()
×
354

355
            if field == "timedelta":
×
356
                if potential_time in ["fx", "fixed"]:
×
357
                    return pd.NaT
×
358
                return pd.to_timedelta(time_dictionary[potential_time])
×
359
            return time_dictionary[potential_time]
×
360

361
        if file and not data:
×
362
            for delimiter in ["_", "."]:
×
363
                file_parts = Path(file).stem.split(delimiter)
×
364
                potential_times = [segment for segment in file_parts if segment in time_dictionary.keys()]
×
365
                if potential_times:
×
366
                    if potential_times[0] in ["fx", "fixed"]:
×
367
                        if field == "frequency":
×
368
                            return "fx"
×
369
                        if field == "timedelta":
×
370
                            return pd.NaT
×
371
                        raise ValueError(f"Field `{field}` not supported.")
×
372
                    if field == "timedelta":
×
373
                        return pd.to_timedelta(time_dictionary[potential_times[0]])
×
374
                    return time_dictionary[potential_times[0]]
×
375

376
        if file and data:
×
377
            for delimiter in ["_", "."]:
×
378
                file_parts = Path(file).stem.split(delimiter)
×
379
                potential_times = [segment for segment in file_parts if segment in time_dictionary.keys()]
×
380
                potential_time = data.get("frequency", "")
×
381
                if potential_time == "":
×
382
                    if hasattr(data, "time"):
×
383
                        time_units = data["time"].units
×
384
                        potential_time = time_units.split()[0]
×
385
                    else:
386
                        msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."
×
387

388
                        logger.warning(msg)
×
389
                        potential_time = "fx"
×
390
                if potential_time in ["ymon", "yseas", "fixed", "fx"]:
×
391
                    msg = f"Found `{potential_time}`. Frequency is likely `fx`."
×
392

393
                    logger.warning(msg)
×
394
                    if "fx" in file_parts or "fixed" in file_parts:
×
395
                        if field == "frequency":
×
396
                            return "fx"
×
397
                        if field == "timedelta":
×
398
                            return pd.NaT
×
399
                        raise ValueError(f"Field `{field}` not supported.")
×
400

401
                if potential_time in potential_times:
×
402
                    return time_dictionary[potential_time]
×
403
                elif potential_times:
×
404
                    break
×
405

406
            msg = (
×
407
                f"Frequency from metadata (`{potential_time}`) not found in filename (`{Path(file).name}`): "
408
                "Performing more rigorous frequency checks."
409
            )
410
            logger.warning(msg)
×
411
            if Path(file).is_file() and Path(file).suffix in [".nc", ".nc4"]:
×
412
                engine = "h5netcdf"
×
413
            elif Path(file).is_dir() and Path(file).suffix == ".zarr":
×
414
                engine = "zarr"
×
415
            else:
416
                raise DecoderError(f"File is not valid netcdf or zarr: {Path(file).name}")
×
417

418
            _ds = xr.open_dataset(
×
419
                file,
420
                engine=engine,
421
                drop_variables="time_bnds",
422
            )
423
            if not hasattr(_ds, "time"):
×
424
                logger.warning("Dataset does not contain time array. Assuming fixed variable.")
×
425
                if field == "frequency":
×
426
                    return "fx"
×
427
                if field == "timedelta":
×
428
                    return pd.NaT
×
429
                raise ValueError(f"Field `{field}` not supported.")
×
430
            else:
431
                _, found_freq = check_time_frequency(_ds.time)
×
432

433
            if found_freq in potential_times:
×
434
                msg = (
×
435
                    "Time frequency found in dataset on analysis was found in filename. "
436
                    f"Metadata for `{Path(file).name} is probably incorrect. "
437
                    f"Basing fields on `{found_freq}`."
438
                )
439
                logger.warning(msg)
×
440
                return time_dictionary[found_freq]
×
441
            elif found_freq in ["month", "mon"]:
×
442
                for f in ["Amon", "Omon", "monC", "monthly", "months", "mon"]:
×
443
                    if f in potential_times:
×
444
                        msg = f"Month-like time frequency found in dataset on analysis was found in filename. Basing fields on `{f}`."
×
445
                        logger.warning(msg)
×
446
                        return time_dictionary[f]
×
447
            else:
448
                msg = f"Time frequency found in dataset on analysis was not found in filename. Basing fields on `{found_freq}`."
×
449
                logger.warning(msg)
×
450
                return time_dictionary[found_freq]
×
451
        raise DecoderError(f"Time frequency indiscernible for file `{file}`.")
×
452

453
    @staticmethod
10✔
454
    def _decode_version(file: PathLike | str, data: dict) -> dict:
10✔
455
        """
456
        Decode version information.
457

458
        Parameters
459
        ----------
460
        file : os.PathLike or str
461
        data : dict
462

463
        Returns
464
        -------
465
        dict
466
        """
467
        version_info = dict()
×
468
        try:
×
469
            version_info["version"] = data["version"]
×
470
        except KeyError:
×
471
            possible_version = Path(file).parent
×
472
            if re.match(r"^[vV]\d+", possible_version.name):
×
473
                version_info["version"] = possible_version.name
×
474
            else:
475
                possible_version_signature = possible_version.glob(f"{Path(file).stem}.v*")
×
476
                for sig in possible_version_signature:
×
477
                    found_version = re.match(r"([vV]\d+)$", sig.suffix)
×
478
                    if found_version:
×
479
                        version_info["version"] = found_version.group()
×
480
                        version_info["sha256sum"] = sig.open().read()
×
481
                        break
×
482
                else:
483
                    version_info["version"] = "vNotFound"
×
484
        return version_info
×
485

486
    @classmethod
10✔
487
    def decode_converted(cls, file: PathLike | str) -> dict:
10✔
488
        """
489
        Decode converted data.
490

491
        Parameters
492
        ----------
493
        file : os.PathLike or str
494

495
        Returns
496
        -------
497
        dict
498
        """
499
        facets = dict()
×
500
        try:
×
501
            variable, date, data = cls._from_dataset(file=file)
×
502
        except DecoderError:
×
503
            return facets
×
504

505
        facets.update(data)
×
506
        del facets["history"]
×
507

508
        facets["date"] = date
×
509

510
        file_format = data.get("output_format")
×
511
        if file_format:
×
512
            facets["format"] = file_format
×
513
        elif "format" in data:
×
514
            facets["format"] = data["format"]
×
515
        elif Path(file).suffix in [".nc", ".nc4"]:
×
516
            facets["format"] = "nc"
×
517
        elif Path(file).suffix in [".zarr"]:
×
518
            facets["format"] = "zarr"
×
519
        facets["variable"] = variable
×
520

521
        facets.update(cls._decode_version(data=data, file=file))
×
522
        facets.update(cls._decode_hour_of_day_info(file=file))
×
523

524
        try:
×
525
            if "frequency" not in facets:
×
526
                facets["timedelta"] = cls._decode_time_info(data=data, file=file, field="frequency")
×
527
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
×
528
            facets["date_start"] = date_parser(date)
×
529
            facets["date_end"] = date_parser(date, end_of_period=True)
×
530
        except DecoderError:  # noqa: S110
×
531
            pass
×
532

533
        return facets
×
534

535
    @staticmethod
10✔
536
    def decode_eccc_obs(self, file: PathLike | str) -> dict:
10✔
537
        raise NotImplementedError()
×
538

539
    @staticmethod
10✔
540
    def decode_ahccd_obs(self, file: PathLike | str) -> dict:
10✔
541
        raise NotImplementedError()
×
542

543
    @staticmethod
10✔
544
    def decode_melcc_obs(self, file: PathLike | str) -> dict:
10✔
545
        raise NotImplementedError()
×
546

547
    @classmethod
10✔
548
    def decode_pcic_candcs_u6(cls, file: PathLike | str) -> dict:
10✔
549
        if "Derived" in Path(file).parents:
×
550
            raise NotImplementedError("Derived CanDCS-U6 variables are not supported.")
×
551

552
        facets = dict()
×
553
        try:
×
554
            variable, date, data = cls._from_dataset(file=file)
×
555
        except DecoderError:
×
556
            return facets
×
557

558
        facets["activity"] = data["activity_id"]
×
559
        facets["mip_era"] = data["project_id"]
×
560
        facets["bias_adjust_institution"] = "PCIC"
×
561
        facets["date"] = date
×
562
        facets["domain"] = data["domain"]
×
563
        facets["experiment"] = str(data["GCM__experiment_id"]).replace(",", "-")
×
564
        facets["format"] = "netcdf"
×
565
        facets["institution"] = data["GCM__institution_id"]
×
566
        facets["member"] = (
×
567
            f"r{data['GCM__realization_index']}i{data['GCM__initialization_index']}p{data['GCM__physics_index']}f{data['GCM__forcing_index']}"
568
        )
569
        facets["processing_level"] = "biasadjusted"
×
570
        facets["bias_adjust_project"] = "CanDCS-U6"
×
571
        facets["source"] = data["GCM__source_id"]
×
572
        facets["type"] = "simulation"
×
573
        facets["variable"] = variable
×
574

575
        facets["version"] = f"v{data.get('GCM__data_specs_version')}"
×
576
        if facets["version"] is None:
×
577
            facets.update(find_version_hash(file=file))
×
578

579
        facets.update(cls._decode_hour_of_day_info(file=file))
×
580

581
        try:
×
582
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
×
583
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
×
584
            facets["date_start"] = date_parser(date)
×
585
            facets["date_end"] = date_parser(date, end_of_period=True)
×
586
        except DecoderError:  # noqa: S110
×
587
            pass
×
588

589
        return facets
×
590

591
    @classmethod
10✔
592
    def decode_cmip6(cls, file: PathLike | str) -> dict:
10✔
593
        facets = dict()
×
594
        try:
×
595
            variable, date, data = cls._from_dataset(file=file)
×
596
        except DecoderError:
×
597
            return facets
×
598

599
        facets["activity"] = data["activity_id"]
×
600
        facets["date"] = date
×
601
        facets["domain"] = "global"
×
602
        facets["experiment"] = data["experiment_id"]
×
603
        facets["format"] = "netcdf"
×
604
        facets["grid_label"] = data["grid_label"]
×
605
        facets["institution"] = data["institution_id"]
×
606
        facets["member"] = data["variant_label"]
×
607
        facets["modeling_realm"] = data["realm"]
×
608
        facets["processing_level"] = "raw"
×
609
        facets["mip_era"] = data["mip_era"]
×
610
        facets["source"] = data["source_id"]
×
611
        facets["type"] = "simulation"
×
612
        facets["variable"] = variable
×
613
        facets.update(cls._decode_version(data=data, file=file))
×
614
        facets.update(cls._decode_hour_of_day_info(file=file))
×
615

616
        try:
×
617
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
×
618
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
×
619
            facets["date_start"] = date_parser(date)
×
620
            facets["date_end"] = date_parser(date, end_of_period=True)
×
621
        except DecoderError:  # noqa: S110
×
622
            pass
×
623

624
        return facets
×
625

626
    @classmethod
10✔
627
    def decode_cmip5(cls, file: PathLike | str) -> dict:
10✔
628
        facets = dict()
×
629
        try:
×
630
            variable, date, data = cls._from_dataset(file=file)
×
631
        except DecoderError:
×
632
            return facets
×
633

634
        facets["activity"] = "CMIP"
×
635
        facets["date"] = date
×
636
        facets["domain"] = "global"
×
637
        facets["experiment"] = data["experiment_id"]
×
638
        facets["format"] = "netcdf"
×
639
        facets["institution"] = data["institute_id"]
×
640
        facets["member"] = data["parent_experiment_rip"]
×
641
        facets["modeling_realm"] = data["modeling_realm"]
×
642
        facets["processing_level"] = "raw"
×
643
        facets["mip_era"] = data["project_id"]
×
644
        facets["source"] = data["model_id"]
×
645
        facets["type"] = "simulation"
×
646
        facets["variable"] = variable
×
647
        facets.update(cls._decode_version(data=data, file=file))
×
648
        facets.update(cls._decode_hour_of_day_info(file=file))
×
649

650
        try:
×
651
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
×
652
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
×
653
            facets["date_start"] = date_parser(date)
×
654
            facets["date_end"] = date_parser(date, end_of_period=True)
×
655
        except DecoderError:  # noqa: S110
×
656
            pass
×
657

658
        return facets
×
659

660
    @classmethod
10✔
661
    def decode_cordex(cls, file: PathLike | str) -> dict:
10✔
662
        facets = dict()
×
663
        try:
×
664
            variable, date, data = cls._from_dataset(file=file)
×
665
        except DecoderError:
×
666
            return dict()
×
667

668
        # FIXME: What to do about our internal data that breaks all established conventions?
669
        facets["activity"] = "CORDEX"
×
670

671
        if data.get("project_id") == "" or data.get("project_id") is None:
×
672
            facets["mip_era"] = "internal"
×
673
        elif data.get("project_id") == "CORDEX":
×
674
            facets["mip_era"] = "CMIP5"
×
675

676
        if date == "r0i0p0":
×
677
            facets["date"] = "fx"
×
678
        else:
679
            facets["date"] = date
×
680

681
        domain = data.get("CORDEX_domain")
×
682
        if domain:
×
683
            facets["domain"] = domain.strip()
×
684
        else:
685
            domain = data.get("ouranos_domain_name")
×
686
            if domain:
×
687
                facets["domain"] = domain.strip()
×
688
            else:
689
                msg = f"File {Path(file).name} has a nonstandard domain name."
×
690
                logger.error(msg)
×
691
                raise NotImplementedError(msg)
×
692

693
        # CORDEX-NAM on AWS mis-attributes the domain (22/44 should be 22i/44i)
694
        aws_keys = data.get("intake_esm_dataset_key")
×
695
        if aws_keys:
×
696
            facets["domain"] = aws_keys.split(".")[3]
×
697

698
        title = data.get("title")
×
699
        if title:
×
700
            regridded_domain_found = re.search(r"\w{3}-\d{2}i", title)
×
701
            if regridded_domain_found:
×
702
                facets["domain"] = regridded_domain_found.group()
×
703

704
        # The logic here is awful, but the information is bad to begin with.
705
        driving_model = ""
×
706
        driving_institution = ""
×
707

708
        driving_institution_parts = str(data["driving_model_id"]).split("-")
×
709
        if VALIDATION_ENABLED:
×
710
            if driving_institution_parts[0] in INSTITUTIONS:
×
711
                driving_institution = driving_institution_parts[0]
×
712
            elif "-".join(driving_institution_parts[:2]) in INSTITUTIONS:
×
713
                driving_institution = "-".join(driving_institution_parts[:2])
×
714
            elif "-".join(driving_institution_parts[:3]) in INSTITUTIONS:
×
715
                driving_institution = "-".join(driving_institution_parts[:3])
×
716
        else:
717
            logger.warning("CORDEX Metadata validation checks require PyESSV. Driving institution cannot be determined.")
×
718
            driving_model = data["driving_model_id"]
×
719

720
        if data["driving_model_id"].startswith("GFDL"):
×
721
            driving_institution = "NOAA-GFDL"
×
722
            driving_model = f"NOAA-GFDL-{data['driving_model_id']}"
×
723
        elif data["driving_model_id"].startswith("MPI-ESM"):
×
724
            driving_institution = "MPI-M"
×
725
            driving_model = f"MPI-M-{data['driving_model_id']}"
×
726
        elif data["driving_model_id"].startswith("HadGEM2"):
×
727
            driving_institution = "MOHC"
×
728
            driving_model = f"MOHC-{data['driving_model_id']}"
×
729
        elif data["driving_model_id"].startswith("CNRM-CM5"):
×
730
            driving_institution = "CNRM-CERFACS"
×
731
            driving_model = f"CNRM-CERFACS-{data['driving_model_id']}"
×
732

733
        elif VALIDATION_ENABLED and not driving_institution:
×
734
            raise DecoderError(f"driving_institution (from driving_model_id: `{data['driving_model_id']}`) is not valid.")
×
735

736
        facets["driving_institution"] = driving_institution.strip()
×
737
        if driving_model:
×
738
            facets["driving_model"] = driving_model.strip()
×
739
        else:
740
            facets["driving_model"] = str(data["driving_model_id"]).strip()
×
741

742
        facets["format"] = "netcdf"
×
743

744
        if data["institute_id"].strip() == "Our.":
×
745
            facets["institution"] = "Ouranos"
×
746
        else:
747
            facets["institution"] = data["institute_id"].strip()
×
748

749
        facets["processing_level"] = "raw"
×
750
        facets["source"] = data["model_id"]
×
751
        facets["type"] = "simulation"
×
752
        facets["variable"] = variable
×
753

754
        facets.update(cls._decode_version(data=data, file=file))
×
755
        facets.update(cls._decode_hour_of_day_info(file=file))
×
756

757
        try:
×
758
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
×
759
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
×
760
            facets["date_start"] = date_parser(date)
×
761
            facets["date_end"] = date_parser(date, end_of_period=True)
×
762
        except DecoderError:  # noqa: S110
×
763
            pass
×
764

765
        try:
×
766
            facets["experiment"] = data["experiment_id"].strip()
×
767
        except KeyError:
×
768
            facets["experiment"] = data["driving_experiment_name"].strip()
×
769

770
        try:
×
771
            for potential_member in ["parent_experiment_rip", "parent_experiment"]:
×
772
                facets["member"] = data.get(potential_member)
×
773
                if facets["member"] == "N/A":
×
774
                    raise KeyError()
×
775
                else:
776
                    break
×
777
            if facets["member"] is None:
×
778
                raise KeyError()
×
779
        except KeyError:
×
780
            facets["member"] = data["driving_model_ensemble_member"].strip()
×
781

782
        return facets
×
783

784
    @classmethod
    def decode_isimip_ft(cls, file: PathLike | str) -> dict:
        """
        Decode dataset facets from an ISIMIP fast-track (ISIMIP-FT) file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            Facet metadata gathered from the file's global attributes;
            empty if the file cannot be opened or recognized.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/unrecognized file: return no facets rather than fail.
            return facets

        facets["activity"] = "ISIMIP"
        facets["mip_era"] = data["project_id"]
        facets["date"] = date
        facets["domain"] = "global"
        facets["co2_forcing_id"] = data["co2_forcing_id"]
        facets["experiment"] = data["experiment_id"]
        facets["format"] = "netcdf"
        facets["impact_model"] = data["impact_model_id"]
        facets["institution"] = data["institute_id"]
        facets["member"] = data["driving_model_ensemble_member"]
        facets["modeling_realm"] = data["modeling_realm"]
        facets["social_forcing_id"] = data["social_forcing_id"]
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            # Pass `file` as well so the filename can serve as a fallback for
            # frequency detection, consistent with the other decode_* methods
            # (previously only `data` was passed here).
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time facets are best-effort; omit them when undecodable.
            pass

        return facets
820

821
    @classmethod
    def decode_nex_gddp_cmip6(cls, file: PathLike | str) -> dict:
        """
        Decode dataset facets from a NEX-GDDP-CMIP6 file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            Facet metadata gathered from the file's global attributes;
            empty if the file cannot be opened or recognized.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/unrecognized file: return no facets rather than fail.
            return {}

        experiment = data["scenario"]
        facets = {
            "experiment": experiment,
            # The historical run belongs to the CMIP activity; all others to ScenarioMIP.
            "activity": "CMIP" if experiment == "historical" else "ScenarioMIP",
            "institution": data["cmip6_institution_id"],
            "member": data["variant_label"],
            "processing_level": "biasadjusted",
            "bias_adjust_project": "NEX-GDDP-CMIP6",
            "bias_adjust_institution": "NASA",
            "mip_era": "CMIP6",
            "source": data["cmip6_source_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time facets are best-effort; omit them when undecodable.
            pass

        return facets
852

853
    @classmethod
    def decode_espo_g6_r2(cls, file: PathLike | str) -> dict:
        """
        Decode dataset facets from an ESPO-G6-R2 file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            Facet metadata gathered from the file's global attributes;
            empty if the file cannot be opened or recognized.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/unrecognized file: return no facets rather than fail.
            return {}

        facets = {
            "bias_adjust_project": "ESPO-G6-R2",
            "processing_level": "biasadjusted",
            # Version and domain are fixed for this project.
            "version": "1.0.0",
            "domain": "NAM",
        }
        # These facets are mirrored directly from the dataset's "cat:*" attributes.
        for key in (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        ):
            facets[key] = data[f"cat:{key}"]
        facets["variable"] = variable
        # NOTE: cls._decode_version is intentionally not applied here;
        # the version is pinned above.
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time facets are best-effort; omit them when undecodable.
            pass

        return facets
889

890
    @classmethod
    def decode_espo_g6_e5l(cls, file: PathLike | str) -> dict:
        """
        Decode dataset facets from an ESPO-G6-E5L file.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            Facet metadata gathered from the file's global attributes;
            empty if the file cannot be opened or recognized.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/unrecognized file: return no facets rather than fail.
            return {}

        facets = {
            "bias_adjust_project": "ESPO-G6-E5L",
            "processing_level": "biasadjusted",
            # Version and domain are fixed for this project.
            "version": "1.0.0",
            "domain": "NAM",
        }
        # These facets are mirrored directly from the dataset's "cat:*" attributes.
        for key in (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        ):
            facets[key] = data[f"cat:{key}"]
        facets["variable"] = variable
        # NOTE: cls._decode_version is intentionally not applied here;
        # the version is pinned above.
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time facets are best-effort; omit them when undecodable.
            pass

        return facets
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc