• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ouranosinc / miranda / 19180774301

07 Nov 2025 08:45PM UTC coverage: 18.273%. First build
19180774301

Pull #292

github

web-flow
Merge 7a36937e2 into be9cff2f6
Pull Request #292: [pre-commit.ci] pre-commit autoupdate

8 of 57 new or added lines in 2 files covered. (14.04%)

1225 of 6704 relevant lines covered (18.27%)

1.27 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

11.62
/src/miranda/decode/_decoder.py
1
from __future__ import annotations
7✔
2
import logging
7✔
3
import multiprocessing as mp
7✔
4
import os
7✔
5
import re
7✔
6
import warnings
7✔
7
from functools import partial
7✔
8
from os import PathLike
7✔
9
from pathlib import Path
7✔
10
from types import GeneratorType
7✔
11

12
import h5netcdf
7✔
13
import pandas as pd
7✔
14
import schema
7✔
15
import xarray as xr
7✔
16
import zarr
7✔
17
from pandas._libs.tslibs import NaTType  # noqa
7✔
18

19
from miranda.convert.utils import date_parser, find_version_hash  # noqa
7✔
20
from miranda.cv import VALIDATION_ENABLED
7✔
21
from miranda.units import check_time_frequency
7✔
22

23
from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError
7✔
24

25

26
# netCDF4 is an optional dependency: prefer it when installed, otherwise the
# readers below fall back to h5netcdf (they check `nc is not None`).
nc = None
try:
    import netCDF4 as nc  # noqa: F401,N813
except ImportError:  # noqa: S110
    pass
31

32
if VALIDATION_ENABLED:
7✔
33
    from miranda.cv import INSTITUTIONS, PROJECT_MODELS
7✔
34
    from miranda.validate import FACETS_SCHEMA  # noqa
7✔
35

36

37
logger = logging.getLogger("miranda.decode.decoder")
7✔
38

39
__all__ = [
7✔
40
    "Decoder",
41
    "guess_project",
42
]
43

44

45
def guess_project(file: os.PathLike | str) -> str:
    """
    Guess the name of the project from a file name.

    The file stem is split on underscores and each segment is compared
    against the model names registered for every known project.

    Parameters
    ----------
    file : str or os.PathLike
        Path whose stem encodes a model name.

    Returns
    -------
    str
        The matching project name.

    Raises
    ------
    DecoderError
        If no project can be determined, or validation support is unavailable.
    """
    file_name = Path(file).stem

    potential_names = file_name.split("_")
    if VALIDATION_ENABLED:
        for project, models in PROJECT_MODELS.items():
            # Generator expression: no need to materialize a list for `any`.
            if any(model in potential_names for model in models):
                return project
        raise DecoderError(f"Unable to determine project from file name: '{file_name}'.")
    raise DecoderError("Project determination requires pyessv-archive source files.")
66

67

68
class Decoder:
    # Project name used to select the `decode_*` method; may be None.
    project = None
    # When True and `project` is None, `_decoder` attempts `guess_project` per file.
    guess = False
    # NOTE(review): class-level mutable dict — shared across ALL instances.
    # Confirm the sharing is intentional before moving it into __init__.
    _file_facets = dict()

    def __init__(self, project: str | None):
        # Shadows the class attribute with the per-instance project name.
        self.project = project

76
    @staticmethod
    def _decoder(
        d: dict,
        fail_early: bool,
        proj: str,
        guess: bool,
        lock: mp.Lock,
        file: str | Path,
    ) -> None:
        """
        Worker: decode facets for a single file into the shared mapping `d`.

        Parameters
        ----------
        d : dict
            Shared (manager) dict receiving `{file: facets}` results.
        fail_early : bool
            Re-raise project-guessing errors and validate decoded facets immediately.
        proj : str
            Project name; when None it is guessed (if `guess`) or defaults to "converted".
        guess : bool
            Whether to attempt `guess_project` when `proj` is None.
        lock : mp.Lock
            Manager lock serializing dataset access across worker processes.
        file : str or Path
            The file to decode.
        """
        if proj is None:
            if guess:
                try:
                    proj = guess_project(file)
                except DecoderError:
                    print(f"Unable to determine 'activity': Signature for 'activity' must be set manually for file: {file}.")
                    if fail_early:
                        raise
            else:
                proj = "converted"

        # Dispatch dynamically, e.g. proj="CMIP6" -> Decoder.decode_cmip6(Path(file)).
        decode_function_name = f"decode_{proj.lower().replace('-', '_')}"
        try:
            with lock:
                _deciphered = getattr(Decoder, decode_function_name)(Path(file))
                if fail_early:
                    if VALIDATION_ENABLED:
                        FACETS_SCHEMA.validate(_deciphered)
                    else:
                        print("Validation requires pyessv-archive source files. Skipping validation checks.")
                print(f"Deciphered the following from {Path(file).name}:\n{_deciphered.items()}")
                d[file] = _deciphered

        # AttributeError: no decode_* method for this project; NotImplementedError: stub decoder.
        except (AttributeError, NotImplementedError):
            print(f"Unable to read data from {Path(file)}. Ensure pathname is correct.")
            raise
        except schema.SchemaError as e:
            # Invalid facets are reported but do not abort the batch.
            print(f"Decoded facets from {Path(file).name} are not valid: {e}")

114
    def decode(
        self,
        files: os.PathLike | str | list[str | os.PathLike] | GeneratorType,
        chunks: int | None = None,
        raise_error: bool = False,
    ) -> None:
        """
        Decode facets from file or list of files.

        Parameters
        ----------
        files : str or Path or list of str or Path or generator
            The files to decode.
        chunks : int, optional
            The chunk size used when processing files. Not to be confused with xarray chunks for dimensions.
        raise_error : bool
            Whether to raise an error if a file cannot be decoded.

        Raises
        ------
        ValueError
            If an empty list of files is passed.
        """
        if isinstance(files, (str, os.PathLike)):
            files = [files]

        if chunks is not None:
            # FIX: a user-supplied `chunks` was previously ignored for generators
            # (the generator branch unconditionally forced a chunk size of 10).
            chunk_size = chunks
        elif isinstance(files, list):
            if len(files) >= 10:
                chunk_size = 10
            elif files:
                chunk_size = len(files)
            else:
                raise ValueError("No file entries found.")
        else:
            # Generators (or other iterables) of unknown length: a sane default.
            # Also guarantees `chunk_size` is never None for Pool.imap below.
            chunk_size = 10

        if self.project is None:
            warnings.warn("The decoder 'project' is not set; Decoding step will be much slower.", stacklevel=2)
        else:
            msg = f"Deciphering metadata with project = '{self.project}'"
            logger.info(msg)

        with mp.Manager() as manager:
            # Manager dict/lock are shareable across the worker processes.
            _file_facets = manager.dict()
            lock = manager.Lock()
            func = partial(self._decoder, _file_facets, raise_error, self.project, self.guess, lock)

            with mp.Pool() as pool:
                # imap enqueues tasks lazily; close+join drains them all.
                pool.imap(func, files, chunksize=chunk_size)
                pool.close()
                pool.join()

            # Copy out of the manager proxy before it is torn down.
            self._file_facets.update(_file_facets)
164

165
    def facets_table(self):
        """Tabulate decoded facets. Not yet implemented."""
        raise NotImplementedError()
167

168
    def file_facets(self) -> dict[os.PathLike, dict]:
7✔
169
        return self._file_facets
×
170

171
    @classmethod
    def _from_dataset(cls, file: Path | str) -> (str, str, dict):
        """
        Extract the primary variable, datetime signature, and global attributes.

        Parameters
        ----------
        file : Path or str
            Path to a netCDF file or zarr store.

        Returns
        -------
        tuple of (str, str, dict)
            Primary variable name, the datetime segment of the filename,
            and the dataset's global attributes.

        Raises
        ------
        DecoderError
            If the dataset cannot be opened or its format is unsupported.
        """
        # FIX: the signature admits `str`, but `is_file`/`suffix`/`name` below
        # require a Path — coerce up front.
        file = Path(file)
        file_name = file.stem

        try:
            variable_name = cls._decode_primary_variable(file)
        except DecoderError:
            msg = f"Unable to open dataset: {file.name}"
            logger.error(msg)
            raise

        # By convention, the final underscore-delimited segment is the date range.
        datetimes = file_name.split("_")[-1]

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            if nc is not None:
                # netCDF4 path: global attributes via ncattrs().
                with nc.Dataset(file, mode="r") as ds:
                    data = dict()
                    for k in ds.ncattrs():
                        data[k] = getattr(ds, k)
            else:
                # h5netcdf fallback: global attributes via the attrs mapping.
                with h5netcdf.File(file, mode="r") as ds:
                    data = dict()
                    for k in ds.attrs:
                        data[k] = ds.attrs[k]
        elif file.is_dir() and file.suffix == ".zarr":
            with zarr.open(file, mode="r") as ds:
                data = ds.attrs.asdict()
        else:
            raise DecoderError(f"Unable to read dataset: `{file.name}`.")

        return variable_name, datetimes, data
202

203
    @staticmethod
    def _decode_primary_variable(file: Path) -> str:
        """
        Attempts to find the primary variable of a netCDF

        Parameters
        ----------
        file: Path

        Returns
        -------
        str
        """
        # Maps each variable name to its attribute dict (netCDF branches only).
        dimsvar_dict = dict()
        # Coordinate/auxiliary names that can never be the primary variable.
        coords = (
            "height",
            "lat",
            "latitude",
            "lev",
            "level",
            "lon",
            "longitude",
            "rlat",
            "rlon",
            "rotated_pole",
            "time",
        )
        try:
            if file.is_file() and file.suffix in [".nc", ".nc4"]:
                # Prefer netCDF4 when installed; otherwise fall back to h5netcdf.
                if nc is not None:
                    with nc.Dataset(file, mode="r") as ds:
                        for var_name, var_attrs in ds.variables.items():
                            dimsvar_dict[var_name] = {k: var_attrs.getncattr(k) for k in var_attrs.ncattrs()}
                else:
                    with h5netcdf.File(file, mode="r") as ds:
                        for var_name, var_attrs in ds.variables.items():
                            dimsvar_dict[var_name] = {k: var_attrs.attrs[k] for k in var_attrs.attrs}
                # A variable qualifies when it is not a coordinate (note: tuple
                # argument to startswith) and its name appears in the file stem.
                for k in dimsvar_dict.keys():
                    if not str(k).startswith(coords) and k in file.stem:
                        return str(k)

            elif file.is_dir() and file.suffix == ".zarr":
                with zarr.open(str(file), mode="r") as ds:
                    for k in ds.array_keys():
                        if not str(k).startswith(coords) and k in file.stem:
                            return str(k)
            else:
                msg = "File format is not supported."
                raise NotImplementedError(msg)
        except ValueError as err:
            msg = f"Unable to open dataset: {file.name}"
            raise DecoderError(msg) from err
        # NOTE(review): when no candidate matches, control falls through and the
        # function implicitly returns None despite the `str` annotation —
        # confirm callers tolerate None before changing this to raise.
255

256
    @staticmethod
    def _decode_hour_of_day_info(
        file: PathLike | str,
    ) -> dict:
        """
        Decode hour of day information.

        Parameters
        ----------
        file : Path or str
            Path to a netCDF file or zarr store.

        Returns
        -------
        dict
            `{"hour_of_day": int or None}` when netCDF4 is available;
            an empty dict when only h5netcdf/zarr can be used.
        """
        if isinstance(file, str):
            file = Path(file)

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            # Only the netCDF4 backend can convert the first time value to a date.
            if nc is not None:
                with nc.Dataset(file, mode="r") as ds:
                    if "time" in ds.variables.keys():
                        hour = nc.num2date(ds["time"][0], ds["time"].units, ds["time"].calendar).hour
                    else:
                        # No time axis: fixed-field dataset.
                        hour = None
                return dict(hour_of_day=hour)
            else:
                warnings.warn("This is not currently implemented for h5netcdf. Install netCDF4.", stacklevel=2)
                return dict()

        elif file.is_dir() and file.suffix == ".zarr":
            warnings.warn("This is not currently implemented for zarr. Install netCDF4.", stacklevel=2)
            return dict()

        else:
            # Neither a recognized netCDF file nor a zarr store.
            raise NotImplementedError()
292

293
    @staticmethod
7✔
294
    def _decode_time_info(  # noqa: C901
7✔
295
        file: PathLike | str | list[str] | None = None,
296
        data: dict | None = None,
297
        term: str | None = None,
298
        *,
299
        field: str | None = None,
300
    ) -> str | NaTType:
301
        """
302
        Decode time information.
303

304
        Parameters
305
        ----------
306
        file : os.PathLike or str, optional
307
        data : dict, optional
308
        term : str
309
        field : {"timedelta", "frequency"}
310

311
        Returns
312
        -------
313
        str or NaTType
314
        """
315
        if not file and not data and not term:
×
316
            raise ValueError("Nothing passed to parse time info from.")
×
317

318
        if field == "frequency":
×
319
            time_dictionary = TIME_UNITS_TO_FREQUENCY
×
320
        elif field == "timedelta":
×
321
            time_dictionary = TIME_UNITS_TO_TIMEDELTA
×
322
        else:
323
            raise NotImplementedError()
×
324

325
        if term:
×
326
            if term in ["fx", "fixed"]:
×
327
                if field == "timedelta":
×
328
                    return pd.NaT
×
329
                return "fx"
×
330
            return pd.to_timedelta(time_dictionary[term])
×
331

332
        if data and not file:
×
333
            potential_time = data.get("frequency")
×
334
            if not potential_time:
×
335
                if hasattr(data, "time"):
×
336
                    time_units = data["time"].units
×
337
                    potential_time = time_units.split()[0]
×
338
                else:
339
                    msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."
×
340

341
                    logger.warning(msg)
×
342
                    potential_time = "fx"
×
343
            if potential_time in ["ymon", "yseas", "fixed", "fx"]:
×
344
                msg = f"Found `{potential_time}`. Frequency is likely `fx`."
×
345
                logger.warning(msg)
×
346
                if field == "frequency":
×
347
                    return "fx"
×
348
                if field == "timedelta":
×
349
                    return pd.NaT
×
350
                raise ValueError()
×
351

352
            if field == "timedelta":
×
353
                if potential_time in ["fx", "fixed"]:
×
354
                    return pd.NaT
×
355
                return pd.to_timedelta(time_dictionary[potential_time])
×
356
            return time_dictionary[potential_time]
×
357

358
        if file and not data:
×
359
            for delimiter in ["_", "."]:
×
360
                file_parts = Path(file).stem.split(delimiter)
×
361
                potential_times = [segment for segment in file_parts if segment in time_dictionary.keys()]
×
362
                if potential_times:
×
363
                    if potential_times[0] in ["fx", "fixed"]:
×
364
                        if field == "frequency":
×
365
                            return "fx"
×
366
                        if field == "timedelta":
×
367
                            return pd.NaT
×
368
                        raise ValueError(f"Field `{field}` not supported.")
×
369
                    if field == "timedelta":
×
370
                        return pd.to_timedelta(time_dictionary[potential_times[0]])
×
371
                    return time_dictionary[potential_times[0]]
×
372

373
        if file and data:
×
374
            for delimiter in ["_", "."]:
×
375
                file_parts = Path(file).stem.split(delimiter)
×
376
                potential_times = [segment for segment in file_parts if segment in time_dictionary.keys()]
×
377
                potential_time = data.get("frequency", "")
×
378
                if potential_time == "":
×
379
                    if hasattr(data, "time"):
×
380
                        time_units = data["time"].units
×
381
                        potential_time = time_units.split()[0]
×
382
                    else:
383
                        msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."
×
384

385
                        logger.warning(msg)
×
386
                        potential_time = "fx"
×
387
                if potential_time in ["ymon", "yseas", "fixed", "fx"]:
×
388
                    msg = f"Found `{potential_time}`. Frequency is likely `fx`."
×
389

390
                    logger.warning(msg)
×
391
                    if "fx" in file_parts or "fixed" in file_parts:
×
392
                        if field == "frequency":
×
393
                            return "fx"
×
394
                        if field == "timedelta":
×
395
                            return pd.NaT
×
396
                        raise ValueError(f"Field `{field}` not supported.")
×
397

398
                if potential_time in potential_times:
×
399
                    return time_dictionary[potential_time]
×
400
                elif potential_times:
×
401
                    break
×
402

403
            msg = (
×
404
                f"Frequency from metadata (`{potential_time}`) not found in filename (`{Path(file).name}`): "
405
                "Performing more rigorous frequency checks."
406
            )
407
            logger.warning(msg)
×
408
            if Path(file).is_file() and Path(file).suffix in [".nc", ".nc4"]:
×
409
                engine = "h5netcdf"
×
410
            elif Path(file).is_dir() and Path(file).suffix == ".zarr":
×
411
                engine = "zarr"
×
412
            else:
413
                raise DecoderError(f"File is not valid netcdf or zarr: {Path(file).name}")
×
414

415
            _ds = xr.open_dataset(
×
416
                file,
417
                engine=engine,
418
                drop_variables="time_bnds",
419
            )
420
            if not hasattr(_ds, "time"):
×
421
                logger.warning("Dataset does not contain time array. Assuming fixed variable.")
×
422
                if field == "frequency":
×
423
                    return "fx"
×
424
                if field == "timedelta":
×
425
                    return pd.NaT
×
426
                raise ValueError(f"Field `{field}` not supported.")
×
427
            else:
428
                _, found_freq = check_time_frequency(_ds.time)
×
429

430
            if found_freq in potential_times:
×
431
                msg = (
×
432
                    "Time frequency found in dataset on analysis was found in filename. "
433
                    f"Metadata for `{Path(file).name} is probably incorrect. "
434
                    f"Basing fields on `{found_freq}`."
435
                )
436
                logger.warning(msg)
×
437
                return time_dictionary[found_freq]
×
438
            elif found_freq in ["month", "mon"]:
×
439
                for f in ["Amon", "Omon", "monC", "monthly", "months", "mon"]:
×
440
                    if f in potential_times:
×
441
                        msg = f"Month-like time frequency found in dataset on analysis was found in filename. Basing fields on `{f}`."
×
442
                        logger.warning(msg)
×
443
                        return time_dictionary[f]
×
444
            else:
445
                msg = f"Time frequency found in dataset on analysis was not found in filename. Basing fields on `{found_freq}`."
×
446
                logger.warning(msg)
×
447
                return time_dictionary[found_freq]
×
448
        raise DecoderError(f"Time frequency indiscernible for file `{file}`.")
×
449

450
    @staticmethod
7✔
451
    def _decode_version(file: PathLike | str, data: dict) -> dict:
7✔
452
        """
453
        Decode version information.
454

455
        Parameters
456
        ----------
457
        file : os.PathLike or str
458
        data : dict
459

460
        Returns
461
        -------
462
        dict
463
        """
464
        version_info = dict()
×
465
        try:
×
466
            version_info["version"] = data["version"]
×
467
        except KeyError:
×
468
            possible_version = Path(file).parent
×
469
            if re.match(r"^[vV]\d+", possible_version.name):
×
470
                version_info["version"] = possible_version.name
×
471
            else:
472
                possible_version_signature = possible_version.glob(f"{Path(file).stem}.v*")
×
473
                for sig in possible_version_signature:
×
474
                    found_version = re.match(r"([vV]\d+)$", sig.suffix)
×
475
                    if found_version:
×
476
                        version_info["version"] = found_version.group()
×
477
                        version_info["sha256sum"] = sig.open().read()
×
478
                        break
×
479
                else:
480
                    version_info["version"] = "vNotFound"
×
481
        return version_info
×
482

483
    @classmethod
    def decode_converted(cls, file: PathLike | str) -> dict:
        """
        Decode converted data.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset.

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(data)
        # FIX: `del facets["history"]` raised KeyError when the dataset had no
        # `history` global attribute; pop with a default is tolerant.
        facets.pop("history", None)

        facets["date"] = date

        file_format = data.get("output_format")
        if file_format:
            facets["format"] = file_format
        elif "format" in data:
            facets["format"] = data["format"]
        elif Path(file).suffix in [".nc", ".nc4"]:
            facets["format"] = "nc"
        elif Path(file).suffix in [".zarr"]:
            facets["format"] = "zarr"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            if "frequency" not in facets:
                # FIX: the frequency lookup was mistakenly stored under
                # `timedelta`, so `facets["frequency"]` below raised KeyError
                # (every sibling decoder assigns `facets["frequency"]` here).
                facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
531

532
    @staticmethod
7✔
533
    def decode_eccc_obs(self, file: PathLike | str) -> dict:
7✔
534
        raise NotImplementedError()
×
535

536
    @staticmethod
7✔
537
    def decode_ahccd_obs(self, file: PathLike | str) -> dict:
7✔
538
        raise NotImplementedError()
×
539

540
    @staticmethod
7✔
541
    def decode_melcc_obs(self, file: PathLike | str) -> dict:
7✔
542
        raise NotImplementedError()
×
543

544
    @classmethod
7✔
545
    def decode_pcic_candcs_u6(cls, file: PathLike | str) -> dict:
7✔
546
        if "Derived" in Path(file).parents:
×
547
            raise NotImplementedError("Derived CanDCS-U6 variables are not supported.")
×
548

549
        facets = dict()
×
550
        try:
×
551
            variable, date, data = cls._from_dataset(file=file)
×
552
        except DecoderError:
×
553
            return facets
×
554

555
        facets["activity"] = data["activity_id"]
×
556
        facets["mip_era"] = data["project_id"]
×
557
        facets["bias_adjust_institution"] = "PCIC"
×
558
        facets["date"] = date
×
559
        facets["domain"] = data["domain"]
×
560
        facets["experiment"] = str(data["GCM__experiment_id"]).replace(",", "-")
×
561
        facets["format"] = "netcdf"
×
562
        facets["institution"] = data["GCM__institution_id"]
×
563
        facets["member"] = (
×
564
            f"r{data['GCM__realization_index']}i{data['GCM__initialization_index']}p{data['GCM__physics_index']}f{data['GCM__forcing_index']}"
565
        )
566
        facets["processing_level"] = "biasadjusted"
×
567
        facets["bias_adjust_project"] = "CanDCS-U6"
×
568
        facets["source"] = data["GCM__source_id"]
×
569
        facets["type"] = "simulation"
×
570
        facets["variable"] = variable
×
571

572
        facets["version"] = f"v{data.get('GCM__data_specs_version')}"
×
573
        if facets["version"] is None:
×
574
            facets.update(find_version_hash(file=file))
×
575

576
        facets.update(cls._decode_hour_of_day_info(file=file))
×
577

578
        try:
×
579
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
×
580
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
×
581
            facets["date_start"] = date_parser(date)
×
582
            facets["date_end"] = date_parser(date, end_of_period=True)
×
583
        except DecoderError:  # noqa: S110
×
584
            pass
×
585

586
        return facets
×
587

588
    @classmethod
    def decode_cmip6(cls, file: PathLike | str) -> dict:
        """Decode CMIP6 facets from a dataset's global attributes and file name."""
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: nothing to report.
            return dict()

        facets = {
            "activity": data["activity_id"],
            "date": date,
            "domain": "global",
            "experiment": data["experiment_id"],
            "format": "netcdf",
            "grid_label": data["grid_label"],
            "institution": data["institution_id"],
            "member": data["variant_label"],
            "modeling_realm": data["realm"],
            "processing_level": "raw",
            "mip_era": data["mip_era"],
            "source": data["source_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
622

623
    @classmethod
    def decode_cmip5(cls, file: PathLike | str) -> dict:
        """Decode CMIP5 facets from a dataset's global attributes and file name."""
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: nothing to report.
            return dict()

        facets = {
            "activity": "CMIP",
            "date": date,
            "domain": "global",
            "experiment": data["experiment_id"],
            "format": "netcdf",
            "institution": data["institute_id"],
            "member": data["parent_experiment_rip"],
            "modeling_realm": data["modeling_realm"],
            "processing_level": "raw",
            "mip_era": data["project_id"],
            "source": data["model_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
656

657
    @classmethod
    def decode_cordex(cls, file: PathLike | str) -> dict:
        """
        Decode CORDEX facets from a dataset's global attributes and file name.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset.

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.

        Raises
        ------
        NotImplementedError
            If no recognizable domain name can be found.
        DecoderError
            If validation is enabled and the driving institution cannot be resolved.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # FIXME: What to do about our internal data that breaks all established conventions?
        facets["activity"] = "CORDEX"

        if data.get("project_id") == "" or data.get("project_id") is None:
            facets["mip_era"] = "internal"
        elif data.get("project_id") == "CORDEX":
            facets["mip_era"] = "CMIP5"

        # An "r0i0p0" date segment marks a fixed (time-invariant) field.
        if date == "r0i0p0":
            facets["date"] = "fx"
        else:
            facets["date"] = date

        domain = data.get("CORDEX_domain")
        if domain:
            facets["domain"] = domain.strip()
        else:
            domain = data.get("ouranos_domain_name")
            if domain:
                facets["domain"] = domain.strip()
            else:
                msg = f"File {Path(file).name} has a nonstandard domain name."
                logger.error(msg)
                raise NotImplementedError(msg)

        # CORDEX-NAM on AWS mis-attributes the domain (22/44 should be 22i/44i)
        aws_keys = data.get("intake_esm_dataset_key")
        if aws_keys:
            facets["domain"] = aws_keys.split(".")[3]

        title = data.get("title")
        if title:
            regridded_domain_found = re.search(r"\w{3}-\d{2}i", title)
            if regridded_domain_found:
                facets["domain"] = regridded_domain_found.group()

        # The logic here is awful, but the information is bad to begin with.
        driving_model = ""
        driving_institution = ""

        # Try progressively longer hyphenated prefixes of driving_model_id as an
        # institution name (e.g. "CNRM", "CNRM-CERFACS", ...).
        driving_institution_parts = str(data["driving_model_id"]).split("-")
        if VALIDATION_ENABLED:
            if driving_institution_parts[0] in INSTITUTIONS:
                driving_institution = driving_institution_parts[0]
            elif "-".join(driving_institution_parts[:2]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:2])
            elif "-".join(driving_institution_parts[:3]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:3])
        else:
            logger.warning("CORDEX Metadata validation checks require PyESSV. Driving institution cannot be determined.")
            driving_model = data["driving_model_id"]

        # Known models whose driving_model_id omits the institution prefix.
        if data["driving_model_id"].startswith("GFDL"):
            driving_institution = "NOAA-GFDL"
            driving_model = f"NOAA-GFDL-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("MPI-ESM"):
            driving_institution = "MPI-M"
            driving_model = f"MPI-M-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("HadGEM2"):
            driving_institution = "MOHC"
            driving_model = f"MOHC-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("CNRM-CM5"):
            driving_institution = "CNRM-CERFACS"
            driving_model = f"CNRM-CERFACS-{data['driving_model_id']}"

        elif VALIDATION_ENABLED and not driving_institution:
            raise DecoderError(f"driving_institution (from driving_model_id: `{data['driving_model_id']}`) is not valid.")

        facets["driving_institution"] = driving_institution.strip()
        if driving_model:
            facets["driving_model"] = driving_model.strip()
        else:
            facets["driving_model"] = str(data["driving_model_id"]).strip()

        facets["format"] = "netcdf"

        # Internal Ouranos data abbreviates the institute name.
        if data["institute_id"].strip() == "Our.":
            facets["institution"] = "Ouranos"
        else:
            facets["institution"] = data["institute_id"].strip()

        facets["processing_level"] = "raw"
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        try:
            facets["experiment"] = data["experiment_id"].strip()
        except KeyError:
            facets["experiment"] = data["driving_experiment_name"].strip()

        # Member: first usable value among the parent_experiment keys; "N/A" or
        # a missing value falls back to driving_model_ensemble_member.
        try:
            for potential_member in ["parent_experiment_rip", "parent_experiment"]:
                facets["member"] = data.get(potential_member)
                if facets["member"] == "N/A":
                    raise KeyError()
                else:
                    break
            if facets["member"] is None:
                raise KeyError()
        except KeyError:
            facets["member"] = data["driving_model_ensemble_member"].strip()

        return facets
780

781
    @classmethod
    def decode_isimip_ft(cls, file: PathLike | str) -> dict:
        """
        Decode an ISIMIP-FT dataset file into metadata facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            The decoded facets. Empty if the dataset could not be read
            (i.e. `_from_dataset` raised a `DecoderError`).
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/undecodable dataset: return no facets rather than fail.
            return facets

        facets["activity"] = "ISIMIP"
        facets["mip_era"] = data["project_id"]
        facets["date"] = date
        facets["domain"] = "global"
        facets["co2_forcing_id"] = data["co2_forcing_id"]
        facets["experiment"] = data["experiment_id"]
        facets["format"] = "netcdf"
        facets["impact_model"] = data["impact_model_id"]
        facets["institution"] = data["institute_id"]
        facets["member"] = data["driving_model_ensemble_member"]
        facets["modeling_realm"] = data["modeling_realm"]
        facets["social_forcing_id"] = data["social_forcing_id"]
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            # Pass `file` as well, consistent with the sibling decoders, so the
            # filename can serve as a fallback source for frequency information.
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time information is best-effort; keep whatever facets were found.
            pass

        return facets
×
817

818
    @classmethod
    def decode_nex_gddp_cmip6(cls, file: PathLike | str) -> dict:
        """
        Decode a NEX-GDDP-CMIP6 dataset file into metadata facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            The decoded facets. Empty if the dataset could not be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: nothing to decode.
            return {}

        experiment = data["scenario"]
        facets = {
            "experiment": experiment,
            # Historical runs belong to CMIP; projections to ScenarioMIP.
            "activity": "CMIP" if experiment == "historical" else "ScenarioMIP",
            "institution": data["cmip6_institution_id"],
            "member": data["variant_label"],
            "processing_level": "biasadjusted",
            "bias_adjust_project": "NEX-GDDP-CMIP6",
            "bias_adjust_institution": "NASA",
            "mip_era": "CMIP6",
            "source": data["cmip6_source_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time information is best-effort; keep whatever facets were found.
            pass

        return facets
×
849

850
    @classmethod
    def decode_espo_g6_r2(cls, file: PathLike | str) -> dict:
        """
        Decode an ESPO-G6-R2 dataset file into metadata facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            The decoded facets. Empty if the dataset could not be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: nothing to decode.
            return {}

        facets = {
            "bias_adjust_project": "ESPO-G6-R2",
            "processing_level": "biasadjusted",
            "version": "1.0.0",
            "domain": "NAM",
        }
        # These facets are carried over verbatim from the dataset's
        # catalogue-style "cat:"-prefixed global attributes.
        passthrough = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        for facet in passthrough:
            facets[facet] = data[f"cat:{facet}"]
        facets["variable"] = variable
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time information is best-effort; keep whatever facets were found.
            pass

        return facets
×
886

887
    @classmethod
    def decode_espo_g6_e5l(cls, file: PathLike | str) -> dict:
        """
        Decode an ESPO-G6-E5L dataset file into metadata facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            The decoded facets. Empty if the dataset could not be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable dataset: nothing to decode.
            return {}

        facets = {
            "bias_adjust_project": "ESPO-G6-E5L",
            "processing_level": "biasadjusted",
            "version": "1.0.0",
            "domain": "NAM",
        }
        # These facets are carried over verbatim from the dataset's
        # catalogue-style "cat:"-prefixed global attributes.
        passthrough = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        for facet in passthrough:
            facets[facet] = data[f"cat:{facet}"]
        facets["variable"] = variable
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time information is best-effort; keep whatever facets were found.
            pass

        return facets
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc