• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ouranosinc / miranda / 17249301051

26 Aug 2025 08:11PM UTC coverage: 18.642% (-0.08%) from 18.726%
17249301051

push

github

web-flow
Update cookiecutter, format code for new conventions (#266)

### What kind of change does this PR introduce?

* Updates the cookiecutter to the latest commit
* Removes `black`, `isort`, and `blackdoc`
* More `pre-commit` hooks have been added (most are disabled as they
require major refactoring efforts).

### Does this PR introduce a breaking change?

Development tooling has changed significantly. Running `black` or
`isort` will create inconsistent formatting.

### Other information:

`ruff` does nearly all that we need for code formatting. `flake8` is
still around for docstring-specific checks that are not possible via
`ruff` (requires dynamic checking).

49 of 456 new or added lines in 48 files covered. (10.75%)

207 existing lines in 40 files now uncovered.

1194 of 6405 relevant lines covered (18.64%)

1.29 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

11.46
/src/miranda/decode/_decoder.py
1
from __future__ import annotations
7✔
2
import logging
7✔
3
import multiprocessing as mp
7✔
4
import os
7✔
5
import re
7✔
6
import warnings
7✔
7
from functools import partial
7✔
8
from os import PathLike
7✔
9
from pathlib import Path
7✔
10
from types import GeneratorType
7✔
11

12
import netCDF4 as nc  # noqa
7✔
13
import pandas as pd
7✔
14
import schema
7✔
15
import xarray as xr
7✔
16
import zarr
7✔
17
from pandas._libs.tslibs import NaTType  # noqa
7✔
18

19
from miranda.convert.utils import date_parser, find_version_hash  # noqa
7✔
20
from miranda.cv import VALIDATION_ENABLED
7✔
21
from miranda.units import check_time_frequency
7✔
22

23
from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError
7✔
24

25

26
if VALIDATION_ENABLED:
7✔
27
    from miranda.cv import INSTITUTIONS, PROJECT_MODELS
7✔
28
    from miranda.validate import FACETS_SCHEMA  # noqa
7✔
29

30

31
logger = logging.getLogger("miranda.decode.decoder")
7✔
32

33
__all__ = [
7✔
34
    "Decoder",
35
    "guess_project",
36
]
37

38

39
def guess_project(file: os.PathLike | str) -> str:
    """
    Guess the name of the project from a file name.

    Parameters
    ----------
    file : str or os.PathLike
        The file whose underscore-delimited name is searched for a known model identifier.

    Returns
    -------
    str
        The first project whose model list contains a component of the file name.

    Raises
    ------
    DecoderError
        If no project can be determined, or validation source files are unavailable.
    """
    file_name = Path(file).stem

    # File names are underscore-delimited facets; one of them may be a model name.
    potential_names = file_name.split("_")
    if VALIDATION_ENABLED:
        for project, models in PROJECT_MODELS.items():
            # Generator expression: no need to materialize a list for any().
            if any(model in potential_names for model in models):
                return project
        raise DecoderError(f"Unable to determine project from file name: '{file_name}'.")
    raise DecoderError("Project determination requires pyessv-archive source files.")
60

61

62
class Decoder:
    """Decode dataset facets by dispatching to a per-project ``decode_*`` method."""

    project = None  # Name of the project whose decode_* method will be used.
    guess = False  # Whether to attempt project guessing when `project` is None.
    _file_facets = dict()  # Mapping of decoded file path -> facet dictionary.

    def __init__(self, project: str | None):
        """
        Parameters
        ----------
        project : str or None
            Project name used to select the decoding method. If None, decoding
            may fall back to guessing or the generic "converted" decoder.
        """
        self.project = project
69

70
    @staticmethod
    def _decoder(
        d: dict,
        fail_early: bool,
        proj: str,
        guess: bool,
        lock: mp.Lock,
        file: str | Path,
    ) -> None:
        """
        Decode facets for a single file and record them in the shared mapping.

        Parameters
        ----------
        d : dict
            Shared (manager) dictionary receiving ``file -> facets`` entries.
        fail_early : bool
            Whether to validate decoded facets and re-raise failures instead of continuing.
        proj : str
            Project name selecting the ``decode_<proj>`` method; may be None.
        guess : bool
            Whether to guess the project from the file name when ``proj`` is None.
        lock : mp.Lock
            Lock serializing dataset access across worker processes.
        file : str or Path
            The file to decode.
        """
        if proj is None:
            if guess:
                try:
                    proj = guess_project(file)
                except DecoderError:
                    print(f"Unable to determine 'activity': Signature for 'activity' must be set manually for file: {file}.")
                    if fail_early:
                        raise
                    # BUGFIX: previously fell through with proj=None and crashed
                    # below on `proj.lower()` with an uncaught AttributeError;
                    # skip this file instead.
                    return
            else:
                proj = "converted"

        # Dispatch by name, e.g. "CanDCS-U6" -> decode_candcs_u6.
        decode_function_name = f"decode_{proj.lower().replace('-', '_')}"
        try:
            with lock:
                _deciphered = getattr(Decoder, decode_function_name)(Path(file))
                if fail_early:
                    if VALIDATION_ENABLED:
                        FACETS_SCHEMA.validate(_deciphered)
                    else:
                        print("Validation requires pyessv-archive source files. Skipping validation checks.")
                print(f"Deciphered the following from {Path(file).name}:\n{_deciphered.items()}")
                d[file] = _deciphered

        except (AttributeError, NotImplementedError):
            # AttributeError: no decode_* method for this project name.
            print(f"Unable to read data from {Path(file)}. Ensure pathname is correct.")
            raise
        except schema.SchemaError as e:
            print(f"Decoded facets from {Path(file).name} are not valid: {e}")
107

108
    def decode(
        self,
        files: os.PathLike | str | list[str | os.PathLike] | GeneratorType,
        chunks: int | None = None,
        raise_error: bool = False,
    ) -> None:
        """
        Decode facets from file or list of files.

        Parameters
        ----------
        files : str or Path or list of str or Path or generator
            The files to decode.
        chunks : int, optional
            The chunk size used when processing files. Not to be confused with xarray chunks for dimensions.
        raise_error : bool
            Whether to raise an error if a file cannot be decoded.
        """
        if isinstance(files, (str, os.PathLike)):
            files = [files]

        if chunks is None:
            # BUGFIX: an explicit `chunks` was previously ignored for generator
            # inputs (the generator branch unconditionally forced 10).
            if isinstance(files, GeneratorType):
                chunk_size = 10
            elif len(files) >= 10:
                chunk_size = 10
            elif 1 <= len(files) < 10:
                chunk_size = len(files)
            else:
                raise ValueError("No file entries found.")
        else:
            chunk_size = chunks

        if self.project is None:
            warnings.warn("The decoder 'project' is not set; Decoding step will be much slower.", stacklevel=2)
        else:
            msg = f"Deciphering metadata with project = '{self.project}'"
            logger.info(msg)

        with mp.Manager() as manager:
            _file_facets = manager.dict()
            lock = manager.Lock()
            # Bind the per-run constants; only `file` varies across workers.
            func = partial(self._decoder, _file_facets, raise_error, self.project, self.guess, lock)

            with mp.Pool() as pool:
                pool.imap(func, files, chunksize=chunk_size)
                pool.close()
                pool.join()

            self._file_facets.update(_file_facets)
158

159
    def facets_table(self):
        """Return a tabular representation of the decoded facets (not yet implemented)."""
        raise NotImplementedError()
161

162
    def file_facets(self) -> dict[os.PathLike, dict]:
        """
        Return the mapping of file paths to their decoded facets.

        Returns
        -------
        dict[os.PathLike, dict]
            Facets decoded so far by :py:meth:`decode`.
        """
        return self._file_facets
164

165
    @classmethod
    def _from_dataset(cls, file: Path | str) -> (str, str, dict):
        """
        Read the primary variable name, the datetime token from the file name,
        and the dataset's global attributes.

        Parameters
        ----------
        file : Path or str
            Path to a netCDF file or zarr store.

        Returns
        -------
        tuple of (str, str, dict)
            Primary variable name, datetime string, and global attributes.

        Raises
        ------
        DecoderError
            If the dataset cannot be opened, or is neither netCDF nor zarr.
        """
        # BUGFIX: normalize to Path; str inputs previously crashed below on
        # `file.is_file()` / `file.suffix` / `file.name`.
        file = Path(file)
        file_name = file.stem

        try:
            variable_name = cls._decode_primary_variable(file)
        except DecoderError:
            msg = f"Unable to open dataset: {file.name}"
            logger.error(msg)
            raise

        # The trailing underscore-delimited token carries the date range.
        datetimes = file_name.split("_")[-1]

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            with nc.Dataset(file, mode="r") as ds:
                data = {k: getattr(ds, k) for k in ds.ncattrs()}
        elif file.is_dir() and file.suffix == ".zarr":
            with zarr.open(file, mode="r") as ds:
                data = ds.attrs.asdict()
        else:
            raise DecoderError(f"Unable to read dataset: `{file.name}`.")
        return variable_name, datetimes, data
189

190
    @staticmethod
    def _decode_primary_variable(file: Path) -> str:
        """
        Attempts to find the primary variable of a netCDF or zarr dataset.

        The primary variable is taken to be the first variable that is not a
        coordinate-like name and that also appears in the file's stem.

        Parameters
        ----------
        file : Path
            Path to a netCDF file or zarr store.

        Returns
        -------
        str
            The primary variable name.
            NOTE(review): if no variable matches, execution falls off the end
            and returns None despite the `str` annotation — confirm callers
            tolerate this.

        Raises
        ------
        NotImplementedError
            If the file is neither netCDF nor zarr.
        DecoderError
            If the dataset cannot be opened (wraps the underlying ValueError).
        """
        dimsvar_dict = dict()
        # Coordinate-like names excluded from primary-variable candidates.
        coords = (
            "height",
            "lat",
            "latitude",
            "lev",
            "level",
            "lon",
            "longitude",
            "rlat",
            "rlon",
            "rotated_pole",
            "time",
        )
        try:
            if file.is_file() and file.suffix in [".nc", ".nc4"]:
                with nc.Dataset(file, mode="r") as ds:
                    for var_name, var_attrs in ds.variables.items():
                        dimsvar_dict[var_name] = {k: var_attrs.getncattr(k) for k in var_attrs.ncattrs()}
                # str.startswith accepts the whole tuple of coordinate prefixes.
                for k in dimsvar_dict.keys():
                    if not str(k).startswith(coords) and k in file.stem:
                        return str(k)

            elif file.is_dir() and file.suffix == ".zarr":
                with zarr.open(str(file), mode="r") as ds:
                    for k in ds.array_keys():
                        if not str(k).startswith(coords) and k in file.stem:
                            return str(k)
            else:
                msg = "File format is not supported."
                raise NotImplementedError(msg)
        except ValueError as err:
            # netCDF4/zarr raise ValueError on unreadable stores; re-raise as
            # the package's own error type.
            msg = f"Unable to open dataset: {file.name}"
            raise DecoderError(msg) from err
237

238
    @staticmethod
7✔
239
    def _decode_hour_of_day_info(
7✔
240
        file: PathLike | str,
241
    ) -> dict:
242
        """
243
        Decode hour of day information.
244

245
        Parameters
246
        ----------
247
        file : Path or str
248

249
        Returns
250
        -------
251
        dict
252
        """
253
        if isinstance(file, str):
×
254
            file = Path(file)
×
255

256
        if file.is_file() and file.suffix in [".nc", ".nc4"]:
×
257
            with nc.Dataset(file, mode="r") as ds:
×
258
                if "time" in ds.variables.keys():
×
NEW
259
                    hour = nc.num2date(ds["time"][0], ds["time"].units, ds["time"].calendar).hour
×
260
                else:
261
                    hour = None
×
262
            return dict(hour_of_day=hour)
×
263

264
        elif file.is_dir() and file.suffix == ".zarr":
×
NEW
265
            warnings.warn("This is not currently implemented", stacklevel=2)
×
266

267
            # with zarr.open(str(file), mode="r") as ds:
268
            #     if "time" in ds.array_keys():
269
            #         pass
270

271
            return dict()
×
272

273
        else:
274
            raise NotImplementedError()
×
275

276
    @staticmethod
    def _decode_time_info(  # noqa: C901
        file: PathLike | str | list[str] | None = None,
        data: dict | None = None,
        term: str | None = None,
        *,
        field: str | None = None,
    ) -> str | NaTType:
        """
        Decode time information.

        Tries, in order: a directly supplied frequency token (`term`), the
        dataset attributes (`data`), the file name (`file`), then both
        together — falling back to opening the dataset and inspecting its
        time axis when metadata and file name disagree.

        Parameters
        ----------
        file : os.PathLike or str, optional
            File whose name may carry a frequency token (e.g. "day", "mon").
        data : dict, optional
            Dataset global attributes (may carry a "frequency" entry).
        term : str, optional
            Explicit frequency token to translate directly.
        field : {"timedelta", "frequency"}
            Which representation to return.

        Returns
        -------
        str or NaTType
            The mapped frequency or timedelta; ``pd.NaT`` for fixed fields.

        Raises
        ------
        ValueError
            If nothing was supplied, or `field` is unsupported mid-branch.
        NotImplementedError
            If `field` names an unknown representation.
        DecoderError
            If no time frequency can be discerned at all.
        """
        if not file and not data and not term:
            raise ValueError("Nothing passed to parse time info from.")

        # Select the token -> value mapping for the requested representation.
        if field == "frequency":
            time_dictionary = TIME_UNITS_TO_FREQUENCY
        elif field == "timedelta":
            time_dictionary = TIME_UNITS_TO_TIMEDELTA
        else:
            raise NotImplementedError()

        if term:
            if term in ["fx", "fixed"]:
                if field == "timedelta":
                    return pd.NaT
                return "fx"
            # NOTE(review): this wraps the mapped value in pd.to_timedelta even
            # when field == "frequency" — confirm callers only pass `term`
            # together with field="timedelta".
            return pd.to_timedelta(time_dictionary[term])

        if data and not file:
            potential_time = data.get("frequency")
            if not potential_time:
                if hasattr(data, "time"):
                    time_units = data["time"].units
                    potential_time = time_units.split()[0]
                else:
                    # NOTE(review): `file` is None in this branch, so this
                    # message would raise TypeError if ever reached — confirm.
                    msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."

                    logger.warning(msg)
                    potential_time = "fx"
            if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                msg = f"Found `{potential_time}`. Frequency is likely `fx`."
                logger.warning(msg)
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError()

            if field == "timedelta":
                if potential_time in ["fx", "fixed"]:
                    return pd.NaT
                return pd.to_timedelta(time_dictionary[potential_time])
            return time_dictionary[potential_time]

        if file and not data:
            # Try underscore-delimited facets first, then dot-delimited.
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [segment for segment in file_parts if segment in time_dictionary.keys()]
                if potential_times:
                    if potential_times[0] in ["fx", "fixed"]:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")
                    if field == "timedelta":
                        return pd.to_timedelta(time_dictionary[potential_times[0]])
                    return time_dictionary[potential_times[0]]

        if file and data:
            # Cross-check the metadata frequency against tokens in the file name.
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [segment for segment in file_parts if segment in time_dictionary.keys()]
                potential_time = data.get("frequency", "")
                if potential_time == "":
                    if hasattr(data, "time"):
                        time_units = data["time"].units
                        potential_time = time_units.split()[0]
                    else:
                        msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."

                        logger.warning(msg)
                        potential_time = "fx"
                if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                    msg = f"Found `{potential_time}`. Frequency is likely `fx`."

                    logger.warning(msg)
                    if "fx" in file_parts or "fixed" in file_parts:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")

                if potential_time in potential_times:
                    return time_dictionary[potential_time]
                elif potential_times:
                    # File name has a frequency token, but it disagrees with
                    # the metadata: drop to the rigorous dataset check below.
                    break

            msg = (
                f"Frequency from metadata (`{potential_time}`) not found in filename (`{Path(file).name}`): "
                "Performing more rigorous frequency checks."
            )
            logger.warning(msg)
            if Path(file).is_file() and Path(file).suffix in [".nc", ".nc4"]:
                engine = "h5netcdf"
            elif Path(file).is_dir() and Path(file).suffix == ".zarr":
                engine = "zarr"
            else:
                raise DecoderError(f"File is not valid netcdf or zarr: {Path(file).name}")

            _ds = xr.open_dataset(
                file,
                engine=engine,
                drop_variables="time_bnds",
            )
            if not hasattr(_ds, "time"):
                logger.warning("Dataset does not contain time array. Assuming fixed variable.")
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError(f"Field `{field}` not supported.")
            else:
                # Infer the frequency directly from the time axis.
                _, found_freq = check_time_frequency(_ds.time)

            if found_freq in potential_times:
                msg = (
                    "Time frequency found in dataset on analysis was found in filename. "
                    f"Metadata for `{Path(file).name} is probably incorrect. "
                    f"Basing fields on `{found_freq}`."
                )
                logger.warning(msg)
                return time_dictionary[found_freq]
            elif found_freq in ["month", "mon"]:
                # Month-like tokens use several aliases; prefer the file's form.
                for f in ["Amon", "Omon", "monC", "monthly", "months", "mon"]:
                    if f in potential_times:
                        msg = f"Month-like time frequency found in dataset on analysis was found in filename. Basing fields on `{f}`."
                        logger.warning(msg)
                        return time_dictionary[f]
            else:
                msg = f"Time frequency found in dataset on analysis was not found in filename. Basing fields on `{found_freq}`."
                logger.warning(msg)
                return time_dictionary[found_freq]
        raise DecoderError(f"Time frequency indiscernible for file `{file}`.")
432

433
    @staticmethod
7✔
434
    def _decode_version(file: PathLike | str, data: dict) -> dict:
7✔
435
        """
436
        Decode version information.
437

438
        Parameters
439
        ----------
440
        file : os.PathLike or str
441
        data : dict
442

443
        Returns
444
        -------
445
        dict
446
        """
447
        version_info = dict()
×
448
        try:
×
449
            version_info["version"] = data["version"]
×
450
        except KeyError:
×
451
            possible_version = Path(file).parent
×
452
            if re.match(r"^[vV]\d+", possible_version.name):
×
453
                version_info["version"] = possible_version.name
×
454
            else:
NEW
455
                possible_version_signature = possible_version.glob(f"{Path(file).stem}.v*")
×
UNCOV
456
                for sig in possible_version_signature:
×
457
                    found_version = re.match(r"([vV]\d+)$", sig.suffix)
×
458
                    if found_version:
×
459
                        version_info["version"] = found_version.group()
×
460
                        version_info["sha256sum"] = sig.open().read()
×
461
                        break
×
462
                else:
463
                    version_info["version"] = "vNotFound"
×
464
        return version_info
×
465

466
    @classmethod
    def decode_converted(cls, file: PathLike | str) -> dict:
        """
        Decode converted data.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the converted dataset.

        Returns
        -------
        dict
            The decoded facets (empty if the dataset cannot be read).
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(data)
        # Robustness: "history" may be absent from minimal attribute sets;
        # the previous `del` raised KeyError in that case.
        facets.pop("history", None)

        facets["date"] = date

        file_format = data.get("output_format")
        if file_format:
            facets["format"] = file_format
        elif "format" in data:
            facets["format"] = data["format"]
        elif Path(file).suffix in [".nc", ".nc4"]:
            facets["format"] = "nc"
        elif Path(file).suffix in [".zarr"]:
            facets["format"] = "zarr"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            if "frequency" not in facets:
                # BUGFIX: this previously assigned the decoded frequency to
                # "timedelta", leaving "frequency" unset and raising an
                # uncaught KeyError on the next line.
                facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
514

515
    @staticmethod
7✔
516
    def decode_eccc_obs(self, file: PathLike | str) -> dict:
7✔
517
        raise NotImplementedError()
×
518

519
    @staticmethod
7✔
520
    def decode_ahccd_obs(self, file: PathLike | str) -> dict:
7✔
521
        raise NotImplementedError()
×
522

523
    @staticmethod
7✔
524
    def decode_melcc_obs(self, file: PathLike | str) -> dict:
7✔
525
        raise NotImplementedError()
×
526

527
    @classmethod
    def decode_pcic_candcs_u6(cls, file: PathLike | str) -> dict:
        """
        Decode facets from a PCIC CanDCS-U6 bias-adjusted dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the netCDF dataset.

        Returns
        -------
        dict
            The decoded facets (empty if the dataset cannot be read).

        Raises
        ------
        NotImplementedError
            If the file lives under a "Derived" directory.
        """
        # BUGFIX: `Path(file).parents` yields Path objects, so the original
        # string membership test could never match; check path components.
        if "Derived" in Path(file).parts:
            raise NotImplementedError("Derived CanDCS-U6 variables are not supported.")

        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets["activity"] = data["activity_id"]
        facets["mip_era"] = data["project_id"]
        facets["bias_adjust_institution"] = "PCIC"
        facets["date"] = date
        facets["domain"] = data["domain"]
        facets["experiment"] = str(data["GCM__experiment_id"]).replace(",", "-")
        facets["format"] = "netcdf"
        facets["institution"] = data["GCM__institution_id"]
        # Assemble the CMIP variant label from its component indices.
        facets["member"] = (
            f"r{data['GCM__realization_index']}i{data['GCM__initialization_index']}p{data['GCM__physics_index']}f{data['GCM__forcing_index']}"
        )
        facets["processing_level"] = "biasadjusted"
        facets["bias_adjust_project"] = "CanDCS-U6"
        facets["source"] = data["GCM__source_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        data_specs_version = data.get("GCM__data_specs_version")
        if data_specs_version is not None:
            facets["version"] = f"v{data_specs_version}"
        else:
            # BUGFIX: the original tested `f"v{None}"` ("vNone") against None,
            # so this fallback hash lookup was unreachable.
            facets.update(find_version_hash(file=file))

        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
570

571
    @classmethod
    def decode_cmip6(cls, file: PathLike | str) -> dict:
        """
        Decode facets from a CMIP6 dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the netCDF dataset.

        Returns
        -------
        dict
            The decoded facets (empty if the dataset cannot be read).
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # Facets mapping directly from CMIP6 global attributes.
        facets = {
            "activity": data["activity_id"],
            "date": date,
            "domain": "global",
            "experiment": data["experiment_id"],
            "format": "netcdf",
            "grid_label": data["grid_label"],
            "institution": data["institution_id"],
            "member": data["variant_label"],
            "modeling_realm": data["realm"],
            "processing_level": "raw",
            "mip_era": data["mip_era"],
            "source": data["source_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
605

606
    @classmethod
    def decode_cmip5(cls, file: PathLike | str) -> dict:
        """
        Decode facets from a CMIP5 dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the netCDF dataset.

        Returns
        -------
        dict
            The decoded facets (empty if the dataset cannot be read).
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # Facets mapping directly from CMIP5 global attributes.
        facets = {
            "activity": "CMIP",
            "date": date,
            "domain": "global",
            "experiment": data["experiment_id"],
            "format": "netcdf",
            "institution": data["institute_id"],
            "member": data["parent_experiment_rip"],
            "modeling_realm": data["modeling_realm"],
            "processing_level": "raw",
            "mip_era": data["project_id"],
            "source": data["model_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
639

640
    @classmethod
    def decode_cordex(cls, file: PathLike | str) -> dict:
        """
        Decode facets from a CORDEX dataset.

        Metadata in CORDEX holdings is inconsistent, so domain, driving model,
        experiment, and member are resolved through a chain of fallbacks.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the netCDF dataset.

        Returns
        -------
        dict
            The decoded facets (empty if the dataset cannot be read).

        Raises
        ------
        NotImplementedError
            If no usable domain name can be found in the metadata.
        DecoderError
            If validation is enabled and the driving institution is invalid.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # FIXME: What to do about our internal data that breaks all established conventions?
        facets["activity"] = "CORDEX"

        if data.get("project_id") == "" or data.get("project_id") is None:
            facets["mip_era"] = "internal"
        elif data.get("project_id") == "CORDEX":
            facets["mip_era"] = "CMIP5"

        # Fixed-field files carry the member label where the date would be.
        if date == "r0i0p0":
            facets["date"] = "fx"
        else:
            facets["date"] = date

        # Domain resolution: CORDEX attribute, then Ouranos-internal attribute.
        domain = data.get("CORDEX_domain")
        if domain:
            facets["domain"] = domain.strip()
        else:
            domain = data.get("ouranos_domain_name")
            if domain:
                facets["domain"] = domain.strip()
            else:
                msg = f"File {Path(file).name} has a nonstandard domain name."
                logger.error(msg)
                raise NotImplementedError(msg)

        # CORDEX-NAM on AWS mis-attributes the domain (22/44 should be 22i/44i)
        aws_keys = data.get("intake_esm_dataset_key")
        if aws_keys:
            facets["domain"] = aws_keys.split(".")[3]

        # A regridded domain in the title (e.g. "NAM-22i") overrides everything.
        title = data.get("title")
        if title:
            regridded_domain_found = re.search(r"\w{3}-\d{2}i", title)
            if regridded_domain_found:
                facets["domain"] = regridded_domain_found.group()

        # The logic here is awful, but the information is bad to begin with.
        driving_model = ""
        driving_institution = ""

        # Institutions may span one to three hyphen-separated components.
        driving_institution_parts = str(data["driving_model_id"]).split("-")
        if VALIDATION_ENABLED:
            if driving_institution_parts[0] in INSTITUTIONS:
                driving_institution = driving_institution_parts[0]
            elif "-".join(driving_institution_parts[:2]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:2])
            elif "-".join(driving_institution_parts[:3]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:3])
        else:
            logger.warning("CORDEX Metadata validation checks require PyESSV. Driving institution cannot be determined.")
            driving_model = data["driving_model_id"]

        # Known GCM prefixes missing their institution component.
        if data["driving_model_id"].startswith("GFDL"):
            driving_institution = "NOAA-GFDL"
            driving_model = f"NOAA-GFDL-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("MPI-ESM"):
            driving_institution = "MPI-M"
            driving_model = f"MPI-M-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("HadGEM2"):
            driving_institution = "MOHC"
            driving_model = f"MOHC-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("CNRM-CM5"):
            driving_institution = "CNRM-CERFACS"
            driving_model = f"CNRM-CERFACS-{data['driving_model_id']}"

        elif VALIDATION_ENABLED and not driving_institution:
            raise DecoderError(f"driving_institution (from driving_model_id: `{data['driving_model_id']}`) is not valid.")

        facets["driving_institution"] = driving_institution.strip()
        if driving_model:
            facets["driving_model"] = driving_model.strip()
        else:
            facets["driving_model"] = str(data["driving_model_id"]).strip()

        facets["format"] = "netcdf"

        # Ouranos internal files abbreviate the institution.
        if data["institute_id"].strip() == "Our.":
            facets["institution"] = "Ouranos"
        else:
            facets["institution"] = data["institute_id"].strip()

        facets["processing_level"] = "raw"
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        try:
            facets["experiment"] = data["experiment_id"].strip()
        except KeyError:
            facets["experiment"] = data["driving_experiment_name"].strip()

        # Member: first usable of parent_experiment_rip / parent_experiment,
        # falling back to the driving model ensemble member.
        try:
            for potential_member in ["parent_experiment_rip", "parent_experiment"]:
                facets["member"] = data.get(potential_member)
                if facets["member"] == "N/A":
                    raise KeyError()
                else:
                    break
            if facets["member"] is None:
                raise KeyError()
        except KeyError:
            facets["member"] = data["driving_model_ensemble_member"].strip()

        return facets
763

764
    @classmethod
    def decode_isimip_ft(cls, file: PathLike | str) -> dict:
        """Decode facets for an ISIMIP-FT dataset from its global attributes.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to decode.

        Returns
        -------
        dict
            The decoded facets. Empty if the dataset attributes cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/unrecognized dataset: signal "no facets" rather than raise.
            return facets

        facets["activity"] = "ISIMIP"
        facets["mip_era"] = data["project_id"]
        facets["date"] = date
        facets["domain"] = "global"
        facets["co2_forcing_id"] = data["co2_forcing_id"]
        facets["experiment"] = data["experiment_id"]
        facets["format"] = "netcdf"
        facets["impact_model"] = data["impact_model_id"]
        facets["institution"] = data["institute_id"]
        facets["member"] = data["driving_model_ensemble_member"]
        facets["modeling_realm"] = data["modeling_realm"]
        facets["social_forcing_id"] = data["social_forcing_id"]
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            # Pass `file` as well (consistent with the other decoders) so the
            # frequency can fall back to filename parsing when the attribute is absent.
            facets["frequency"] = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["timedelta"] = cls._decode_time_info(term=facets["frequency"], field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time information is optional; leave those facets unset on failure.
            pass

        return facets
×
800

801
    @classmethod
    def decode_nex_gddp_cmip6(cls, file: PathLike | str) -> dict:
        """Decode facets for a NEX-GDDP-CMIP6 bias-adjusted dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to decode.

        Returns
        -------
        dict
            The decoded facets. Empty if the dataset attributes cannot be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/unrecognized dataset: return no facets instead of raising.
            return {}

        experiment = data["scenario"]
        facets = {
            "experiment": experiment,
            # Historical runs belong to the CMIP activity; projections to ScenarioMIP.
            "activity": "CMIP" if experiment == "historical" else "ScenarioMIP",
            "institution": data["cmip6_institution_id"],
            "member": data["variant_label"],
            "processing_level": "biasadjusted",
            "bias_adjust_project": "NEX-GDDP-CMIP6",
            "bias_adjust_institution": "NASA",
            "mip_era": "CMIP6",
            "source": data["cmip6_source_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time information is optional; leave those facets unset on failure.
            pass

        return facets
×
832

833
    @classmethod
    def decode_espo_g6_r2(cls, file: PathLike | str) -> dict:
        """Decode facets for an ESPO-G6-R2 bias-adjusted dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to decode.

        Returns
        -------
        dict
            The decoded facets. Empty if the dataset attributes cannot be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/unrecognized dataset: return no facets instead of raising.
            return {}

        facets = {
            "bias_adjust_project": "ESPO-G6-R2",
            "processing_level": "biasadjusted",
            "version": "1.0.0",
            "domain": "NAM",
        }
        # ESPO-G6 datasets store their catalogue metadata under "cat:"-prefixed attributes.
        catalogue_keys = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        facets.update({key: data[f"cat:{key}"] for key in catalogue_keys})
        facets["variable"] = variable
        # NOTE: the version facet is pinned above instead of derived via cls._decode_version.
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time information is optional; leave those facets unset on failure.
            pass

        return facets
×
869

870
    @classmethod
    def decode_espo_g6_e5l(cls, file: PathLike | str) -> dict:
        """Decode facets for an ESPO-G6-E5L bias-adjusted dataset.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset to decode.

        Returns
        -------
        dict
            The decoded facets. Empty if the dataset attributes cannot be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable/unrecognized dataset: return no facets instead of raising.
            return {}

        facets = {
            "bias_adjust_project": "ESPO-G6-E5L",
            "processing_level": "biasadjusted",
            "version": "1.0.0",
            "domain": "NAM",
        }
        # ESPO-G6 datasets store their catalogue metadata under "cat:"-prefixed attributes.
        catalogue_keys = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        facets.update({key: data[f"cat:{key}"] for key in catalogue_keys})
        facets["variable"] = variable
        # NOTE: the version facet is pinned above instead of derived via cls._decode_version.
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(term=frequency, field="timedelta")
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time information is optional; leave those facets unset on failure.
            pass

        return facets
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc