• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ouranosinc / miranda / 17135863835

21 Aug 2025 06:38PM UTC coverage: 18.726% (-0.2%) from 18.928%
17135863835

push

github

web-flow
No longer initialize logging within library (#264)

### What kind of change does this PR introduce?

* Adjusts logging so that submodules do not initialize the logging
configuration.
* Sets all modules using logging to log relative to the `miranda`
library (never `root`).
* Adds a missing function needed for a submodule that was removed in an
earlier PR.

### Does this PR introduce a breaking change?

Yes. If you rely on the logging messages, you need to initialize the
logger in your scripts as follows:
```python
import logging.config

from miranda.scripting import LOGGING_CONFIG

logging.config.dictConfig(LOGGING_CONFIG)
```

There might be a much easier way to get this up and running, but with
these changes, we no longer need to worry about submodules setting this
accidentally.

### Other information:

A function (`group_by_length`) was entirely missing. This wasn't
raising errors, as the module that relied on it wasn't being loaded on
import.

48 of 231 new or added lines in 28 files covered. (20.78%)

3 existing lines in 3 files now uncovered.

1199 of 6403 relevant lines covered (18.73%)

1.3 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

11.64
/src/miranda/decode/_decoder.py
1
from __future__ import annotations
7✔
2

3
import logging
7✔
4
import multiprocessing as mp
7✔
5
import os
7✔
6
import re
7✔
7
import warnings
7✔
8
from functools import partial
7✔
9
from os import PathLike
7✔
10
from pathlib import Path
7✔
11
from types import GeneratorType
7✔
12

13
import netCDF4 as nc  # noqa
7✔
14
import pandas as pd
7✔
15
import schema
7✔
16
import xarray as xr
7✔
17
import zarr
7✔
18
from pandas._libs.tslibs import NaTType  # noqa
7✔
19

20
from miranda.convert.utils import date_parser, find_version_hash  # noqa
7✔
21
from miranda.cv import VALIDATION_ENABLED
7✔
22
from miranda.scripting import LOGGING_CONFIG
7✔
23
from miranda.units import check_time_frequency
7✔
24

25
from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError
7✔
26

27
if VALIDATION_ENABLED:
7✔
28
    from miranda.cv import INSTITUTIONS, PROJECT_MODELS
7✔
29
    from miranda.validate import FACETS_SCHEMA  # noqa
7✔
30

31

32
# Module-level logger, namespaced under the `miranda` package so that the
# application — not the library — controls handlers and levels.
logger = logging.getLogger("miranda.decode.decoder")

# Public API of this module.
__all__ = [
    "Decoder",
    "guess_project",
]
38

39

40
def guess_project(file: os.PathLike | str) -> str:
    """Guess the name of the project from a file name.

    Parameters
    ----------
    file : str or os.PathLike
        Path whose stem is scanned for known model names.

    Returns
    -------
    str
        The first project whose model list matches a token of the file name.

    Raises
    ------
    DecoderError
        If no project can be determined, or if the pyessv-archive controlled
        vocabularies are unavailable (``VALIDATION_ENABLED`` is False).
    """
    file_name = Path(file).stem

    potential_names = file_name.split("_")
    if VALIDATION_ENABLED:
        for project, models in PROJECT_MODELS.items():
            # Generator form: no intermediate list, short-circuits on match.
            if any(model in potential_names for model in models):
                return project
        raise DecoderError(
            f"Unable to determine project from file name: '{file_name}'."
        )
    raise DecoderError("Project determination requires pyessv-archive source files.")
×
62

63

64
class Decoder:
    """Decode dataset facets from file names and dataset metadata."""

    # Project name used to select the matching `decode_*` method (None = unknown).
    project = None
    # Whether to attempt guessing the project from file names when unset.
    guess = False
    # Mapping of file path -> decoded facet dictionary, populated by `decode`.
    _file_facets = dict()
7✔
68

69
    def __init__(self, project: str | None):
        """Initialize the decoder.

        Parameters
        ----------
        project : str or None
            The project name; when None, decoding may guess it per file.
        """
        self.project = project
×
71

72
    @staticmethod
    def _decoder(
        d: dict,
        fail_early: bool,
        proj: str,
        guess: bool,
        lock: mp.Lock,
        file: str | Path,
    ) -> None:
        """Decode facets for a single file into a shared dictionary.

        Worker function used by `decode` through `multiprocessing`; the
        leading parameters are pre-bound with `functools.partial` so the
        pool only maps over `file`.

        Parameters
        ----------
        d : dict
            Shared (manager) dictionary receiving ``{file: facets}``.
        fail_early : bool
            If True, re-raise project-guessing failures and validate the
            decoded facets against the schema.
        proj : str
            Project name; when None it is guessed or defaults to "converted".
        guess : bool
            Whether to attempt guessing the project from the file name.
        lock : mp.Lock
            Lock serializing the decode step across worker processes.
        file : str or Path
            The file to decode.
        """
        if proj is None:
            if guess:
                try:
                    proj = guess_project(file)
                except DecoderError:
                    print(
                        "Unable to determine 'activity': Signature for 'activity' must be set manually for file: "
                        f"{file}."
                    )
                    if fail_early:
                        raise
            else:
                # No project given and guessing disabled: assume converted data.
                proj = "converted"

        # Dispatch to e.g. `decode_cmip6` based on the normalized project name.
        decode_function_name = f"decode_{proj.lower().replace('-', '_')}"
        try:
            with lock:
                _deciphered = getattr(Decoder, decode_function_name)(Path(file))
                if fail_early:
                    if VALIDATION_ENABLED:
                        FACETS_SCHEMA.validate(_deciphered)
                    else:
                        print(
                            "Validation requires pyessv-archive source files. Skipping validation checks."
                        )
                print(
                    f"Deciphered the following from {Path(file).name}:\n"
                    f"{_deciphered.items()}"
                )
                d[file] = _deciphered

        # AttributeError: no `decode_<proj>` method exists on Decoder.
        except (AttributeError, NotImplementedError):
            print(f"Unable to read data from {Path(file)}. Ensure pathname is correct.")
            raise
        except schema.SchemaError as e:
            print(f"Decoded facets from {Path(file).name} are not valid: {e}")
×
117

118
    def decode(
        self,
        files: os.PathLike | str | list[str | os.PathLike] | GeneratorType,
        chunks: int | None = None,
        raise_error: bool = False,
    ) -> None:
        """Decode facets from file or list of files.

        Results are accumulated in ``self._file_facets``.

        Parameters
        ----------
        files : str or Path or list of str or Path or generator
            The files to decode.
        chunks : int, optional
            The chunk size used when processing files. Not to be confused with xarray chunks for dimensions.
        raise_error : bool
            Whether to raise an error if a file cannot be decoded.

        Raises
        ------
        ValueError
            If an empty list of files is given.
        """
        if isinstance(files, (str, os.PathLike)):
            files = [files]

        # BUG FIX: an explicitly requested chunk size now always wins.
        # Previously it was silently ignored when `files` was a generator.
        if chunks is not None:
            chunk_size = chunks
        elif isinstance(files, list):
            if len(files) >= 10:
                chunk_size = 10
            elif 1 <= len(files) < 10:
                chunk_size = len(files)
            else:
                raise ValueError("No file entries found.")
        else:
            # Generators have unknown length; use a reasonable default.
            chunk_size = 10

        if self.project is None:
            warnings.warn(
                "The decoder 'project' is not set; Decoding step will be much slower."
            )
        else:
            msg = f"Deciphering metadata with project = '{self.project}'"
            logger.info(msg)

        with mp.Manager() as manager:
            _file_facets = manager.dict()
            lock = manager.Lock()
            # Pre-bind everything except `file`, which the pool maps over.
            func = partial(
                self._decoder, _file_facets, raise_error, self.project, self.guess, lock
            )

            with mp.Pool() as pool:
                pool.imap(func, files, chunksize=chunk_size)
                pool.close()
                pool.join()

            self._file_facets.update(_file_facets)
×
171

172
    def facets_table(self):
        """Build a table of the decoded facets (not yet implemented)."""
        raise NotImplementedError()
×
174

175
    def file_facets(self) -> dict[os.PathLike, dict]:
7✔
176
        return self._file_facets
×
177

178
    @classmethod
    def _from_dataset(cls, file: Path | str) -> (str, str, dict):
        """Open a dataset and extract its primary variable, date string, and attributes.

        Parameters
        ----------
        file : Path or str
            Path to a netCDF file or zarr store.

        Returns
        -------
        tuple of (str, str, dict)
            The primary variable name, the date segment of the file name,
            and the dataset's global attributes.

        Raises
        ------
        DecoderError
            If the dataset cannot be opened or read.
        """
        # BUG FIX: normalize to Path up front. Previously only `stem` went
        # through Path(), so a plain str argument crashed on `.is_file()`,
        # `.suffix`, and `.name` below.
        file = Path(file)
        file_name = file.stem

        try:
            variable_name = cls._decode_primary_variable(file)
        except DecoderError:
            msg = f"Unable to open dataset: {file.name}"
            logger.error(msg)
            raise

        # Files are conventionally named `..._<daterange>`; the last
        # underscore-separated token carries the date information.
        datetimes = file_name.split("_")[-1]

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            with nc.Dataset(file, mode="r") as ds:
                data = dict()
                for k in ds.ncattrs():
                    data[k] = getattr(ds, k)
        elif file.is_dir() and file.suffix == ".zarr":
            with zarr.open(file, mode="r") as ds:
                data = ds.attrs.asdict()
        else:
            raise DecoderError(f"Unable to read dataset: `{file.name}`.")
        return variable_name, datetimes, data
×
202

203
    @staticmethod
    def _decode_primary_variable(file: Path) -> str:
        """Attempts to find the primary variable of a netCDF

        The primary variable is taken to be the first non-coordinate
        variable whose name also appears in the file's stem.

        Parameters
        ----------
        file: Path
            Path to a netCDF file or zarr store.

        Returns
        -------
        str
            The primary variable name.

        Raises
        ------
        DecoderError
            If the dataset cannot be opened (wraps ValueError).
        NotImplementedError
            If the path is neither a netCDF file nor a zarr store.
        """
        dimsvar_dict = dict()
        # Name prefixes treated as coordinate/ancillary variables — never
        # the primary variable.
        coords = (
            "height",
            "lat",
            "latitude",
            "lev",
            "level",
            "lon",
            "longitude",
            "rlat",
            "rlon",
            "rotated_pole",
            "time",
        )
        try:
            if file.is_file() and file.suffix in [".nc", ".nc4"]:
                with nc.Dataset(file, mode="r") as ds:
                    for var_name, var_attrs in ds.variables.items():
                        dimsvar_dict[var_name] = {
                            k: var_attrs.getncattr(k) for k in var_attrs.ncattrs()
                        }
                for k in dimsvar_dict.keys():
                    if not str(k).startswith(coords) and k in file.stem:
                        return str(k)

            elif file.is_dir() and file.suffix == ".zarr":
                with zarr.open(str(file), mode="r") as ds:
                    for k in ds.array_keys():
                        if not str(k).startswith(coords) and k in file.stem:
                            return str(k)
            else:
                raise NotImplementedError()
        except ValueError:
            raise DecoderError()
        # NOTE(review): when no candidate variable matches, this falls
        # through and implicitly returns None despite the `str` annotation —
        # confirm that callers tolerate a None result.
×
249

250
    @staticmethod
7✔
251
    def _decode_hour_of_day_info(
7✔
252
        file: PathLike | str,
253
    ) -> dict:
254
        """Decode hour of day information.
255

256
        Parameters
257
        ----------
258
        file : Path or str
259

260
        Returns
261
        -------
262
        dict
263
        """
264
        if isinstance(file, str):
×
265
            file = Path(file)
×
266

267
        if file.is_file() and file.suffix in [".nc", ".nc4"]:
×
268
            with nc.Dataset(file, mode="r") as ds:
×
269
                if "time" in ds.variables.keys():
×
270
                    hour = nc.num2date(
×
271
                        ds["time"][0], ds["time"].units, ds["time"].calendar
272
                    ).hour
273
                else:
274
                    hour = None
×
275
            return dict(hour_of_day=hour)
×
276

277
        elif file.is_dir() and file.suffix == ".zarr":
×
278
            warnings.warn("This is not currently implemented")
×
279

280
            # with zarr.open(str(file), mode="r") as ds:
281
            #     if "time" in ds.array_keys():
282
            #         pass
283

284
            return dict()
×
285

286
        else:
287
            raise NotImplementedError()
×
288

289
    @staticmethod
    def _decode_time_info(  # noqa: C901
        file: PathLike | str | list[str] | None = None,
        data: dict | None = None,
        term: str | None = None,
        *,
        field: str | None = None,
    ) -> str | NaTType:
        """Decode time information.

        Parameters
        ----------
        file : os.PathLike or str, optional
            File whose name may carry a frequency token (e.g. "day", "mon").
        data : dict, optional
            Dataset attributes that may carry a "frequency" entry.
        term : str, optional
            An explicit frequency term; takes precedence over file/data.
        field : {"timedelta", "frequency"}
            Which representation to return.

        Returns
        -------
        str or NaTType
            The frequency name, or the equivalent timedelta (NaT for fixed
            fields).

        Raises
        ------
        ValueError
            If nothing is passed, or a fixed frequency is found with an
            unsupported `field`.
        NotImplementedError
            If `field` is not one of "timedelta" or "frequency".
        DecoderError
            If the file is neither netCDF nor zarr, or no frequency can be
            discerned.
        """
        if not file and not data and not term:
            raise ValueError("Nothing passed to parse time info from.")

        if field == "frequency":
            time_dictionary = TIME_UNITS_TO_FREQUENCY
        elif field == "timedelta":
            time_dictionary = TIME_UNITS_TO_TIMEDELTA
        else:
            raise NotImplementedError()

        # An explicit term short-circuits all file/metadata inspection.
        if term:
            if term in ["fx", "fixed"]:
                if field == "timedelta":
                    return pd.NaT
                return "fx"
            return pd.to_timedelta(time_dictionary[term])

        if data and not file:
            potential_time = data.get("frequency")
            if not potential_time:
                if hasattr(data, "time"):
                    time_units = data["time"].units
                    potential_time = time_units.split()[0]
                else:
                    # BUG FIX: `file` is always falsy in this branch, so the
                    # previous message built `Path(file).name` here and raised
                    # TypeError before the warning could be logged.
                    msg = "Could not find `frequency` or `time` in metadata. Assuming `fx`."

                    logger.warning(msg)
                    potential_time = "fx"
            if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                msg = f"Found `{potential_time}`. Frequency is likely `fx`."
                logger.warning(msg)
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError()

            if field == "timedelta":
                if potential_time in ["fx", "fixed"]:
                    return pd.NaT
                return pd.to_timedelta(time_dictionary[potential_time])
            return time_dictionary[potential_time]

        if file and not data:
            # Look for a known frequency token among the file-name segments.
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [
                    segment
                    for segment in file_parts
                    if segment in time_dictionary.keys()
                ]
                if potential_times:
                    if potential_times[0] in ["fx", "fixed"]:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")
                    if field == "timedelta":
                        return pd.to_timedelta(time_dictionary[potential_times[0]])
                    return time_dictionary[potential_times[0]]

        if file and data:
            # Cross-check the metadata frequency against the file name.
            for delimiter in ["_", "."]:
                file_parts = Path(file).stem.split(delimiter)
                potential_times = [
                    segment
                    for segment in file_parts
                    if segment in time_dictionary.keys()
                ]
                potential_time = data.get("frequency", "")
                if potential_time == "":
                    if hasattr(data, "time"):
                        time_units = data["time"].units
                        potential_time = time_units.split()[0]
                    else:
                        msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."

                        logger.warning(msg)
                        potential_time = "fx"
                if potential_time in ["ymon", "yseas", "fixed", "fx"]:
                    msg = f"Found `{potential_time}`. Frequency is likely `fx`."

                    logger.warning(msg)
                    if "fx" in file_parts or "fixed" in file_parts:
                        if field == "frequency":
                            return "fx"
                        if field == "timedelta":
                            return pd.NaT
                        raise ValueError(f"Field `{field}` not supported.")

                if potential_time in potential_times:
                    return time_dictionary[potential_time]
                elif potential_times:
                    break

            # Metadata and filename disagree: inspect the time axis itself.
            msg = (
                f"Frequency from metadata (`{potential_time}`) not found in filename (`{Path(file).name}`): "
                "Performing more rigorous frequency checks."
            )
            logger.warning(msg)
            if Path(file).is_file() and Path(file).suffix in [".nc", ".nc4"]:
                engine = "h5netcdf"
            elif Path(file).is_dir() and Path(file).suffix == ".zarr":
                engine = "zarr"
            else:
                raise DecoderError(
                    f"File is not valid netcdf or zarr: {Path(file).name}"
                )

            _ds = xr.open_dataset(
                file,
                engine=engine,
                drop_variables="time_bnds",
            )
            if not hasattr(_ds, "time"):
                logger.warning(
                    "Dataset does not contain time array. Assuming fixed variable."
                )
                if field == "frequency":
                    return "fx"
                if field == "timedelta":
                    return pd.NaT
                raise ValueError(f"Field `{field}` not supported.")
            else:
                _, found_freq = check_time_frequency(_ds.time)

            if found_freq in potential_times:
                msg = (
                    "Time frequency found in dataset on analysis was found in filename. "
                    f"Metadata for `{Path(file).name} is probably incorrect. "
                    f"Basing fields on `{found_freq}`."
                )
                logger.warning(msg)
                return time_dictionary[found_freq]
            elif found_freq in ["month", "mon"]:
                # Month-like frequencies come in several spellings; prefer the
                # one actually present in the file name.
                for f in ["Amon", "Omon", "monC", "monthly", "months", "mon"]:
                    if f in potential_times:
                        msg = f"Month-like time frequency found in dataset on analysis was found in filename. Basing fields on `{f}`."
                        logger.warning(msg)
                        return time_dictionary[f]
            else:
                msg = (
                    "Time frequency found in dataset on analysis was not found in filename. "
                    f"Basing fields on `{found_freq}`."
                )
                logger.warning(msg)
                return time_dictionary[found_freq]
        raise DecoderError(f"Time frequency indiscernible for file `{file}`.")
×
459

460
    @staticmethod
7✔
461
    def _decode_version(file: PathLike | str, data: dict) -> dict:
7✔
462
        """Decode version information.
463

464
        Parameters
465
        ----------
466
        file : os.PathLike or str
467
        data : dict
468

469
        Returns
470
        -------
471
        dict
472
        """
473
        version_info = dict()
×
474
        try:
×
475
            version_info["version"] = data["version"]
×
476
        except KeyError:
×
477
            possible_version = Path(file).parent
×
478
            if re.match(r"^[vV]\d+", possible_version.name):
×
479
                version_info["version"] = possible_version.name
×
480
            else:
481
                possible_version_signature = possible_version.glob(
×
482
                    f"{Path(file).stem}.v*"
483
                )
484
                for sig in possible_version_signature:
×
485
                    found_version = re.match(r"([vV]\d+)$", sig.suffix)
×
486
                    if found_version:
×
487
                        version_info["version"] = found_version.group()
×
488
                        version_info["sha256sum"] = sig.open().read()
×
489
                        break
×
490
                else:
491
                    version_info["version"] = "vNotFound"
×
492
        return version_info
×
493

494
    @classmethod
    def decode_converted(cls, file: PathLike | str) -> dict:
        """Decode converted data.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(data)
        # The provenance history is not a facet; drop it when present.
        # BUG FIX: `del facets["history"]` raised KeyError for datasets
        # without a history attribute.
        facets.pop("history", None)

        facets["date"] = date

        file_format = data.get("output_format")
        if file_format:
            facets["format"] = file_format
        elif "format" in data:
            facets["format"] = data["format"]
        elif Path(file).suffix in [".nc", ".nc4"]:
            facets["format"] = "nc"
        elif Path(file).suffix in [".zarr"]:
            facets["format"] = "zarr"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            if "frequency" not in facets:
                # BUG FIX: the frequency lookup was stored under "timedelta",
                # leaving "frequency" unset and breaking the next statement
                # with an uncaught KeyError.
                facets["frequency"] = cls._decode_time_info(
                    data=data, file=file, field="frequency"
                )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
×
545

546
    @staticmethod
7✔
547
    def decode_eccc_obs(self, file: PathLike | str) -> dict:
7✔
548
        raise NotImplementedError()
×
549

550
    @staticmethod
7✔
551
    def decode_ahccd_obs(self, file: PathLike | str) -> dict:
7✔
552
        raise NotImplementedError()
×
553

554
    @staticmethod
7✔
555
    def decode_melcc_obs(self, file: PathLike | str) -> dict:
7✔
556
        raise NotImplementedError()
×
557

558
    @classmethod
7✔
559
    def decode_pcic_candcs_u6(cls, file: PathLike | str) -> dict:
7✔
560
        if "Derived" in Path(file).parents:
×
561
            raise NotImplementedError("Derived CanDCS-U6 variables are not supported.")
×
562

563
        facets = dict()
×
564
        try:
×
565
            variable, date, data = cls._from_dataset(file=file)
×
566
        except DecoderError:
×
567
            return facets
×
568

569
        facets["activity"] = data["activity_id"]
×
570
        facets["mip_era"] = data["project_id"]
×
571
        facets["bias_adjust_institution"] = "PCIC"
×
572
        facets["date"] = date
×
573
        facets["domain"] = data["domain"]
×
574
        facets["experiment"] = str(data["GCM__experiment_id"]).replace(",", "-")
×
575
        facets["format"] = "netcdf"
×
576
        facets["institution"] = data["GCM__institution_id"]
×
577
        facets["member"] = (
×
578
            f"r{data['GCM__realization_index']}"
579
            f"i{data['GCM__initialization_index']}"
580
            f"p{data['GCM__physics_index']}"
581
            f"f{data['GCM__forcing_index']}"
582
        )
583
        facets["processing_level"] = "biasadjusted"
×
584
        facets["bias_adjust_project"] = "CanDCS-U6"
×
585
        facets["source"] = data["GCM__source_id"]
×
586
        facets["type"] = "simulation"
×
587
        facets["variable"] = variable
×
588

589
        facets["version"] = f"v{data.get('GCM__data_specs_version')}"
×
590
        if facets["version"] is None:
×
591
            facets.update(find_version_hash(file=file))
×
592

593
        facets.update(cls._decode_hour_of_day_info(file=file))
×
594

595
        try:
×
596
            facets["frequency"] = cls._decode_time_info(
×
597
                data=data, file=file, field="frequency"
598
            )
599
            facets["timedelta"] = cls._decode_time_info(
×
600
                term=facets["frequency"], field="timedelta"
601
            )
602
            facets["date_start"] = date_parser(date)
×
603
            facets["date_end"] = date_parser(date, end_of_period=True)
×
604
        except DecoderError:  # noqa: S110
×
605
            pass
×
606

607
        return facets
×
608

609
    @classmethod
    def decode_cmip6(cls, file: PathLike | str) -> dict:
        """Decode facets from a CMIP6 dataset.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(
            {
                "activity": data["activity_id"],
                "date": date,
                "domain": "global",
                "experiment": data["experiment_id"],
                "format": "netcdf",
                "grid_label": data["grid_label"],
                "institution": data["institution_id"],
                "member": data["variant_label"],
                "modeling_realm": data["realm"],
                "processing_level": "raw",
                "mip_era": data["mip_era"],
                "source": data["source_id"],
                "type": "simulation",
                "variable": variable,
            }
        )
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
×
647

648
    @classmethod
    def decode_cmip5(cls, file: PathLike | str) -> dict:
        """Decode facets from a CMIP5 dataset.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            The decoded facets; empty if the dataset could not be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(
            {
                "activity": "CMIP",
                "date": date,
                "domain": "global",
                "experiment": data["experiment_id"],
                "format": "netcdf",
                "institution": data["institute_id"],
                "member": data["parent_experiment_rip"],
                "modeling_realm": data["modeling_realm"],
                "processing_level": "raw",
                "mip_era": data["project_id"],
                "source": data["model_id"],
                "type": "simulation",
                "variable": variable,
            }
        )
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
×
685

686
    @classmethod
7✔
687
    def decode_cordex(cls, file: PathLike | str) -> dict:
7✔
688
        facets = dict()
×
689
        try:
×
690
            variable, date, data = cls._from_dataset(file=file)
×
691
        except DecoderError:
×
692
            return dict()
×
693

694
        # FIXME: What to do about our internal data that breaks all established conventions?
695
        facets["activity"] = "CORDEX"
×
696

697
        if data.get("project_id") == "" or data.get("project_id") is None:
×
698
            facets["mip_era"] = "internal"
×
699
        elif data.get("project_id") == "CORDEX":
×
700
            facets["mip_era"] = "CMIP5"
×
701

702
        if date == "r0i0p0":
×
703
            facets["date"] = "fx"
×
704
        else:
705
            facets["date"] = date
×
706

707
        domain = data.get("CORDEX_domain")
×
708
        if domain:
×
709
            facets["domain"] = domain.strip()
×
710
        else:
711
            domain = data.get("ouranos_domain_name")
×
712
            if domain:
×
713
                facets["domain"] = domain.strip()
×
714
            else:
715
                msg = f"File {Path(file).name} has a nonstandard domain name."
×
NEW
716
                logger.error(msg)
×
717
                raise NotImplementedError(msg)
×
718

719
        # CORDEX-NAM on AWS mis-attributes the domain (22/44 should be 22i/44i)
720
        aws_keys = data.get("intake_esm_dataset_key")
×
721
        if aws_keys:
×
722
            facets["domain"] = aws_keys.split(".")[3]
×
723

724
        title = data.get("title")
×
725
        if title:
×
726
            regridded_domain_found = re.search(r"\w{3}-\d{2}i", title)
×
727
            if regridded_domain_found:
×
728
                facets["domain"] = regridded_domain_found.group()
×
729

730
        # The logic here is awful, but the information is bad to begin with.
731
        driving_model = ""
×
732
        driving_institution = ""
×
733

734
        driving_institution_parts = str(data["driving_model_id"]).split("-")
×
735
        if VALIDATION_ENABLED:
×
736
            if driving_institution_parts[0] in INSTITUTIONS:
×
737
                driving_institution = driving_institution_parts[0]
×
738
            elif "-".join(driving_institution_parts[:2]) in INSTITUTIONS:
×
739
                driving_institution = "-".join(driving_institution_parts[:2])
×
740
            elif "-".join(driving_institution_parts[:3]) in INSTITUTIONS:
×
741
                driving_institution = "-".join(driving_institution_parts[:3])
×
742
        else:
NEW
743
            logger.warning(
×
744
                "CORDEX Metadata validation checks require PyESSV. "
745
                "Driving institution cannot be determined."
746
            )
747
            driving_model = data["driving_model_id"]
×
748

749
        if data["driving_model_id"].startswith("GFDL"):
×
750
            driving_institution = "NOAA-GFDL"
×
751
            driving_model = f"NOAA-GFDL-{data['driving_model_id']}"
×
752
        elif data["driving_model_id"].startswith("MPI-ESM"):
×
753
            driving_institution = "MPI-M"
×
754
            driving_model = f"MPI-M-{data['driving_model_id']}"
×
755
        elif data["driving_model_id"].startswith("HadGEM2"):
×
756
            driving_institution = "MOHC"
×
757
            driving_model = f"MOHC-{data['driving_model_id']}"
×
758
        elif data["driving_model_id"].startswith("CNRM-CM5"):
×
759
            driving_institution = "CNRM-CERFACS"
×
760
            driving_model = f"CNRM-CERFACS-{data['driving_model_id']}"
×
761

762
        elif VALIDATION_ENABLED and not driving_institution:
×
763
            raise DecoderError(
×
764
                "driving_institution (from driving_model_id: "
765
                f"`{data['driving_model_id']}`) is not valid."
766
            )
767

768
        facets["driving_institution"] = driving_institution.strip()
×
769
        if driving_model:
×
770
            facets["driving_model"] = driving_model.strip()
×
771
        else:
772
            facets["driving_model"] = str(data["driving_model_id"]).strip()
×
773

774
        facets["format"] = "netcdf"
×
775

776
        if data["institute_id"].strip() == "Our.":
×
777
            facets["institution"] = "Ouranos"
×
778
        else:
779
            facets["institution"] = data["institute_id"].strip()
×
780

781
        facets["processing_level"] = "raw"
×
782
        facets["source"] = data["model_id"]
×
783
        facets["type"] = "simulation"
×
784
        facets["variable"] = variable
×
785

786
        facets.update(cls._decode_version(data=data, file=file))
×
787
        facets.update(cls._decode_hour_of_day_info(file=file))
×
788

789
        try:
×
790
            facets["frequency"] = cls._decode_time_info(
×
791
                data=data, file=file, field="frequency"
792
            )
793
            facets["timedelta"] = cls._decode_time_info(
×
794
                term=facets["frequency"], field="timedelta"
795
            )
796
            facets["date_start"] = date_parser(date)
×
797
            facets["date_end"] = date_parser(date, end_of_period=True)
×
798
        except DecoderError:  # noqa: S110
×
799
            pass
×
800

801
        try:
×
802
            facets["experiment"] = data["experiment_id"].strip()
×
803
        except KeyError:
×
804
            facets["experiment"] = data["driving_experiment_name"].strip()
×
805

806
        try:
×
807
            for potential_member in ["parent_experiment_rip", "parent_experiment"]:
×
808
                facets["member"] = data.get(potential_member)
×
809
                if facets["member"] == "N/A":
×
810
                    raise KeyError()
×
811
                else:
812
                    break
×
813
            if facets["member"] is None:
×
814
                raise KeyError()
×
815
        except KeyError:
×
816
            facets["member"] = data["driving_model_ensemble_member"].strip()
×
817

818
        return facets
×
819

820
    @classmethod
    def decode_isimip_ft(cls, file: PathLike | str) -> dict:
        """Decode dataset facets for ISIMIP fast-track simulation files.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            Mapping of facet names to values gathered from the dataset's
            global attributes; empty if the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable or malformed dataset: return no facets rather than fail.
            return facets

        facets["activity"] = "ISIMIP"
        facets["mip_era"] = data["project_id"]
        facets["date"] = date
        facets["domain"] = "global"
        facets["co2_forcing_id"] = data["co2_forcing_id"]
        facets["experiment"] = data["experiment_id"]
        facets["format"] = "netcdf"
        facets["impact_model"] = data["impact_model_id"]
        facets["institution"] = data["institute_id"]
        facets["member"] = data["driving_model_ensemble_member"]
        facets["modeling_realm"] = data["modeling_realm"]
        facets["social_forcing_id"] = data["social_forcing_id"]
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            # Pass `file` as well so frequency detection can fall back to
            # filename parsing, consistent with the other decoders.
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time facets are optional; leave them unset when undecodable.
            pass

        return facets
    @classmethod
    def decode_nex_gddp_cmip6(cls, file: PathLike | str) -> dict:
        """Decode dataset facets for NEX-GDDP-CMIP6 bias-adjusted files.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            Mapping of facet names to values gathered from the dataset's
            global attributes; empty if the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Nothing decodable: hand back an empty mapping.
            return facets

        scenario = data["scenario"]
        facets.update(
            {
                "experiment": scenario,
                # Historical runs belong to CMIP; everything else is a scenario run.
                "activity": "CMIP" if scenario == "historical" else "ScenarioMIP",
                "institution": data["cmip6_institution_id"],
                "member": data["variant_label"],
                "processing_level": "biasadjusted",
                "bias_adjust_project": "NEX-GDDP-CMIP6",
                "bias_adjust_institution": "NASA",
                "mip_era": "CMIP6",
                "source": data["cmip6_source_id"],
                "type": "simulation",
                "variable": variable,
            }
        )
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time-related facets are best-effort; skip them when undecodable.
            pass

        return facets
    @classmethod
    def decode_espo_g6_r2(cls, file: PathLike | str) -> dict:
        """Decode dataset facets for ESPO-G6-R2 bias-adjusted files.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            Mapping of facet names to values gathered from the dataset's
            global attributes; empty if the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Nothing decodable: hand back an empty mapping.
            return facets

        facets["bias_adjust_project"] = "ESPO-G6-R2"
        facets["processing_level"] = "biasadjusted"
        # Version is pinned for this project rather than read from attributes.
        facets["version"] = "1.0.0"
        facets["domain"] = "NAM"

        # The remaining facets mirror the `cat:`-prefixed global attributes.
        for key in (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        ):
            facets[key] = data[f"cat:{key}"]
        facets["variable"] = variable
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time-related facets are best-effort; skip them when undecodable.
            pass

        return facets
    @classmethod
    def decode_espo_g6_e5l(cls, file: PathLike | str) -> dict:
        """Decode dataset facets for ESPO-G6-E5L bias-adjusted files.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the dataset file to decode.

        Returns
        -------
        dict
            Mapping of facet names to values gathered from the dataset's
            global attributes; empty if the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Nothing decodable: hand back an empty mapping.
            return facets

        facets["bias_adjust_project"] = "ESPO-G6-E5L"
        facets["processing_level"] = "biasadjusted"
        # Version is pinned for this project rather than read from attributes.
        facets["version"] = "1.0.0"
        facets["domain"] = "NAM"

        # The remaining facets mirror the `cat:`-prefixed global attributes.
        for key in (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        ):
            facets[key] = data[f"cat:{key}"]
        facets["variable"] = variable
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time-related facets are best-effort; skip them when undecodable.
            pass

        return facets
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc