• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ouranosinc / miranda / 17134907451

21 Aug 2025 05:56PM UTC coverage: 18.726% (-0.2%) from 18.928%
17134907451

Pull #264

github

web-flow
Merge f68200794 into 35f4337b1
Pull Request #264: No longer initialize logging within library

48 of 231 new or added lines in 28 files covered. (20.78%)

3 existing lines in 3 files now uncovered.

1199 of 6403 relevant lines covered (18.73%)

1.3 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

11.64
/src/miranda/decode/_decoder.py
1
from __future__ import annotations
7✔
2

3
import logging
7✔
4
import multiprocessing as mp
7✔
5
import os
7✔
6
import re
7✔
7
import warnings
7✔
8
from functools import partial
7✔
9
from os import PathLike
7✔
10
from pathlib import Path
7✔
11
from types import GeneratorType
7✔
12

13
import netCDF4 as nc  # noqa
7✔
14
import pandas as pd
7✔
15
import schema
7✔
16
import xarray as xr
7✔
17
import zarr
7✔
18
from pandas._libs.tslibs import NaTType  # noqa
7✔
19

20
from miranda.convert.utils import date_parser, find_version_hash  # noqa
7✔
21
from miranda.cv import VALIDATION_ENABLED
7✔
22
from miranda.scripting import LOGGING_CONFIG
7✔
23
from miranda.units import check_time_frequency
7✔
24

25
from ._time import TIME_UNITS_TO_FREQUENCY, TIME_UNITS_TO_TIMEDELTA, DecoderError
7✔
26

27
if VALIDATION_ENABLED:
7✔
28
    from miranda.cv import INSTITUTIONS, PROJECT_MODELS
7✔
29
    from miranda.validate import FACETS_SCHEMA  # noqa
7✔
30

31

32
# Module-level logger; handler/level configuration is left to the consuming
# application (this library no longer initializes logging itself).
logger = logging.getLogger("miranda.decode.decoder")

# Public API of this module.
__all__ = [
    "Decoder",
    "guess_project",
]
38

39

40
def guess_project(file: os.PathLike | str) -> str:
    """Guess the name of the project from a file name.

    The file's stem is split on underscores and each segment is compared
    against the known model names of every registered project.

    Parameters
    ----------
    file : str or os.PathLike
        Path whose stem is examined for known model names.

    Returns
    -------
    str
        The first project with a model name matching a file-name segment.

    Raises
    ------
    DecoderError
        If no project can be determined, or if the pyessv-archive validation
        sources are unavailable.
    """
    file_name = Path(file).stem

    potential_names = file_name.split("_")
    if VALIDATION_ENABLED:
        for project, models in PROJECT_MODELS.items():
            # Generator expression: no need to build an intermediate list.
            if any(model in potential_names for model in models):
                return project
        raise DecoderError(
            f"Unable to determine project from file name: '{file_name}'."
        )
    raise DecoderError("Project determination requires pyessv-archive source files.")
62

63

64
class Decoder:
    """Deciphers dataset facets from file names and metadata."""

    # Project name used for dispatching to a decode_* method; set per
    # instance in ``__init__``.
    project = None
    # Whether to guess the project from the file name when it is unset.
    guess = False
    # NOTE(review): class-level mutable dict — it is shared across all
    # instances (``decode`` updates it in place); confirm this is intended.
    _file_facets = dict()
7✔
68

69
    def __init__(self, project: str | None):
7✔
70
        self.project = project
×
71

72
    @staticmethod
    def _decoder(
        d: dict,
        fail_early: bool,
        proj: str,
        guess: bool,
        lock: mp.Lock,
        file: str | Path,
    ) -> None:
        """Decode facets for one file and store them in a shared mapping.

        Runs inside a multiprocessing worker: ``d`` is a manager-backed dict
        shared between processes and ``lock`` serializes the decode step.

        Parameters
        ----------
        d : dict
            Shared mapping, updated in place with ``{file: facets}``.
        fail_early : bool
            If True, re-raise project-guessing failures and validate facets.
        proj : str
            Project name; when None it is guessed or defaults to "converted".
        guess : bool
            Whether to guess the project from the file name when unset.
        lock : mp.Lock
            Lock serializing the decode step across workers.
        file : str or Path
            File to decode.
        """
        if proj is None:
            if guess:
                try:
                    proj = guess_project(file)
                except DecoderError:
                    print(
                        "Unable to determine 'activity': Signature for 'activity' must be set manually for file: "
                        f"{file}."
                    )
                    # NOTE(review): when fail_early is False, ``proj`` stays
                    # None here and ``proj.lower()`` below raises
                    # AttributeError (caught and re-raised) — confirm intended.
                    if fail_early:
                        raise
            else:
                proj = "converted"

        # Dispatch to the project-specific classmethod, e.g. "decode_cmip6".
        decode_function_name = f"decode_{proj.lower().replace('-', '_')}"
        try:
            with lock:
                _deciphered = getattr(Decoder, decode_function_name)(Path(file))
                if fail_early:
                    if VALIDATION_ENABLED:
                        FACETS_SCHEMA.validate(_deciphered)
                    else:
                        print(
                            "Validation requires pyessv-archive source files. Skipping validation checks."
                        )
                print(
                    f"Deciphered the following from {Path(file).name}:\n"
                    f"{_deciphered.items()}"
                )
                d[file] = _deciphered

        except (AttributeError, NotImplementedError):
            print(f"Unable to read data from {Path(file)}. Ensure pathname is correct.")
            raise
        except schema.SchemaError as e:
            print(f"Decoded facets from {Path(file).name} are not valid: {e}")
×
117

118
    def decode(
        self,
        files: os.PathLike | str | list[str | os.PathLike] | GeneratorType,
        chunks: int | None = None,
        raise_error: bool = False,
    ) -> None:
        """Decode facets from file or list of files.

        Parameters
        ----------
        files : str or Path or list of str or Path or generator
            The files to decode.
        chunks : int, optional
            The chunk size used when processing files. Not to be confused with xarray chunks for dimensions.
        raise_error : bool
            Whether to raise an error if a file cannot be decoded.
        """
        if isinstance(files, (str, os.PathLike)):
            files = [files]

        # FIX: an explicitly requested chunk size always wins. Previously it
        # was silently ignored when ``files`` was a generator, and could even
        # remain None (breaking ``Pool.imap``) for other iterables.
        if chunks is not None:
            chunk_size = chunks
        elif isinstance(files, list):
            if len(files) >= 10:
                chunk_size = 10
            elif 1 <= len(files) < 10:
                chunk_size = len(files)
            else:
                raise ValueError("No file entries found.")
        else:
            # Generators and other iterables of unknown length.
            chunk_size = 10

        if self.project is None:
            warnings.warn(
                "The decoder 'project' is not set; Decoding step will be much slower."
            )
        else:
            msg = f"Deciphering metadata with project = '{self.project}'"
            logger.info(msg)

        with mp.Manager() as manager:
            # Manager-backed dict/lock are shared with the worker processes.
            _file_facets = manager.dict()
            lock = manager.Lock()
            func = partial(
                self._decoder, _file_facets, raise_error, self.project, self.guess, lock
            )

            with mp.Pool() as pool:
                pool.imap(func, files, chunksize=chunk_size)
                pool.close()
                pool.join()

            self._file_facets.update(_file_facets)
×
171

172
    def facets_table(self):
        """Tabulate decoded facets (not yet implemented)."""
        raise NotImplementedError()
×
174

175
    def file_facets(self) -> dict[os.PathLike, dict]:
        """Return the mapping of file path to decoded facets gathered so far.

        Note: this is a live reference, not a copy.
        """
        return self._file_facets
×
177

178
    @classmethod
    def _from_dataset(cls, file: Path | str) -> tuple[str, str, dict]:
        """Read the primary variable, date segment, and global attributes of a dataset.

        Parameters
        ----------
        file : Path or str
            Path to a netCDF file or zarr store.

        Returns
        -------
        tuple[str, str, dict]
            The primary variable name, the date portion of the file stem,
            and the dataset's global attributes.

        Raises
        ------
        DecoderError
            If the dataset cannot be opened or is neither netCDF nor zarr.
        """
        # FIX: coerce to Path up front — the annotation advertises ``str``
        # support, but ``.is_file()``/``.suffix``/``.name`` below previously
        # crashed when a plain string was passed.
        file = Path(file)
        file_name = file.stem

        try:
            variable_name = cls._decode_primary_variable(file)
        except DecoderError:
            msg = f"Unable to open dataset: {file.name}"
            logger.error(msg)
            raise

        # By convention the trailing underscore-delimited segment holds dates.
        datetimes = file_name.split("_")[-1]

        if file.is_file() and file.suffix in [".nc", ".nc4"]:
            with nc.Dataset(file, mode="r") as ds:
                data = {k: getattr(ds, k) for k in ds.ncattrs()}
        elif file.is_dir() and file.suffix == ".zarr":
            with zarr.open(file, mode="r") as ds:
                data = ds.attrs.asdict()
        else:
            raise DecoderError(f"Unable to read dataset: `{file.name}`.")
        return variable_name, datetimes, data
×
202

203
    @staticmethod
    def _decode_primary_variable(file: Path) -> str:
        """Attempts to find the primary variable of a netCDF

        Returns the first non-coordinate variable whose name also appears in
        the file's stem.

        Parameters
        ----------
        file: Path
            Path to a netCDF file or zarr store.

        Returns
        -------
        str
        """
        dimsvar_dict = dict()
        # Names considered coordinates/ancillary — never the primary variable.
        coords = (
            "height",
            "lat",
            "latitude",
            "lev",
            "level",
            "lon",
            "longitude",
            "rlat",
            "rlon",
            "rotated_pole",
            "time",
        )
        try:
            if file.is_file() and file.suffix in [".nc", ".nc4"]:
                with nc.Dataset(file, mode="r") as ds:
                    for var_name, var_attrs in ds.variables.items():
                        dimsvar_dict[var_name] = {
                            k: var_attrs.getncattr(k) for k in var_attrs.ncattrs()
                        }
                for k in dimsvar_dict.keys():
                    # str.startswith accepts the whole tuple of coord names.
                    if not str(k).startswith(coords) and k in file.stem:
                        return str(k)

            elif file.is_dir() and file.suffix == ".zarr":
                with zarr.open(str(file), mode="r") as ds:
                    for k in ds.array_keys():
                        if not str(k).startswith(coords) and k in file.stem:
                            return str(k)
            else:
                raise NotImplementedError()
        except ValueError:
            # NOTE(review): the original ValueError context is discarded;
            # consider ``raise DecoderError() from err``.
            raise DecoderError()
        # NOTE(review): falls through and implicitly returns None when no
        # candidate variable matches — callers should be prepared for that.
×
249

250
    @staticmethod
7✔
251
    def _decode_hour_of_day_info(
7✔
252
        file: PathLike | str,
253
    ) -> dict:
254
        """Decode hour of day information.
255

256
        Parameters
257
        ----------
258
        file : Path or str
259

260
        Returns
261
        -------
262
        dict
263
        """
264
        if isinstance(file, str):
×
265
            file = Path(file)
×
266

267
        if file.is_file() and file.suffix in [".nc", ".nc4"]:
×
268
            with nc.Dataset(file, mode="r") as ds:
×
269
                if "time" in ds.variables.keys():
×
270
                    hour = nc.num2date(
×
271
                        ds["time"][0], ds["time"].units, ds["time"].calendar
272
                    ).hour
273
                else:
274
                    hour = None
×
275
            return dict(hour_of_day=hour)
×
276

277
        elif file.is_dir() and file.suffix == ".zarr":
×
278
            warnings.warn("This is not currently implemented")
×
279

280
            # with zarr.open(str(file), mode="r") as ds:
281
            #     if "time" in ds.array_keys():
282
            #         pass
283

284
            return dict()
×
285

286
        else:
287
            raise NotImplementedError()
×
288

289
    @staticmethod
7✔
290
    def _decode_time_info(  # noqa: C901
7✔
291
        file: PathLike | str | list[str] | None = None,
292
        data: dict | None = None,
293
        term: str | None = None,
294
        *,
295
        field: str | None = None,
296
    ) -> str | NaTType:
297
        """Decode time information.
298

299
        Parameters
300
        ----------
301
        file : os.PathLike or str, optional
302
        data : dict, optional
303
        term : str
304
        field : {"timedelta", "frequency"}
305

306
        Returns
307
        -------
308
        str or NaTType
309
        """
310
        if not file and not data and not term:
×
311
            raise ValueError("Nothing passed to parse time info from.")
×
312

313
        if field == "frequency":
×
314
            time_dictionary = TIME_UNITS_TO_FREQUENCY
×
315
        elif field == "timedelta":
×
316
            time_dictionary = TIME_UNITS_TO_TIMEDELTA
×
317
        else:
318
            raise NotImplementedError()
×
319

320
        if term:
×
321
            if term in ["fx", "fixed"]:
×
322
                if field == "timedelta":
×
323
                    return pd.NaT
×
324
                return "fx"
×
325
            return pd.to_timedelta(time_dictionary[term])
×
326

327
        if data and not file:
×
328
            potential_time = data.get("frequency")
×
329
            if not potential_time:
×
330
                if hasattr(data, "time"):
×
331
                    time_units = data["time"].units
×
332
                    potential_time = time_units.split()[0]
×
333
                else:
334
                    msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."
×
335

NEW
336
                    logger.warning(msg)
×
337
                    potential_time = "fx"
×
338
            if potential_time in ["ymon", "yseas", "fixed", "fx"]:
×
339
                msg = f"Found `{potential_time}`. Frequency is likely `fx`."
×
NEW
340
                logger.warning(msg)
×
341
                if field == "frequency":
×
342
                    return "fx"
×
343
                if field == "timedelta":
×
344
                    return pd.NaT
×
345
                raise ValueError()
×
346

347
            if field == "timedelta":
×
348
                if potential_time in ["fx", "fixed"]:
×
349
                    return pd.NaT
×
350
                return pd.to_timedelta(time_dictionary[potential_time])
×
351
            return time_dictionary[potential_time]
×
352

353
        if file and not data:
×
354
            for delimiter in ["_", "."]:
×
355
                file_parts = Path(file).stem.split(delimiter)
×
356
                potential_times = [
×
357
                    segment
358
                    for segment in file_parts
359
                    if segment in time_dictionary.keys()
360
                ]
361
                if potential_times:
×
362
                    if potential_times[0] in ["fx", "fixed"]:
×
363
                        if field == "frequency":
×
364
                            return "fx"
×
365
                        if field == "timedelta":
×
366
                            return pd.NaT
×
367
                        raise ValueError(f"Field `{field}` not supported.")
×
368
                    if field == "timedelta":
×
369
                        return pd.to_timedelta(time_dictionary[potential_times[0]])
×
370
                    return time_dictionary[potential_times[0]]
×
371

372
        if file and data:
×
373
            for delimiter in ["_", "."]:
×
374
                file_parts = Path(file).stem.split(delimiter)
×
375
                potential_times = [
×
376
                    segment
377
                    for segment in file_parts
378
                    if segment in time_dictionary.keys()
379
                ]
380
                potential_time = data.get("frequency", "")
×
381
                if potential_time == "":
×
382
                    if hasattr(data, "time"):
×
383
                        time_units = data["time"].units
×
384
                        potential_time = time_units.split()[0]
×
385
                    else:
386
                        msg = f"Could not find `frequency` or `time` for {Path(file).name}. Assuming `fx`."
×
387

NEW
388
                        logger.warning(msg)
×
389
                        potential_time = "fx"
×
390
                if potential_time in ["ymon", "yseas", "fixed", "fx"]:
×
391
                    msg = f"Found `{potential_time}`. Frequency is likely `fx`."
×
392

NEW
393
                    logger.warning(msg)
×
394
                    if "fx" in file_parts or "fixed" in file_parts:
×
395
                        if field == "frequency":
×
396
                            return "fx"
×
397
                        if field == "timedelta":
×
398
                            return pd.NaT
×
399
                        raise ValueError(f"Field `{field}` not supported.")
×
400

401
                if potential_time in potential_times:
×
402
                    return time_dictionary[potential_time]
×
403
                elif potential_times:
×
404
                    break
×
405

406
            msg = (
×
407
                f"Frequency from metadata (`{potential_time}`) not found in filename (`{Path(file).name}`): "
408
                "Performing more rigorous frequency checks."
409
            )
NEW
410
            logger.warning(msg)
×
411
            if Path(file).is_file() and Path(file).suffix in [".nc", ".nc4"]:
×
412
                engine = "h5netcdf"
×
413
            elif Path(file).is_dir() and Path(file).suffix == ".zarr":
×
414
                engine = "zarr"
×
415
            else:
416
                raise DecoderError(
×
417
                    f"File is not valid netcdf or zarr: {Path(file).name}"
418
                )
419

420
            _ds = xr.open_dataset(
×
421
                file,
422
                engine=engine,
423
                drop_variables="time_bnds",
424
            )
425
            if not hasattr(_ds, "time"):
×
NEW
426
                logger.warning(
×
427
                    "Dataset does not contain time array. Assuming fixed variable."
428
                )
429
                if field == "frequency":
×
430
                    return "fx"
×
431
                if field == "timedelta":
×
432
                    return pd.NaT
×
433
                raise ValueError(f"Field `{field}` not supported.")
×
434
            else:
435
                _, found_freq = check_time_frequency(_ds.time)
×
436

437
            if found_freq in potential_times:
×
438
                msg = (
×
439
                    "Time frequency found in dataset on analysis was found in filename. "
440
                    f"Metadata for `{Path(file).name} is probably incorrect. "
441
                    f"Basing fields on `{found_freq}`."
442
                )
NEW
443
                logger.warning(msg)
×
444
                return time_dictionary[found_freq]
×
445
            elif found_freq in ["month", "mon"]:
×
446
                for f in ["Amon", "Omon", "monC", "monthly", "months", "mon"]:
×
447
                    if f in potential_times:
×
448
                        msg = f"Month-like time frequency found in dataset on analysis was found in filename. Basing fields on `{f}`."
×
NEW
449
                        logger.warning(msg)
×
450
                        return time_dictionary[f]
×
451
            else:
452
                msg = (
×
453
                    "Time frequency found in dataset on analysis was not found in filename. "
454
                    f"Basing fields on `{found_freq}`."
455
                )
NEW
456
                logger.warning(msg)
×
457
                return time_dictionary[found_freq]
×
458
        raise DecoderError(f"Time frequency indiscernible for file `{file}`.")
×
459

460
    @staticmethod
7✔
461
    def _decode_version(file: PathLike | str, data: dict) -> dict:
7✔
462
        """Decode version information.
463

464
        Parameters
465
        ----------
466
        file : os.PathLike or str
467
        data : dict
468

469
        Returns
470
        -------
471
        dict
472
        """
473
        version_info = dict()
×
474
        try:
×
475
            version_info["version"] = data["version"]
×
476
        except KeyError:
×
477
            possible_version = Path(file).parent
×
478
            if re.match(r"^[vV]\d+", possible_version.name):
×
479
                version_info["version"] = possible_version.name
×
480
            else:
481
                possible_version_signature = possible_version.glob(
×
482
                    f"{Path(file).stem}.v*"
483
                )
484
                for sig in possible_version_signature:
×
485
                    found_version = re.match(r"([vV]\d+)$", sig.suffix)
×
486
                    if found_version:
×
487
                        version_info["version"] = found_version.group()
×
488
                        version_info["sha256sum"] = sig.open().read()
×
489
                        break
×
490
                else:
491
                    version_info["version"] = "vNotFound"
×
492
        return version_info
×
493

494
    @classmethod
    def decode_converted(cls, file: PathLike | str) -> dict:
        """Decode converted data.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets.update(data)
        # FIX: not every converted dataset carries a "history" attribute;
        # ``del facets["history"]`` raised KeyError when it was missing.
        facets.pop("history", None)

        facets["date"] = date

        file_format = data.get("output_format")
        if file_format:
            facets["format"] = file_format
        elif "format" in data:
            facets["format"] = data["format"]
        elif Path(file).suffix in [".nc", ".nc4"]:
            facets["format"] = "nc"
        elif Path(file).suffix in [".zarr"]:
            facets["format"] = "zarr"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            if "frequency" not in facets:
                # FIX: the decoded frequency was stored under the "timedelta"
                # key, so the facets["frequency"] lookup below raised an
                # uncaught KeyError (all sibling decoders use "frequency").
                facets["frequency"] = cls._decode_time_info(
                    data=data, file=file, field="frequency"
                )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
×
545

546
    @staticmethod
7✔
547
    def decode_eccc_obs(self, file: PathLike | str) -> dict:
7✔
548
        raise NotImplementedError()
×
549

550
    @staticmethod
7✔
551
    def decode_ahccd_obs(self, file: PathLike | str) -> dict:
7✔
552
        raise NotImplementedError()
×
553

554
    @staticmethod
7✔
555
    def decode_melcc_obs(self, file: PathLike | str) -> dict:
7✔
556
        raise NotImplementedError()
×
557

558
    @classmethod
7✔
559
    def decode_pcic_candcs_u6(cls, file: PathLike | str) -> dict:
7✔
560
        if "Derived" in Path(file).parents:
×
561
            raise NotImplementedError("Derived CanDCS-U6 variables are not supported.")
×
562

563
        facets = dict()
×
564
        try:
×
565
            variable, date, data = cls._from_dataset(file=file)
×
566
        except DecoderError:
×
567
            return facets
×
568

569
        facets["activity"] = data["activity_id"]
×
570
        facets["mip_era"] = data["project_id"]
×
571
        facets["bias_adjust_institution"] = "PCIC"
×
572
        facets["date"] = date
×
573
        facets["domain"] = data["domain"]
×
574
        facets["experiment"] = str(data["GCM__experiment_id"]).replace(",", "-")
×
575
        facets["format"] = "netcdf"
×
576
        facets["institution"] = data["GCM__institution_id"]
×
577
        facets["member"] = (
×
578
            f"r{data['GCM__realization_index']}"
579
            f"i{data['GCM__initialization_index']}"
580
            f"p{data['GCM__physics_index']}"
581
            f"f{data['GCM__forcing_index']}"
582
        )
583
        facets["processing_level"] = "biasadjusted"
×
584
        facets["bias_adjust_project"] = "CanDCS-U6"
×
585
        facets["source"] = data["GCM__source_id"]
×
586
        facets["type"] = "simulation"
×
587
        facets["variable"] = variable
×
588

589
        facets["version"] = f"v{data.get('GCM__data_specs_version')}"
×
590
        if facets["version"] is None:
×
591
            facets.update(find_version_hash(file=file))
×
592

593
        facets.update(cls._decode_hour_of_day_info(file=file))
×
594

595
        try:
×
596
            facets["frequency"] = cls._decode_time_info(
×
597
                data=data, file=file, field="frequency"
598
            )
599
            facets["timedelta"] = cls._decode_time_info(
×
600
                term=facets["frequency"], field="timedelta"
601
            )
602
            facets["date_start"] = date_parser(date)
×
603
            facets["date_end"] = date_parser(date, end_of_period=True)
×
604
        except DecoderError:  # noqa: S110
×
605
            pass
×
606

607
        return facets
×
608

609
    @classmethod
    def decode_cmip6(cls, file: PathLike | str) -> dict:
        """Decode CMIP6 data facets from a dataset's metadata and file name.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        # Static facets and direct metadata lookups in one pass.
        facets.update(
            {
                "activity": data["activity_id"],
                "date": date,
                "domain": "global",
                "experiment": data["experiment_id"],
                "format": "netcdf",
                "grid_label": data["grid_label"],
                "institution": data["institution_id"],
                "member": data["variant_label"],
                "modeling_realm": data["realm"],
                "processing_level": "raw",
                "mip_era": data["mip_era"],
                "source": data["source_id"],
                "type": "simulation",
                "variable": variable,
            }
        )
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
×
647

648
    @classmethod
    def decode_cmip5(cls, file: PathLike | str) -> dict:
        """Decode CMIP5 data facets from a dataset's metadata and file name.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.
        """
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # Static facets and direct metadata lookups assembled up front.
        facets = {
            "activity": "CMIP",
            "date": date,
            "domain": "global",
            "experiment": data["experiment_id"],
            "format": "netcdf",
            "institution": data["institute_id"],
            "member": data["parent_experiment_rip"],
            "modeling_realm": data["modeling_realm"],
            "processing_level": "raw",
            "mip_era": data["project_id"],
            "source": data["model_id"],
            "type": "simulation",
            "variable": variable,
        }
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
×
685

686
    @classmethod
    def decode_cordex(cls, file: PathLike | str) -> dict:
        """Decode CORDEX data facets from a dataset's metadata and file name.

        Parameters
        ----------
        file : os.PathLike or str

        Returns
        -------
        dict
            Decoded facets; empty when the dataset cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return dict()

        # FIXME: What to do about our internal data that breaks all established conventions?
        facets["activity"] = "CORDEX"

        if data.get("project_id") == "" or data.get("project_id") is None:
            facets["mip_era"] = "internal"
        elif data.get("project_id") == "CORDEX":
            facets["mip_era"] = "CMIP5"

        # "r0i0p0" in the date slot marks a fixed-field (time-invariant) file.
        if date == "r0i0p0":
            facets["date"] = "fx"
        else:
            facets["date"] = date

        domain = data.get("CORDEX_domain")
        if domain:
            facets["domain"] = domain.strip()
        else:
            domain = data.get("ouranos_domain_name")
            if domain:
                facets["domain"] = domain.strip()
            else:
                msg = f"File {Path(file).name} has a nonstandard domain name."
                logger.error(msg)
                raise NotImplementedError(msg)

        # CORDEX-NAM on AWS mis-attributes the domain (22/44 should be 22i/44i)
        aws_keys = data.get("intake_esm_dataset_key")
        if aws_keys:
            facets["domain"] = aws_keys.split(".")[3]

        title = data.get("title")
        if title:
            regridded_domain_found = re.search(r"\w{3}-\d{2}i", title)
            if regridded_domain_found:
                facets["domain"] = regridded_domain_found.group()

        # The logic here is awful, but the information is bad to begin with.
        driving_model = ""
        driving_institution = ""

        # Try progressively longer hyphenated prefixes of driving_model_id
        # as the institution name.
        driving_institution_parts = str(data["driving_model_id"]).split("-")
        if VALIDATION_ENABLED:
            if driving_institution_parts[0] in INSTITUTIONS:
                driving_institution = driving_institution_parts[0]
            elif "-".join(driving_institution_parts[:2]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:2])
            elif "-".join(driving_institution_parts[:3]) in INSTITUTIONS:
                driving_institution = "-".join(driving_institution_parts[:3])
        else:
            logger.warning(
                "CORDEX Metadata validation checks require PyESSV. "
                "Driving institution cannot be determined."
            )
            driving_model = data["driving_model_id"]

        # Known models whose ids lack (or garble) the institution prefix.
        if data["driving_model_id"].startswith("GFDL"):
            driving_institution = "NOAA-GFDL"
            driving_model = f"NOAA-GFDL-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("MPI-ESM"):
            driving_institution = "MPI-M"
            driving_model = f"MPI-M-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("HadGEM2"):
            driving_institution = "MOHC"
            driving_model = f"MOHC-{data['driving_model_id']}"
        elif data["driving_model_id"].startswith("CNRM-CM5"):
            driving_institution = "CNRM-CERFACS"
            driving_model = f"CNRM-CERFACS-{data['driving_model_id']}"

        elif VALIDATION_ENABLED and not driving_institution:
            raise DecoderError(
                "driving_institution (from driving_model_id: "
                f"`{data['driving_model_id']}`) is not valid."
            )

        facets["driving_institution"] = driving_institution.strip()
        if driving_model:
            facets["driving_model"] = driving_model.strip()
        else:
            facets["driving_model"] = str(data["driving_model_id"]).strip()

        facets["format"] = "netcdf"

        # "Our." is a legacy in-house abbreviation for Ouranos.
        if data["institute_id"].strip() == "Our.":
            facets["institution"] = "Ouranos"
        else:
            facets["institution"] = data["institute_id"].strip()

        facets["processing_level"] = "raw"
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        try:
            facets["experiment"] = data["experiment_id"].strip()
        except KeyError:
            facets["experiment"] = data["driving_experiment_name"].strip()

        # A member value of "N/A" or None is treated as missing via the
        # KeyError path and falls back to driving_model_ensemble_member.
        try:
            for potential_member in ["parent_experiment_rip", "parent_experiment"]:
                facets["member"] = data.get(potential_member)
                if facets["member"] == "N/A":
                    raise KeyError()
                else:
                    break
            if facets["member"] is None:
                raise KeyError()
        except KeyError:
            facets["member"] = data["driving_model_ensemble_member"].strip()

        return facets
×
819

820
    @classmethod
    def decode_isimip_ft(cls, file: PathLike | str) -> dict:
        """Decode an ISIMIP Fast-Track dataset file into metadata facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the NetCDF file to decode.

        Returns
        -------
        dict
            Mapping of facet names to values; empty if the file cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            # Unreadable or unsupported file: return no facets rather than fail.
            return facets

        facets["activity"] = "ISIMIP"
        facets["mip_era"] = data["project_id"]
        facets["date"] = date
        facets["domain"] = "global"
        facets["co2_forcing_id"] = data["co2_forcing_id"]
        facets["experiment"] = data["experiment_id"]
        facets["format"] = "netcdf"
        facets["impact_model"] = data["impact_model_id"]
        facets["institution"] = data["institute_id"]
        facets["member"] = data["driving_model_ensemble_member"]
        facets["modeling_realm"] = data["modeling_realm"]
        facets["social_forcing_id"] = data["social_forcing_id"]
        facets["source"] = data["model_id"]
        facets["type"] = "simulation"
        facets["variable"] = variable

        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            # Pass `file` as well so the frequency lookup can fall back to the
            # file name, consistent with the other decode_* methods.
            facets["frequency"] = cls._decode_time_info(
                data=data, file=file, field="frequency"
            )
            facets["timedelta"] = cls._decode_time_info(
                term=facets["frequency"], field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            # Time facets are best-effort; leave them unset when undecodable.
            pass

        return facets

859
    @classmethod
    def decode_nex_gddp_cmip6(cls, file: PathLike | str) -> dict:
        """Decode a NEX-GDDP-CMIP6 bias-adjusted dataset file into metadata facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the NetCDF file to decode.

        Returns
        -------
        dict
            Mapping of facet names to values; empty if the file cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        experiment = data["scenario"]
        facets.update(
            {
                "experiment": experiment,
                # Historical runs belong to CMIP; projections to ScenarioMIP.
                "activity": "CMIP" if experiment == "historical" else "ScenarioMIP",
                "institution": data["cmip6_institution_id"],
                "member": data["variant_label"],
                "processing_level": "biasadjusted",
                "bias_adjust_project": "NEX-GDDP-CMIP6",
                "bias_adjust_institution": "NASA",
                "mip_era": "CMIP6",
                "source": data["cmip6_source_id"],
                "type": "simulation",
                "variable": variable,
            }
        )
        facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets

897
    @classmethod
    def decode_espo_g6_r2(cls, file: PathLike | str) -> dict:
        """Decode an ESPO-G6-R2 bias-adjusted dataset file into metadata facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the NetCDF file to decode.

        Returns
        -------
        dict
            Mapping of facet names to values; empty if the file cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets["bias_adjust_project"] = "ESPO-G6-R2"
        facets["processing_level"] = "biasadjusted"
        facets["version"] = "1.0.0"
        facets["domain"] = "NAM"

        # These facets are copied verbatim from the dataset's `cat:`-prefixed
        # global attributes.
        passthrough = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        for key in passthrough:
            facets[key] = data[f"cat:{key}"]
        facets["variable"] = variable
        # facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets

938
    @classmethod
    def decode_espo_g6_e5l(cls, file: PathLike | str) -> dict:
        """Decode an ESPO-G6-E5L bias-adjusted dataset file into metadata facets.

        Parameters
        ----------
        file : os.PathLike or str
            Path to the NetCDF file to decode.

        Returns
        -------
        dict
            Mapping of facet names to values; empty if the file cannot be read.
        """
        facets = dict()
        try:
            variable, date, data = cls._from_dataset(file=file)
        except DecoderError:
            return facets

        facets["bias_adjust_project"] = "ESPO-G6-E5L"
        facets["processing_level"] = "biasadjusted"
        facets["version"] = "1.0.0"
        facets["domain"] = "NAM"

        # These facets are copied verbatim from the dataset's `cat:`-prefixed
        # global attributes.
        passthrough = (
            "experiment",
            "activity",
            "institution",
            "member",
            "bias_adjust_institution",
            "mip_era",
            "source",
            "type",
        )
        for key in passthrough:
            facets[key] = data[f"cat:{key}"]
        facets["variable"] = variable
        # facets.update(cls._decode_version(data=data, file=file))
        facets.update(cls._decode_hour_of_day_info(file=file))

        try:
            frequency = cls._decode_time_info(data=data, file=file, field="frequency")
            facets["frequency"] = frequency
            facets["timedelta"] = cls._decode_time_info(
                term=frequency, field="timedelta"
            )
            facets["date_start"] = date_parser(date)
            facets["date_end"] = date_parser(date, end_of_period=True)
        except DecoderError:  # noqa: S110
            pass

        return facets
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc