/pymrio/tools/ioparser.py

"""
Various parsers for available MRIOs and files in a similar format

KST 20140903
"""

import logging
import os
import re
import warnings
import zipfile
from collections import namedtuple

import numpy as np
import pandas as pd

# Constants and global variables
from pymrio.core.constants import PYMRIO_PATH
from pymrio.core.fileio import load_all
from pymrio.core.mriosystem import Extension, IOSystem
from pymrio.tools.iometadata import MRIOMetaData
from pymrio.tools.ioutil import get_repo_content, sniff_csv_format


# Exceptions
class ParserError(Exception):
    """Base class for errors concerning parsing of IO source files"""

    pass


class ParserWarning(UserWarning):
    """Base class for warnings concerning parsing of IO source files"""

    pass


IDX_NAMES = {
    "Z_col": ["region", "sector"],
    "Z_row": ["region", "sector"],
    "Z_row_unit": ["region", "sector", "unit"],
    "A_col": ["region", "sector"],
    "A_row": ["region", "sector"],
    "A_row_unit": ["region", "sector", "unit"],
    "Y_col1": ["region"],
    "Y_col2": ["region", "category"],
    "Y_row": ["region", "sector"],
    "Y_row_unit": ["region", "sector", "unit"],
    "F_col": ["region", "sector"],
    "F_row_single": ["stressor"],
    "F_row_unit": ["stressor", "unit"],
    "F_row_comp_unit": ["stressor", "compartment", "unit"],
    "F_row_src_unit": ["stressor", "source", "unit"],
    "F_row_src": ["stressor", "source"],
    "VA_row_single": ["inputtype"],
    "VA_row_unit": ["inputtype", "unit"],
    "VA_row_unit_cat": ["inputtype", "category"],
    "unit": ["unit"],
    "_reg_sec_unit": ["region", "sector", "unit"],
}


# Top level functions
def parse_exio12_ext(
    ext_file,
    index_col,
    name,
    drop_compartment=True,
    version=None,
    year=None,
    iosystem=None,
    sep=",",
):
    """Parse an EXIOBASE version 1 or 2 like extension file into pymrio.Extension

    EXIOBASE-like extension files are assumed to have two
    rows which are used as column multiindex (region and sector)
    and up to three columns for the row index (see Parameters).

    For EXIOBASE 3, extensions can be loaded directly with pymrio.load

    Notes
    -----
    So far this only parses factor of production extensions F (not
    final demand extensions F_Y nor coefficients S).

    Parameters
    ----------

    ext_file : string or pathlib.Path
        File to parse

    index_col : int
        The number of columns (1 to 3) at the beginning of the file
        to use as the index. The order of the index_col must be
        - 1 index column: ['stressor']
        - 2 index columns: ['stressor', 'unit']
        - 3 index columns: ['stressor', 'compartment', 'unit']
        - > 3: everything beyond the first three index columns is dropped

    name : string
        Name of the extension

    drop_compartment : boolean, optional
        If True (default) removes the compartment from the index.

    version : string, optional
        see pymrio.Extension

    iosystem : string, optional
        see pymrio.Extension

    year : string or int
        see pymrio.Extension

    sep : string, optional
        Delimiter to use; default ','

    Returns
    -------
    pymrio.Extension
        with F (and unit if available)

    """

    ext_file = os.path.abspath(str(ext_file))

    F = pd.read_csv(ext_file, header=[0, 1], index_col=list(range(index_col)), sep=sep)

    F.columns.names = ["region", "sector"]

    if index_col == 1:
        F.index.names = ["stressor"]

    elif index_col == 2:
        F.index.names = ["stressor", "unit"]

    elif index_col == 3:
        F.index.names = ["stressor", "compartment", "unit"]

    else:
        F.reset_index(level=list(range(3, index_col)), drop=True, inplace=True)
        F.index.names = ["stressor", "compartment", "unit"]

    unit = None
    if index_col > 1:
        unit = pd.DataFrame(F.iloc[:, 0].reset_index(level="unit").unit)
        F.reset_index(level="unit", drop=True, inplace=True)

    if drop_compartment:
        try:
            F.reset_index(level="compartment", drop=True, inplace=True)
            unit.reset_index(level="compartment", drop=True, inplace=True)
        except KeyError:
            # In case compartment was not part of the index to begin with
            pass

    return Extension(
        name=name,
        F=F,
        unit=unit,
        iosystem=iosystem,
        version=version,
        year=year,
    )


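# A minimal usage sketch for parse_exio12_ext. The file name and column
# layout below are hypothetical (an EXIOBASE 1/2 style extension file with
# stressor/compartment/unit index columns), not taken from a specific release:
#
# emissions = parse_exio12_ext(
#     "mrEmissions_version2.2.2.txt",
#     index_col=3,
#     name="emissions",
#     drop_compartment=False,
# )
# emissions.F.head()
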
def get_exiobase12_version(filename):
    """Returns the EXIOBASE version for the given filename,
    None if not found
    """
    try:
        ver_match = re.search(r"(\d+\w*(\.|\-|\_))*\d+\w*", filename)
        version = ver_match.string[ver_match.start() : ver_match.end()]
        if re.search(r"\_\d\d\d\d", version[-5:]):
            version = version[:-5]
    except AttributeError:
        version = None

    return version


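# Illustrative behaviour (file names are made up):
#
# get_exiobase12_version("mrIot_version2.2.2.txt")  # -> "2.2.2"
# get_exiobase12_version("mrIot.txt")               # -> None (no version found)
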
def get_exiobase_files(path, coefficients=True):
1✔
185
    """Gets the EXIOBASE files in path (which can be a zip file)
186

187
    Parameters
188
    ----------
189
    path: str or pathlib.Path
190
        Path to exiobase files or zip file
191
    coefficients: boolean, optional
192
        If True (default), considers the mrIot file as A matrix,
193
        and the extensions as S matrices. Otherwise as Z and F, respectively
194

195
    Returns
196
    -------
197
    dict of dict
198
    """
199
    path = os.path.normpath(str(path))
1✔
200
    if coefficients:
1✔
201
        exio_core_regex = dict(
1✔
202
            # don’t match file if starting with _
203
            A=re.compile(r"(?<!\_)mrIot.*txt"),
204
            Y=re.compile(r"(?<!\_)mrFinalDemand.*txt"),
205
            S_factor_inputs=re.compile(r"(?<!\_)mrFactorInputs.*txt"),
206
            S_emissions=re.compile(r"(?<!\_)mrEmissions.*txt"),
207
            S_materials=re.compile(r"(?<!\_)mrMaterials.*txt"),
208
            S_resources=re.compile(r"(?<!\_)mrResources.*txt"),
209
            F_Y_resources=re.compile(r"(?<!\_)mrFDResources.*txt"),
210
            F_Y_emissions=re.compile(r"(?<!\_)mrFDEmissions.*txt"),
211
            F_Y_materials=re.compile(r"(?<!\_)mrFDMaterials.*txt"),
212
        )
213
    else:
214
        exio_core_regex = dict(
×
215
            # don’t match file if starting with _
216
            Z=re.compile(r"(?<!\_)mrIot.*txt"),
217
            Y=re.compile(r"(?<!\_)mrFinalDemand.*txt"),
218
            F_factor_inputs=re.compile(r"(?<!\_)mrFactorInputs.*txt"),
219
            F_emissions=re.compile(r"(?<!\_)mrEmissions.*txt"),
220
            F_materials=re.compile(r"(?<!\_)mrMaterials.*txt"),
221
            F_resources=re.compile(r"(?<!\_)mrResources.*txt"),
222
            F_Y_emissions=re.compile(r"(?<!\_)mrFDEmissions.*txt"),
223
            F_Y_materials=re.compile(r"(?<!\_)mrFDMaterials.*txt"),
224
        )
225

226
    repo_content = get_repo_content(path)
1✔
227

228
    exio_files = dict()
1✔
229
    for kk, vv in exio_core_regex.items():
1✔
230
        found_file = [
1✔
231
            vv.search(ff).string for ff in repo_content.filelist if vv.search(ff)
232
        ]
233
        if len(found_file) > 1:
1✔
234
            logging.warning(
×
235
                "Multiple files found for {}: {}"
236
                " - USING THE FIRST ONE".format(kk, found_file)
237
            )
238
            found_file = found_file[0:1]
×
239
        elif len(found_file) == 0:
1✔
240
            continue
1✔
241
        else:
242
            logging.debug(f"Process file {found_file[0]}")
1✔
243
            if repo_content.iszip:
1✔
244
                format_para = sniff_csv_format(found_file[0], zip_file=path)
1✔
245
            else:
246
                format_para = sniff_csv_format(os.path.join(path, found_file[0]))
1✔
247
            exio_files[kk] = dict(
1✔
248
                root_repo=path,
249
                file_path=found_file[0],
250
                version=get_exiobase12_version(os.path.basename(found_file[0])),
251
                index_rows=format_para["nr_header_row"],
252
                index_col=format_para["nr_index_col"],
253
                unit_col=format_para["nr_index_col"] - 1,
254
                sep=format_para["sep"],
255
            )
256

257
    return exio_files
1✔
258

259

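# Sketch of the returned structure; the path and the sniffed format values
# are illustrative, not taken from a specific archive:
#
# get_exiobase_files("/data/exio2.zip")
# -> {"A": {"root_repo": "/data/exio2.zip",
#           "file_path": "mrIot_version2.2.2.txt",
#           "version": "2.2.2",
#           "index_rows": 2,
#           "index_col": 3,
#           "unit_col": 2,
#           "sep": "\t"},
#     "Y": {...},
#     ...}
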
def generic_exiobase12_parser(exio_files, system=None):
    """Generic EXIOBASE version 1 and 2 parser

    This is used internally by the parse_exiobase1 / 2 functions to
    parse exiobase files. In most cases, these top-level functions
    should just work, but in case of archived exiobase versions
    it might be necessary to use this low-level function directly.

    Parameters
    ----------

    exio_files: dict of dict

    system: str (pxp or ixi)
        Only used for the metadata

    """

    version = " & ".join(
        {dd.get("version", "") for dd in exio_files.values() if dd.get("version", "")}
    )

    meta_rec = MRIOMetaData(system=system, name="EXIOBASE", version=version)

    if len(version) == 0:
        meta_rec.note("No version information found, assuming exiobase 1")
        meta_rec.change_meta("version", 1)
        version = "1"

    core_components = ["A", "Y", "Z"]

    core_data = dict()
    ext_data = dict()
    for tt, tpara in exio_files.items():
        full_file_path = os.path.join(tpara["root_repo"], tpara["file_path"])
        logging.debug("Parse {}".format(full_file_path))
        if tpara["root_repo"][-3:] == "zip":
            with zipfile.ZipFile(tpara["root_repo"], "r") as zz:
                raw_data = pd.read_csv(
                    zz.open(tpara["file_path"]),
                    index_col=list(range(tpara["index_col"])),
                    header=list(range(tpara["index_rows"])),
                    sep="\t",
                )
        else:
            raw_data = pd.read_csv(
                full_file_path,
                index_col=list(range(tpara["index_col"])),
                header=list(range(tpara["index_rows"])),
                sep="\t",
            )

        meta_rec._add_fileio(
            "EXIOBASE data {} parsed from {}".format(tt, full_file_path)
        )
        if tt in core_components:
            core_data[tt] = raw_data
        else:
            ext_data[tt] = raw_data

    for table in core_data:
        core_data[table].index.names = ["region", "sector", "unit"]
        if table == "A" or table == "Z":
            core_data[table].columns.names = ["region", "sector"]
            _unit = (
                pd.DataFrame(core_data[table].iloc[:, 0]).reset_index(level="unit").unit
            )
            _unit = pd.DataFrame(_unit)
            _unit.columns = ["unit"]
        if table == "Y":
            core_data[table].columns.names = ["region", "category"]
        core_data[table].reset_index(level="unit", drop=True, inplace=True)

    core_data["unit"] = _unit

    mon_unit = core_data["unit"].iloc[0, 0]
    if "/" in mon_unit:
        mon_unit = mon_unit.split("/")[0]
        core_data["unit"].unit = mon_unit

    extensions = dict()
    for tt, tpara in exio_files.items():
        if tt in core_components:
            continue

        # The following depends on the format (upper/lower case) of the
        # dict keys returned by get_exiobase_files
        ext_name = "_".join(re.findall(r"[a-z]+", tt))
        table_type = re.match(r"[A-Z_]+", tt)[0].rstrip("_")

        if tpara["index_col"] == 3:
            ext_data[tt].index.names = ["stressor", "compartment", "unit"]
        elif tpara["index_col"] == 2:
            ext_data[tt].index.names = ["stressor", "unit"]
        else:
            raise ParserError("Unknown EXIOBASE file structure")

        if table_type == "F_Y":
            ext_data[tt].columns.names = ["region", "category"]
        else:
            ext_data[tt].columns.names = ["region", "sector"]
        try:
            _unit = pd.DataFrame(ext_data[tt].iloc[:, 0]).reset_index(level="unit").unit
        except IndexError:
            _unit = pd.DataFrame(ext_data[tt].iloc[:, 0])
            _unit.columns = ["unit"]
            _unit["unit"] = "undef"
            _unit.reset_index(level="unit", drop=True, inplace=True)
            _unit = pd.DataFrame(_unit)
            _unit.columns = ["unit"]

        _unit = pd.DataFrame(_unit)
        _unit.columns = ["unit"]
        _new_unit = _unit.unit.str.replace("/" + mon_unit, "", regex=True)
        _new_unit[_new_unit == ""] = _unit.unit[_new_unit == ""].str.replace(
            "/", "", regex=True
        )
        _unit.unit = _new_unit

        ext_data[tt].reset_index(level="unit", drop=True, inplace=True)
        ext_dict = extensions.get(ext_name, dict())
        ext_dict.update({table_type: ext_data[tt], "unit": _unit, "name": ext_name})
        extensions.update({ext_name: ext_dict})

    if version[0] == "1":
        year = 2000
    elif version[0] == "2":
        # EXIOBASE 2 describes the year 2007
        year = 2007
    elif version[0] == "3":
        raise ParserError("This function can not be used to parse EXIOBASE 3")
    else:
        logging.warning("Unknown EXIOBASE version")
        year = None

    return IOSystem(
        version=version,
        price="current",
        year=year,
        meta=meta_rec,
        **dict(core_data, **extensions),
    )


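# Typical low-level use when the top-level parsers do not fit an archived
# EXIOBASE layout (path is hypothetical):
#
# exio_files = get_exiobase_files("/data/pxp_ita_44_regions_coeff_txt.zip")
# io = generic_exiobase12_parser(exio_files, system="pxp")
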
def _get_MRIO_system(path):
    """Extract system information (ixi, pxp) from file path.

    Returns 'ixi' or 'pxp', None if undetermined
    """
    ispxp = True if re.search("pxp", path, flags=re.IGNORECASE) else False
    isixi = True if re.search("ixi", path, flags=re.IGNORECASE) else False

    if ispxp == isixi:
        system = None
    else:
        system = "pxp" if ispxp else "ixi"
    return system


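# Illustrative behaviour (paths are made up):
#
# _get_MRIO_system("/data/pxp_ita_44_regions_coeff_txt")  # -> "pxp"
# _get_MRIO_system("/data/some_mixed_pxp_ixi_archive")    # -> None (ambiguous)
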
def parse_exiobase1(path):
    """Parse the exiobase1 raw data files.

    This function works with

    - pxp_ita_44_regions_coeff_txt
    - ixi_fpa_44_regions_coeff_txt
    - pxp_ita_44_regions_coeff_src_txt
    - ixi_fpa_44_regions_coeff_src_txt

    which can be found on www.exiobase.eu

    The parser works with the compressed (zip) files as well as the unpacked
    files.

    Parameters
    ----------
    path : pathlib.Path or string
        Path of the exiobase 1 data

    Returns
    -------
    pymrio.IOSystem with exio1 data

    """
    path = os.path.abspath(os.path.normpath(str(path)))

    exio_files = get_exiobase_files(path)
    if len(exio_files) == 0:
        raise ParserError("No EXIOBASE files found at {}".format(path))

    system = _get_MRIO_system(path)
    if not system:
        logging.warning(
            "Could not determine system (pxp or ixi) - set system parameter manually"
        )

    io = generic_exiobase12_parser(exio_files, system=system)
    return io


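# Hypothetical usage, assuming the zip archive was downloaded from
# www.exiobase.eu:
#
# io = parse_exiobase1("/data/pxp_ita_44_regions_coeff_txt.zip")
# io.calc_all()
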
def parse_exiobase2(path, charact=True, popvector="exio2"):
    """Parse the exiobase 2.2.2 source files for the IOSystem

    The function parses the product by product and industry by industry
    source files in the coefficient form (A and S).

    Filenames are hardcoded in the parser - for any other files the code has
    to be adapted. Check the git history to find older versions.

    Parameters
    ----------
    path : string or pathlib.Path
        Path to the EXIOBASE source files
    charact : string or boolean, optional
        Filename with path to the characterisation matrices for the extensions
        (xls). This is provided together with the EXIOBASE system and given as
        a xls file. The four sheets Q_factorinputs, Q_emission, Q_materials
        and Q_resources are read and used to generate one new extension with
        the impacts.
        If set to True, the characterisation file found in path is used (
        can be in the zip or extracted). If a string, it is assumed that
        it points to a valid characterisation file. If False or None, no
        characterisation file will be used.
    popvector : string or pd.DataFrame, optional
        The population vector for the countries.  This can be given as
        pd.DataFrame(index = population, columns = countrynames) or, (default)
        will be taken from the pymrio module. If popvector = None no population
        data will be passed to the IOSystem.

    Returns
    -------
    IOSystem
        An IOSystem with the parsed exiobase 2 data

    Raises
    ------
    ParserError
        If the exiobase source files are not complete in the given path

    """
    path = os.path.abspath(os.path.normpath(str(path)))

    exio_files = get_exiobase_files(path)
    if len(exio_files) == 0:
        raise ParserError("No EXIOBASE files found at {}".format(path))

    system = _get_MRIO_system(path)
    if not system:
        logging.warning(
            "Could not determine system (pxp or ixi) - set system parameter manually"
        )

    io = generic_exiobase12_parser(exio_files, system=system)

    # read the characterisation matrices if available
    # and build one extension with the impacts
    if charact:
        logging.debug("Parse characterisation matrix")
        # dict with correspondence to the extensions
        Qsheets = {
            "Q_factorinputs": "factor_inputs",
            "Q_emission": "emissions",
            "Q_materials": "materials",
            "Q_resources": "resources",
        }

        Q_head_col = dict()
        Q_head_row = dict()
        Q_head_col_rowname = dict()
        Q_head_col_rowunit = dict()
        # Q_head_col_metadata = dict()
        # number of cols containing row headers at the beginning
        Q_head_col["Q_emission"] = 4
        # number of rows containing col headers at the top - this will be
        # skipped
        Q_head_row["Q_emission"] = 3
        # assuming the same classification as in the extensions
        Q_head_col["Q_factorinputs"] = 2
        Q_head_row["Q_factorinputs"] = 2
        Q_head_col["Q_resources"] = 2
        Q_head_row["Q_resources"] = 3
        Q_head_col["Q_materials"] = 2
        Q_head_row["Q_materials"] = 2

        #  column to use as name for the rows
        Q_head_col_rowname["Q_emission"] = 1
        Q_head_col_rowname["Q_factorinputs"] = 0
        Q_head_col_rowname["Q_resources"] = 0
        Q_head_col_rowname["Q_materials"] = 0

        # column to use as unit for the rows which gives also the last column
        # before the data
        Q_head_col_rowunit["Q_emission"] = 3
        Q_head_col_rowunit["Q_factorinputs"] = 1
        Q_head_col_rowunit["Q_resources"] = 1
        Q_head_col_rowunit["Q_materials"] = 1

        if isinstance(charact, str):
            charac_data = {
                Qname: pd.read_excel(
                    charact,
                    sheet_name=Qname,
                    skiprows=list(range(0, Q_head_row[Qname])),
                    header=None,
                )
                for Qname in Qsheets
            }
        else:
            _content = get_repo_content(path)
            charac_regex = re.compile(r"(?<!\_)(?<!\.)characterisation.*xlsx")
            charac_files = [
                ff for ff in _content.filelist if re.search(charac_regex, ff)
            ]
            if len(charac_files) > 1:
                raise ParserError(
                    "Found multiple characterisation files "
                    "in {} - specify one: {}".format(path, charac_files)
                )
            elif len(charac_files) == 0:
                raise ParserError(
                    "No characterisation file found " "in {}".format(path)
                )
            else:
                if _content.iszip:
                    with zipfile.ZipFile(path, "r") as zz:
                        charac_data = {
                            Qname: pd.read_excel(
                                zz.open(charac_files[0]),
                                sheet_name=Qname,
                                skiprows=list(range(0, Q_head_row[Qname])),
                                header=None,
                            )
                            for Qname in Qsheets
                        }

                else:
                    charac_data = {
                        Qname: pd.read_excel(
                            os.path.join(path, charac_files[0]),
                            sheet_name=Qname,
                            skiprows=list(range(0, Q_head_row[Qname])),
                            header=None,
                        )
                        for Qname in Qsheets
                    }

        _unit = dict()
        # temp for the calculated impacts which then
        # get summarized in the 'impact' extension
        _impact = dict()
        impact = dict()
        for Qname in Qsheets:
            # unfortunately the names in Q_emission are
            # not completely unique - fix that
            if Qname == "Q_emission":
                _index = charac_data[Qname][Q_head_col_rowname[Qname]].copy()
                _index.iloc[42] = _index.iloc[42] + " 2008"
                _index.iloc[43] = _index.iloc[43] + " 2008"
                _index.iloc[44] = _index.iloc[44] + " 2010"
                _index.iloc[45] = _index.iloc[45] + " 2010"
                charac_data[Qname][Q_head_col_rowname[Qname]] = _index

            charac_data[Qname].index = charac_data[Qname][Q_head_col_rowname[Qname]]

            _unit[Qname] = pd.DataFrame(
                charac_data[Qname].iloc[:, Q_head_col_rowunit[Qname]]
            )
            _unit[Qname].columns = ["unit"]
            _unit[Qname].index.name = "impact"
            charac_data[Qname] = charac_data[Qname].iloc[
                :, Q_head_col_rowunit[Qname] + 1 :
            ]
            charac_data[Qname].index.name = "impact"

            try:
                _F_Y = io.__dict__[Qsheets[Qname]].F_Y.values
            except AttributeError:
                _F_Y = np.zeros([io.__dict__[Qsheets[Qname]].S.shape[0], io.Y.shape[1]])

            _impact[Qname] = {
                "S": charac_data[Qname].dot(io.__dict__[Qsheets[Qname]].S.values),
                "F_Y": charac_data[Qname].dot(_F_Y),
                "unit": _unit[Qname],
            }

        impact["S"] = pd.concat(
            [
                _impact["Q_factorinputs"]["S"],
                _impact["Q_emission"]["S"],
                _impact["Q_materials"]["S"],
                _impact["Q_resources"]["S"],
            ]
        )
        impact["F_Y"] = pd.concat(
            [
                _impact["Q_factorinputs"]["F_Y"],
                _impact["Q_emission"]["F_Y"],
                _impact["Q_materials"]["F_Y"],
                _impact["Q_resources"]["F_Y"],
            ]
        )
        impact["S"].columns = io.emissions.S.columns
        impact["F_Y"].columns = io.emissions.F_Y.columns
        impact["unit"] = pd.concat(
            [
                _impact["Q_factorinputs"]["unit"],
                _impact["Q_emission"]["unit"],
                _impact["Q_materials"]["unit"],
                _impact["Q_resources"]["unit"],
            ]
        )
        impact["name"] = "impact"
        io.impact = Extension(**impact)

    if popvector == "exio2":
        logging.debug("Read population vector")
        io.population = pd.read_csv(
            os.path.join(PYMRIO_PATH["exio2"], "misc", "population.txt"),
            index_col=0,
            sep="\t",
        ).astype(float)
    else:
        io.population = popvector

    return io


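# Hypothetical usage, assuming an EXIOBASE 2.2.2 pxp archive (including the
# characterisation xlsx) under /data/exio2; the archive name is illustrative:
#
# io = parse_exiobase2(
#     "/data/exio2/mrIOT_PxP_ita_coefficient_version2.2.2.zip",
#     charact=True,
#     popvector="exio2",
# )
# io.impact.unit  # units of the characterised impacts
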
def parse_exiobase3(path):
    """Parses the public EXIOBASE 3 system

    This parser works with either the compressed zip
    archive as downloaded or the extracted system.

    Note
    ----
    The exiobase 3 parser does not yet include
    population and characterization data.

    Parameters
    ----------

    path : string or pathlib.Path
        Path to the folder with the EXIOBASE files
        or the compressed archive.

    Returns
    -------
    IOSystem
        An IOSystem with the parsed exiobase 3 data

    """
    io = load_all(path)
    # need to rename the final demand satellite,
    # wrong name in the standard distribution
    try:
        io.satellite.F_Y = io.satellite.F_hh.copy()
        del io.satellite.F_hh
    except AttributeError:
        pass

    # some ixi in the exiobase 3.4 official distribution
    # have a country name mixup. Clean it here:
    io.rename_regions(
        {
            "AUS": "AU",
            "AUT": "AT",
            "BEL": "BE",
            "BGR": "BG",
            "BRA": "BR",
            "CAN": "CA",
            "CHE": "CH",
            "CHN": "CN",
            "CYP": "CY",
            "CZE": "CZ",
            "DEU": "DE",
            "DNK": "DK",
            "ESP": "ES",
            "EST": "EE",
            "FIN": "FI",
            "FRA": "FR",
            "GBR": "GB",
            "GRC": "GR",
            "HRV": "HR",
            "HUN": "HU",
            "IDN": "ID",
            "IND": "IN",
            "IRL": "IE",
            "ITA": "IT",
            "JPN": "JP",
            "KOR": "KR",
            "LTU": "LT",
            "LUX": "LU",
            "LVA": "LV",
            "MEX": "MX",
            "MLT": "MT",
            "NLD": "NL",
            "NOR": "NO",
            "POL": "PL",
            "PRT": "PT",
            "ROM": "RO",
            "RUS": "RU",
            "SVK": "SK",
            "SVN": "SI",
            "SWE": "SE",
            "TUR": "TR",
            "TWN": "TW",
            "USA": "US",
            "ZAF": "ZA",
            "WWA": "WA",
            "WWE": "WE",
            "WWF": "WF",
            "WWL": "WL",
            "WWM": "WM",
        }
    )

    return io


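# Hypothetical usage on a downloaded EXIOBASE 3 archive (file name
# illustrative):
#
# io = parse_exiobase3("/data/IOT_2011_pxp.zip")
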
def parse_wiod(path, year=None, names=("isic", "c_codes"), popvector=None):
    """Parse the wiod source files for the IOSystem

    WIOD provides the MRIO tables in Excel format (xlsx) at
    http://www.wiod.org/new_site/database/wiots.htm (release November 2013).
    To use WIOD in pymrio these (for the year of analysis) must be downloaded.
    The interindustry matrix of these files gets parsed into IOSystem.Z; the
    additional information is included as a factor_input extension (value
    added, ...)

    The folder with these xlsx must then be passed to the WIOD parsing
    function. This folder may contain folders with the extension data. Every
    folder within the wiod root folder will be parsed for extension data and
    will be added to the IOSystem. The WIOD database offers the download of
    the environmental extensions as zip files. These can be read directly by
    the parser. In case a zip file and a folder with the same name are
    available, the data is read from the folder. If the zip files are
    extracted into folders, the folders must have the same name as the
    corresponding zip file (without the 'zip' extension).

    If a WIOD SEA file is present (at the root of path or in a folder named
    'SEA' - only one file!), the labor data of this file gets included in the
    factor_input extension (calculated for the three skill levels
    available). The monetary data in this file is not added because it is only
    given in national currency.

    Since the "World Input-Output Tables in previous years' prices" are still
    under construction (20141129), no parser for these is provided.

    Some of the meta-parameters of the IOSystem are set automatically based on
    the values given in the first four cells and the name of the WIOD data
    files (base year, version, price, iosystem).
    These can be overwritten afterwards if needed.

    Parameters
    ----------
    path : string or pathlib.Path
        Path to the folder with the WIOD source files. If the path
        to a specific file is given, only this file will be parsed,
        irrespective of the value given in year.
    year : int or str
        Which year in the path should be parsed. The years can be given with
        four or two digits (e.g. 2012 or 12). If the given path contains a
        specific file, the value of year will not be used (but inferred from
        the meta data) - otherwise it must be given. For the monetary data the
        parser searches for files with 'wiot - two digit year'.
    names : string or tuple, optional
        WIOD provides three different naming schemes for the sector/final
        demand categories. These can be specified for the IOSystem. Pass:

            1) 'isic': ISIC rev 3 Codes - available for interindustry flows
               and final demand rows.
            2) 'full': Full names - available for final demand rows and
               final demand columns (categories) and interindustry flows.
            3) 'c_codes' : WIOD specific sector numbers, available for final
               demand rows and columns (categories) and interindustry flows.

        Internally, the parser relies on 1) for the interindustry flows and 3)
        for the final demand categories. This is the default and will also be
        used if just 'isic' gets passed ('c_codes' also replaces 'isic' if this
        was passed for final demand categories). To specify different final
        consumption category names, pass a tuple with (sectors/interindustry
        classification, fd categories), e.g. ('isic', 'full'). Names are case
        insensitive and passing the first character is sufficient.
    TODO popvector : TO BE IMPLEMENTED (consistent with EXIOBASE)

    Returns
    -------
    IOSystem

    Raises
    ------
    ParserError
        If the WIOD source files are not complete or inconsistent

    """

    # Path manipulation, should work cross platform
    path = os.path.abspath(os.path.normpath(str(path)))

    # wiot start and end
    wiot_ext = ".xlsx"
    wiot_start = "wiot"

    # determine which wiod file to parse
    if not os.path.isdir(path):
        # 1. case - one file specified in path
        if os.path.isfile(path):
            wiot_file = path
        else:
            # just in case the ending was forgotten
            wiot_file = path + wiot_ext
    else:
        # 2. case: directory given - build wiot_file with the value in year
        if not year:
            raise ParserError(
                "No year specified "
                "(either specify a specific file "
                "or a path and year)"
            )
        year_two_digit = str(year)[-2:]
        wiot_file_list = [
            fl
            for fl in os.listdir(path)
            if (
                fl[:6] == wiot_start + year_two_digit
                and os.path.splitext(fl)[1] == wiot_ext
            )
        ]
        if len(wiot_file_list) != 1:
            raise ParserError(
                "Multiple files for a given year or file not "
                "found (specify a specific file in parameters)"
            )

        wiot_file = os.path.join(path, wiot_file_list[0])

    root_path = os.path.split(wiot_file)[0]
    if not os.path.exists(wiot_file):
        raise ParserError("WIOD file not found in the specified folder.")

    meta_rec = MRIOMetaData(location=root_path)

    # wiot file structure
    wiot_meta = {
        "col": 0,  # column of the meta information
        "year": 0,  # rest: rows with the data
        "iosystem": 2,
        "unit": 3,
        "end_row": 4,
    }
    wiot_header = {
        # the header indexes are the same for rows after removing the first
        # two lines (wiot_empty_top_rows)
        "code": 0,
        "sector_names": 1,
        "region": 2,
        "c_code": 3,
    }
    wiot_empty_top_rows = [0, 1]

    wiot_marks = {  # special marks
        "last_interindsec": "c35",  # last sector of the interindustry
        "tot_facinp": ["r60", "r69"],  # useless totals to remove from factinp
        "total_column": [-1],  # the total column in the whole data
    }

    wiot_sheet = 0  # assume the first one is the one with the data.

    # Wiod has an unfortunate file structure with overlapping metadata and
    # header. In order to deal with that first the full file is read.
    wiot_data = pd.read_excel(wiot_file, sheet_name=wiot_sheet, header=None)

    meta_rec._add_fileio("WIOD data parsed from {}".format(wiot_file))
    # get meta data
    wiot_year = wiot_data.iloc[wiot_meta["year"], wiot_meta["col"]][-4:]
    wiot_iosystem = (
        wiot_data.iloc[wiot_meta["iosystem"], wiot_meta["col"]].rstrip(")").lstrip("(")
    )
    meta_rec.change_meta("system", wiot_iosystem)
    _wiot_unit = (
        wiot_data.iloc[wiot_meta["unit"], wiot_meta["col"]].rstrip(")").lstrip("(")
    )

    # remove meta data, empty rows, total column
    wiot_data.iloc[0 : wiot_meta["end_row"], wiot_meta["col"]] = np.NaN
    wiot_data.drop(wiot_empty_top_rows, axis=0, inplace=True)
    wiot_data.drop(wiot_data.columns[wiot_marks["total_column"]], axis=1, inplace=True)
    # at this stage row and column header should have the same size but
    # the index starts now at two - replace/reset to row numbers
    wiot_data.index = range(wiot_data.shape[0])

    # Early years in WIOD tables have a different name for Romania:
    # 'ROM' which should be 'ROU'. The latter is also consistent with
    # the environmental extensions names.
    wiot_data.iloc[wiot_header["region"], :] = wiot_data.iloc[
        wiot_header["region"], :
    ].str.replace("ROM", "ROU", regex=False)
    wiot_data.iloc[:, wiot_header["region"]] = wiot_data.iloc[
        :, wiot_header["region"]
    ].str.replace("ROM", "ROU", regex=False)

    # get the end of the interindustry matrix
    _lastZcol = wiot_data[
        wiot_data.iloc[:, wiot_header["c_code"]] == wiot_marks["last_interindsec"]
    ].index[-1]
    _lastZrow = wiot_data[
        wiot_data[wiot_header["c_code"]] == wiot_marks["last_interindsec"]
    ].index[-1]

    if _lastZcol != _lastZrow:
        raise ParserError("Interindustry matrix not symmetric in the WIOD source file")
    else:
        Zshape = (_lastZrow, _lastZcol)

    # separate factor input extension and remove
    # totals in the first and last row
    facinp = wiot_data.iloc[Zshape[0] + 1 :, :]
    facinp = facinp.drop(
        facinp[facinp[wiot_header["c_code"]].isin(wiot_marks["tot_facinp"])].index,
        axis=0,
    )

    Z = wiot_data.iloc[: Zshape[0] + 1, : Zshape[1] + 1].copy()
    Y = wiot_data.iloc[: Zshape[0] + 1, Zshape[1] + 1 :].copy()
    F_fac = facinp.iloc[:, : Zshape[1] + 1].copy()
    F_Y_fac = facinp.iloc[:, Zshape[1] + 1 :].copy()

    index_wiot_headers = [nr for nr in wiot_header.values()]
    # Save lookup of sectors and codes - to be used at the end of the parser
    # Assuming USA is present in every WIOT year
    wiot_sector_lookup = (
        wiot_data[wiot_data[wiot_header["region"]] == "USA"]
        .iloc[:, 0 : max(index_wiot_headers) + 1]
        .applymap(str)
    )
    wiot_sector_lookup.columns = [
        entry[1] for entry in sorted(zip(wiot_header.values(), wiot_header.keys()))
    ]
    wiot_sector_lookup.set_index("code", inplace=True, drop=False)
    _Y = Y.T.iloc[
        :,
        [
            wiot_header["code"],  # Included to be consistent with wiot_header
            wiot_header["sector_names"],
            wiot_header["region"],
            wiot_header["c_code"],
        ],
    ]
    wiot_fd_lookup = _Y[_Y.iloc[:, wiot_header["region"]] == "USA"].applymap(str)
    wiot_fd_lookup.columns = [
        entry[1] for entry in sorted(zip(wiot_header.values(), wiot_header.keys()))
    ]
    wiot_fd_lookup.set_index("c_code", inplace=True, drop=False)
    wiot_fd_lookup.index.name = "code"

    # set the index/columns, work with code b/c these are also used in the
    # extensions
    Z[wiot_header["code"]] = Z[wiot_header["code"]].astype(str)
    Z.set_index([wiot_header["region"], wiot_header["code"]], inplace=True, drop=False)
    Z = Z.iloc[max(index_wiot_headers) + 1 :, max(index_wiot_headers) + 1 :]
    Z.index.names = IDX_NAMES["Z_col"]
    Z.columns = Z.index

    indexY_col_head = Y.iloc[[wiot_header["region"], wiot_header["c_code"]], :]
    Y.columns = pd.MultiIndex.from_arrays(
        indexY_col_head.values, names=IDX_NAMES["Y_col2"]
    )
    Y = Y.iloc[max(index_wiot_headers) + 1 :, :]
    Y.index = Z.index

    F_fac.set_index(
        [wiot_header["sector_names"]], inplace=True, drop=False
    )  # c_code missing, use names
    F_fac.index.names = ["inputtype"]
    F_fac = F_fac.iloc[:, max(index_wiot_headers) + 1 :]
    F_fac.columns = Z.columns
    F_Y_fac.columns = Y.columns
    F_Y_fac.index = F_fac.index

    # convert from object to float (was object because mixed float,str)
    Z = Z.astype("float")
    Y = Y.astype("float")
    F_fac = F_fac.astype("float")
    F_Y_fac = F_Y_fac.astype("float")

    # save the units
    Z_unit = pd.DataFrame(Z.iloc[:, 0])
    Z_unit.columns = ["unit"]
    Z_unit["unit"] = _wiot_unit

    F_fac_unit = pd.DataFrame(F_fac.iloc[:, 0])
    F_fac_unit.columns = ["unit"]
    F_fac_unit["unit"] = _wiot_unit

    ll_countries = list(Z.index.get_level_values("region").unique())

    # Finalize the factor inputs extension
    ext = dict()

    ext["factor_inputs"] = {
        "F": F_fac,
        "F_Y": F_Y_fac,
        "year": wiot_year,
        "iosystem": wiot_iosystem,
        "unit": F_fac_unit,
        "name": "factor input",
    }

    # SEA extension
    _F_sea_data, _F_sea_unit = __get_WIOD_SEA_extension(
        root_path=root_path, year=wiot_year
    )
    if _F_sea_data is not None:
        # None if no SEA file present
        _F_Y_sea = pd.DataFrame(
            index=_F_sea_data.index, columns=F_Y_fac.columns, data=0
        )
        _F_Y_sea = _F_Y_sea.astype("float")

        ext["SEA"] = {
            "F": _F_sea_data,
            "F_Y": _F_Y_sea,
            "year": wiot_year,
            "iosystem": wiot_iosystem,
            "unit": _F_sea_unit,
            "name": "SEA",
        }
        meta_rec._add_fileio("SEA file extension parsed from {}".format(root_path))

    # Environmental extensions, names follow the name given
    # in the meta sheet (except for CO2 to get a better description).
    # Units are hardcoded if no consistent place to read them
    # within the files (for all extensions in upper case).
    # The units names must exactly match!
    # Start must identify exactly one folder or a zip file to
    # read the extension.
    # Within the folder, the routine looks for xls files
    # starting with the country code.
    dl_envext_para = {
        "AIR": {
            "name": "Air Emission Accounts",
            "start": "AIR_",
            "ext": ".xls",
            "unit": {
                "CO2": "Gg",
                "CH4": "t",
                "N2O": "t",
                "NOx": "t",
                "SOx": "t",
                "CO": "t",
                "NMVOC": "t",
                "NH3": "t",
            },
        },
        "CO2": {
            "name": "CO2 emissions - per source",
            "start": "CO2_",
            "ext": ".xls",
            "unit": {"all": "Gg"},
        },
        "EM": {
            "name": "Emission relevant energy use",
            "start": "EM_",
            "ext": ".xls",
            "unit": {"all": "TJ"},
        },
        "EU": {
            "name": "Gross energy use",
            "start": "EU_",
            "ext": ".xls",
            "unit": {"all": "TJ"},
        },
        "lan": {
            "name": "land use",
            "start": "lan_",
            "ext": ".xls",
            "unit": {"all": None},
        },
        "mat": {
            "name": "material use",
            "start": "mat_",
            "ext": ".xls",
            "unit": {"all": None},
        },
        "wat": {
            "name": "water use",
            "start": "wat_",
            "ext": ".xls",
            "unit": {"all": None},
        },
    }

    _F_Y_template = pd.DataFrame(columns=F_Y_fac.columns)
    _ss_F_Y_pressure_column = "c37"
    for ik_ext in dl_envext_para:
        _dl_ex = __get_WIOD_env_extension(
            root_path=root_path,
            year=wiot_year,
            ll_co=ll_countries,
            para=dl_envext_para[ik_ext],
        )
        if _dl_ex is not None:
            # None if extension not available
            _F_Y = _dl_ex["F_Y"]

            _F_Y.columns = pd.MultiIndex.from_product(
                [_F_Y.columns, [_ss_F_Y_pressure_column]]
            )
            _F_Y = pd.concat([_F_Y_template, _F_Y])
            _F_Y.fillna(0, inplace=True)
            _F_Y.index.names = _dl_ex["F"].index.names
            _F_Y.columns.names = _F_Y_template.columns.names
            _F_Y = _F_Y[ll_countries]
            _F_Y = _F_Y.astype("float")

            ext[ik_ext] = {
                "F": _dl_ex["F"],
                "F_Y": _F_Y,
                "year": wiot_year,
                "iosystem": wiot_iosystem,
                "unit": _dl_ex["unit"],
                "name": dl_envext_para[ik_ext]["name"],
            }
            meta_rec._add_fileio(
                "Extension {} parsed from {}".format(ik_ext, root_path)
            )

    # Build system
    wiod = IOSystem(Z=Z, Y=Y, unit=Z_unit, meta=meta_rec, **ext)

    # Replace sector/final demand category names
    if type(names) is str:
        names = (names, names)
    ll_names = [w[0].lower() for w in names]

    if ll_names[0] == "c":
        dd_sec_rename = wiot_sector_lookup.c_code.to_dict()
    elif ll_names[0] == "i":
        dd_sec_rename = wiot_sector_lookup.code.to_dict()
    elif ll_names[0] == "f":
        dd_sec_rename = wiot_sector_lookup.sector_names.to_dict()
    else:
        dd_sec_rename = wiot_sector_lookup.code.to_dict()
        warnings.warn(
            "Parameter for names not understood - used ISIC codes as sector names"
        )

    if ll_names[1] == "c":
        dd_fd_rename = wiot_fd_lookup.c_code.to_dict()
    elif ll_names[1] == "i":
        dd_fd_rename = wiot_fd_lookup.c_code.to_dict()
    elif ll_names[1] == "f":
        dd_fd_rename = wiot_fd_lookup.sector_names.to_dict()
    else:
        # fall back to c_codes so dd_fd_rename is defined below
        dd_fd_rename = wiot_fd_lookup.c_code.to_dict()
        warnings.warn(
            "Parameter for names not understood - "
            "used c_codes as final demand category names"
        )

    wiod.Z.rename(columns=dd_sec_rename, index=dd_sec_rename, inplace=True)
    wiod.Y.rename(columns=dd_fd_rename, index=dd_sec_rename, inplace=True)
    for ext in wiod.get_extensions(data=True):
        ext.F.rename(columns=dd_sec_rename, inplace=True)
        ext.F_Y.rename(columns=dd_fd_rename, inplace=True)

    return wiod


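# Hypothetical usage, assuming the November 2013 WIOT files (plus optional
# SEA and environmental extension zips) are stored under /data/wiod:
#
# wiod = parse_wiod("/data/wiod", year=2008, names=("isic", "c_codes"))
# wiod.factor_inputs.F.head()
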
def __get_WIOD_env_extension(root_path, year, ll_co, para):
    """Parses the WIOD environmental extension

    Extension can either be given as original .zip files or as extracted
    data in a folder with the same name as the corresponding zip file
    (without the extension).

    This function is based on the structure of the extensions from _may12.

    Note
    ----
    The function deletes 'secQ' which is not present in the economic tables.

    Parameters
    ----------
    root_path : string
        Path to the WIOD data or the path with the
        extension data folder or zip file.
    year : str or int
        Year to return for the extension = valid sheetname for the xls file.
    ll_co : list like
        List of countries in WIOD - used for finding and matching
        extension data in the given folder.
    para : dict
        Defining the parameters for reading the extension.

    Returns
    -------
    dict with keys
        F : pd.DataFrame with index 'stressor' and columns 'region', 'sector'
        F_Y : pd.DataFrame with index 'stressor' and column 'region'
            This data is for household stressors - must be applied to the right
            final demand column afterwards.
        unit : pd.DataFrame with index 'stressor' and column 'unit'


    """

    ll_root_content = [
        ff for ff in os.listdir(root_path) if ff.startswith(para["start"])
    ]
    if len(ll_root_content) < 1:
        warnings.warn(
            "Extension data for {} not found - "
            "Extension not included".format(para["start"]),
            ParserWarning,
        )
        return None

    elif len(ll_root_content) > 1:
        raise ParserError(
            "Several raw data for extension "
            "{} available - clean extension folder.".format(para["start"])
        )

    pf_env = os.path.join(root_path, ll_root_content[0])

    if pf_env.endswith(".zip"):
        rf_zip = zipfile.ZipFile(pf_env)
        ll_env_content = [ff for ff in rf_zip.namelist() if ff.endswith(para["ext"])]
    else:
        ll_env_content = [ff for ff in os.listdir(pf_env) if ff.endswith(para["ext"])]

    dl_env = dict()
    dl_env_hh = dict()
    for co in ll_co:
        ll_pff_read = [
            ff
            for ff in ll_env_content
            if ff.endswith(para["ext"])
            and (ff.startswith(co.upper()) or ff.startswith(co.lower()))
        ]

        if len(ll_pff_read) < 1:
            raise ParserError(
                "Country data not complete for Extension "
                "{} - missing {}.".format(para["start"], co)
            )

        elif len(ll_pff_read) > 1:
            raise ParserError(
                "Multiple country data for Extension "
                "{} - country {}.".format(para["start"], co)
            )

        pff_read = ll_pff_read[0]

        if pf_env.endswith(".zip"):
            ff_excel = pd.ExcelFile(rf_zip.open(pff_read))
        else:
            ff_excel = pd.ExcelFile(os.path.join(pf_env, pff_read))
        if str(year) in ff_excel.sheet_names:
            df_env = ff_excel.parse(sheet_name=str(year), index_col=None, header=0)
        else:
            warnings.warn(
                "Extension {} does not include "
                "data for the year {} - "
                "Extension not included".format(para["start"], year),
                ParserWarning,
            )
            return None

        if not df_env.index.is_numeric():
            # upper case letter extensions get parsed with multiindex, not
            # quite sure why...
            df_env.reset_index(inplace=True)

        # unit can be taken from the first cell in the excel sheet
        if df_env.columns[0] != "level_0":
            para["unit"]["all"] = df_env.columns[0]

        # two clean up cases - can be identified by lower/upper case extension
        # description
        if para["start"].islower():
            pass
        elif para["start"].isupper():
            df_env = df_env.iloc[:, 1:]
        else:
            raise ParserError("Format of extension not given.")

        df_env.dropna(axis=0, how="all", inplace=True)
        df_env = df_env[df_env.iloc[:, 0] != "total"]
        df_env = df_env[df_env.iloc[:, 0] != "secTOT"]
        df_env = df_env[df_env.iloc[:, 0] != "secQ"]
        df_env.iloc[:, 0] = df_env.iloc[:, 0].astype(str)
        df_env.iloc[:, 0].replace(to_replace="sec", value="", regex=True, inplace=True)

        df_env.set_index([df_env.columns[0]], inplace=True)
        df_env.index.names = ["sector"]
        df_env = df_env.T

        ikc_hh = "FC_HH"
        dl_env_hh[co] = df_env[ikc_hh]
        del df_env[ikc_hh]
        dl_env[co] = df_env

    df_F = pd.concat(dl_env, axis=1)[ll_co]
    df_F_Y = pd.concat(dl_env_hh, axis=1)[ll_co]
    df_F.fillna(0, inplace=True)
    df_F_Y.fillna(0, inplace=True)

    df_F.columns.names = IDX_NAMES["F_col"]
    df_F.index.names = IDX_NAMES["F_row_single"]

    df_F_Y.columns.names = IDX_NAMES["Y_col1"]
    df_F_Y.index.names = IDX_NAMES["F_row_single"]

    # build the unit df
    df_unit = pd.DataFrame(index=df_F.index, columns=["unit"])
    _ss_unit = para["unit"].get("all", "undef")
    for ikr in df_unit.index:
        df_unit.loc[ikr, "unit"] = para["unit"].get(ikr, _ss_unit)

    df_unit.columns.names = ["unit"]
    df_unit.index.names = ["stressor"]

    if pf_env.endswith(".zip"):
        rf_zip.close()

    return {"F": df_F, "F_Y": df_F_Y, "unit": df_unit}


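# Sketch of the helper's return value (labels illustrative):
#
# {"F":    DataFrame, 'stressor' x ('region', 'sector'),
#  "F_Y":  DataFrame, 'stressor' x 'region' (household stressors),
#  "unit": DataFrame, 'stressor' x ['unit']}
# or None if the raw data or the requested year is missing.
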
1390
def __get_WIOD_SEA_extension(root_path, year, data_sheet="DATA"):
1✔
1391
    """Utility function to get the extension data from the SEA file in WIOD
1392

1393
    This function is based on the structure in the WIOD_SEA_July14 file.
1394
    Missing values are set to zero.
1395

1396
    The function works if the SEA file is either in path or in a subfolder
1397
    named 'SEA'.
1398

1399
    Parameters
1400
    ----------
1401
    root_path : string
1402
        Path to the WIOD data or the path with the SEA data.
1403
    year : str or int
1404
        Year to return for the extension
1405
    sea_data_sheet : string, optional
1406
        Worksheet with the SEA data in the excel file
1407

1408
    Returns
1409
    -------
1410
    SEA data as extension for the WIOD MRIO
1411
    """
1412
    sea_ext = ".xlsx"
1✔
1413
    sea_start = "WIOD_SEA"
1✔
1414

1415
    _SEA_folder = os.path.join(root_path, "SEA")
1✔
1416
    if not os.path.exists(_SEA_folder):
1✔
1417
        _SEA_folder = root_path
1✔
1418

1419
    sea_folder_content = [
1✔
1420
        ff
1421
        for ff in os.listdir(_SEA_folder)
1422
        if os.path.splitext(ff)[-1] == sea_ext and ff[:8] == sea_start
1423
    ]
1424

1425
    if sea_folder_content:
1✔
1426
        # read data
1427
        sea_file = os.path.join(_SEA_folder, sorted(sea_folder_content)[0])
1✔
1428

1429
        df_sea = pd.read_excel(
1✔
1430
            sea_file, sheet_name=data_sheet, header=0, index_col=[0, 1, 2, 3]
1431
        )
1432

1433
        # fix years
1434
        ic_sea = df_sea.columns.tolist()
1✔
1435
        ic_sea = [yystr.lstrip("_") for yystr in ic_sea]
1✔
1436
        df_sea.columns = ic_sea
1✔
1437

1438
        try:
1✔
1439
            ds_sea = df_sea[str(year)]
1✔
1440
        except KeyError:
×
1441
            warnings.warn(
×
1442
                "SEA extension does not include data for the "
1443
                "year {} - SEA-Extension not included".format(year),
1444
                ParserWarning,
1445
            )
1446
            return None, None
×
1447

1448
        # get useful data (employment)
1449
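        # (per the WIOD SEA documentation - a note added here, not taken from
        # this file: EMP/EMPE count persons engaged/employees in thousand
        # persons, H_EMP/H_EMPE the corresponding hours worked in mill hours)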
        mt_sea = ["EMP", "EMPE", "H_EMP", "H_EMPE"]
        ds_use_sea = pd.concat(
            [ds_sea.xs(key=vari, level="Variable", drop_level=False) for vari in mt_sea]
        )
        ds_use_sea.drop(labels="TOT", level="Code", inplace=True)
        ds_use_sea.reset_index("Description", drop=True, inplace=True)

        # RoW not included in SEA but needed to get it consistent for
        # all countries. Just add a dummy with 0 for all accounts.
        if "RoW" not in ds_use_sea.index.get_level_values("Country"):
            ds_RoW = ds_use_sea.xs("USA", level="Country", drop_level=False)
            ds_RoW.loc[:] = 0
            df_RoW = ds_RoW.reset_index()
            df_RoW["Country"] = "RoW"
            ds_use_sea = pd.concat([ds_use_sea.reset_index(), df_RoW]).set_index(
                ["Country", "Code", "Variable"]
            )

        ds_use_sea.fillna(value=0, inplace=True)
        df_use_sea = ds_use_sea.unstack(level=["Country", "Code"])[str(year)]
        df_use_sea.index.names = IDX_NAMES["VA_row_single"]
        df_use_sea.columns.names = IDX_NAMES["F_col"]
        df_use_sea = df_use_sea.astype("float")

        df_unit = pd.DataFrame(
            data=[  # this data must be in the same order as mt_sea
                "thousand persons",
                "thousand persons",
                "mill hours",
                "mill hours",
            ],
            columns=["unit"],
            index=df_use_sea.index,
        )

        return df_use_sea, df_unit
    else:
        warnings.warn(
            "SEA extension raw data file not found - SEA-Extension not included",
            ParserWarning,
        )
        return None, None


def parse_oecd(path, year=None):
    """Parse the OECD ICIO tables

    This function works for the 2016, 2018 and 2021 releases.
    The OECD webpage provides the data as csv files in zip compressed
    archives. This function works with both the compressed archives
    and the unpacked csv files.

    Note
    ----

    I) The original OECD ICIO tables provide some disaggregation of the
    Mexican and Chinese tables for the interindustry flows. The pymrio parser
    automatically aggregates these into Chinese and Mexican totals. Thus, the
    MX1, MX2, ..  and CN1, CN2, ... entries are aggregated into MEX and CHN.

    II) If a given storage folder contains both releases, the datafile
    must be specified in the 'path' parameter.

    Parameters
    ----------
    path: str or pathlib.Path
        Either the full path to one specific OECD ICIO file
        or the path to a storage folder with several OECD files.
        In the latter case, a specific year needs to be specified.

    year: str or int, optional
        Year to parse if 'path' is given as a folder.
        If path points to a specific file, this parameter is not used.

    Returns
    -------
    IOSystem

    Raises
    ------
    ParserError
        If the file to parse could not be definitively identified.
    FileNotFoundError
        If the specified data file could not be found.
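
    Examples
    --------
    Usage sketches (paths and file names are hypothetical):

    >>> oecd = parse_oecd(path="/tmp/oecd", year=2011)  # doctest: +SKIP
    >>> oecd = parse_oecd(path="/tmp/oecd/ICIO2018_2011.zip")  # doctest: +SKIP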
    """
1535

1536
    path = os.path.abspath(os.path.normpath(str(path)))
1✔
1537

1538
    oecd_file_starts = ["ICIO2016_", "ICIO2018_", "ICIO2021_"]
1✔
1539

1540
    # determine which oecd file to be parsed
1541
    if not os.path.isdir(path):
1✔
1542
        # 1. case - one file specified in path
1543
        oecd_file = path
1✔
1544
        path = os.path.split(oecd_file)[0]
1✔
1545
    else:
1546
        # 2. case: dir given - build oecd_file with the value given in year
1547
        if not year:
1✔
1548
            raise ParserError(
1✔
1549
                "No year specified "
1550
                "(either specify a specific file "
1551
                "or path and year)"
1552
            )
1553

1554
        oecd_file_list = [
1✔
1555
            fl
1556
            for fl in os.listdir(path)
1557
            if (
1558
                os.path.splitext(fl)[1] in [".csv", ".CSV", ".zip"]
1559
                and os.path.splitext(fl)[0]
1560
                in [oo + str(year) for oo in oecd_file_starts]
1561
            )
1562
        ]
1563

1564
        if len(oecd_file_list) > 1:
1✔
1565
            unique_file_data = set([os.path.splitext(fl)[0] for fl in oecd_file_list])
1✔
1566

1567
            if len(unique_file_data) > 1:
1✔
1568
                raise ParserError(
×
1569
                    "Multiple files for a given year "
1570
                    "found (specify a specific file in the "
1571
                    'parameter "path")'
1572
                )
1573

1574
        elif len(oecd_file_list) == 0:
1✔
1575
            raise FileNotFoundError("No data file for the given year found")
1✔
1576

1577
        oecd_file = os.path.join(path, oecd_file_list[0])
1✔
1578

1579
    oecd_file_name = os.path.split(oecd_file)[1]
1✔
1580

1581
    try:
1✔
1582
        years = re.findall(r"\d\d\d\d", oecd_file_name)
1✔
1583
        oecd_version = "v" + years[0]
1✔
1584
        oecd_year = years[1]
1✔
1585
        meta_desc = "OECD ICIO for {}".format(oecd_year)
1✔
1586

1587
    except IndexError:
×
1588
        oecd_version = "n/a"
×
1589
        oecd_year = "n/a"
×
1590
        meta_desc = "OECD ICIO - year undefined"
×
1591

1592
    meta_rec = MRIOMetaData(
1✔
1593
        location=path,
1594
        name="OECD-ICIO",
1595
        description=meta_desc,
1596
        version=oecd_version,
1597
        system="IxI",  # base don the readme
1598
    )
1599

1600
    oecd_raw = pd.read_csv(oecd_file, sep=",", index_col=0).fillna(0)
1✔
1601
    meta_rec._add_fileio("OECD data parsed from {}".format(oecd_file))
1✔
1602

1603
    mon_unit = "Million USD"
1✔
1604

1605
    oecd_totals_col = ["TOTAL"]
1✔
1606
    oecd_totals_row = ["OUT", "OUTPUT"]
1✔
1607

1608
    oecd_raw.drop(oecd_totals_col, axis=1, errors="ignore", inplace=True)
1✔
1609
    oecd_raw.drop(oecd_totals_row, axis=0, errors="ignore", inplace=True)
1✔
1610

1611
    # Important - these must not match any country or industry name
1612
    factor_input = oecd_raw.filter(regex="VALU|TAX", axis=0)
1✔
1613
    final_demand = oecd_raw.filter(
1✔
1614
        regex="HFCE|NPISH|NPS|GGFC|GFCF|INVNT|INV|DIRP|DPABR|FD|P33|DISC", axis=1
1615
    )
1616

1617
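
    # The raw csv is one flat matrix: the regex filters above pick out the
    # value added/tax rows and the final demand columns; the blocks below are
    # carved out via index/column set differences, so each entry of the raw
    # table ends up in exactly one of Z, Y, F or F_Y.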
    Z = oecd_raw.loc[
        oecd_raw.index.difference(factor_input.index),
        oecd_raw.columns.difference(final_demand.columns),
    ]
    F_factor_input = factor_input.loc[
        :, factor_input.columns.difference(final_demand.columns)
    ]
    F_Y_factor_input = factor_input.loc[:, final_demand.columns]
    Y = final_demand.loc[final_demand.index.difference(F_factor_input.index), :]

    Z_index = pd.MultiIndex.from_tuples(tuple(ll) for ll in Z.index.str.split("_"))
    Z_columns = Z_index.copy()
    Z_index.names = IDX_NAMES["Z_row"]
    Z_columns.names = IDX_NAMES["Z_col"]
    Z.index = Z_index
    Z.columns = Z_columns

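    # Final demand column labels combine region and category (for example
    # "AUS_HFCE" - an illustrative label, with the order reversed in the
    # 2016 release); the loop below splits them on "_" and normalises to
    # (region, category).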
    _midx = []
    for orig_idx in Y.columns:
        entries = orig_idx.split("_")
        if len(entries) == 1:
            # Capturing the discrepancy column
            entries = ["ALL", entries[0]]
        if entries[1] in Z.index.get_level_values("region").unique():
            # Fixing the reversed indexing in the 2016 ICIO version
            entries = [entries[1], entries[0]]
        _midx.append(tuple(entries))
    Y.columns = pd.MultiIndex.from_tuples(_midx)
    Y.columns.names = IDX_NAMES["Y_col2"]
    Y.index = Z.index

    F_factor_input.columns = Z.columns
    F_factor_input.index.names = IDX_NAMES["VA_row_single"]
    F_Y_factor_input.columns = Y.columns
    F_Y_factor_input.index = F_factor_input.index

    # Aggregation of CN and MX subregions
    core_co_names = Z.columns.get_level_values("region").unique()

    agg_corr = dict(
        CHN=[a for a in core_co_names if re.match(r"CN\d", a)],
        MEX=[a for a in core_co_names if re.match(r"MX\d", a)],
    )

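    # For each subregion list (e.g. CN1, CN2 - illustrative), the loop adds
    # the sector-wise sums of the subregion rows/columns onto the national
    # total and then drops the subregion entries, leaving a single CHN/MEX
    # block in Z, Y and the factor inputs.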
    for co_name, agg_list in agg_corr.items():
        if (co_name not in core_co_names) or (len(agg_list) == 0):
            continue

        # DEBUG note for all below: have to assign with np values due to
        # alignment issues bug in pandas,
        # see https://github.com/pandas-dev/pandas/issues/10440

        # aggregate rows
        Z.loc[co_name, :] = (
            Z.loc[co_name, :] + Z.loc[agg_list, :].groupby(level="sector", axis=0).sum()
        ).values
        Z = Z.drop(agg_list, axis=0)
        Y.loc[co_name, :] = (
            Y.loc[co_name, :] + Y.loc[agg_list, :].groupby(level="sector", axis=0).sum()
        ).values
        Y = Y.drop(agg_list, axis=0)

        # aggregate columns
        Z.loc[:, co_name] = (
            Z.loc[:, co_name] + Z.loc[:, agg_list].groupby(level="sector", axis=1).sum()
        ).values
        Z = Z.drop(agg_list, axis=1)

        F_factor_input.loc[:, co_name] = (
            F_factor_input.loc[:, co_name]
            + F_factor_input.loc[:, agg_list].groupby(level="sector", axis=1).sum()
        ).values
        F_factor_input = F_factor_input.drop(agg_list, axis=1)

    # unit df generation at the end to have consistent index
    unit = pd.DataFrame(index=Z.index, data=mon_unit, columns=IDX_NAMES["unit"])
    F_unit = pd.DataFrame(
        index=F_factor_input.index, data=mon_unit, columns=IDX_NAMES["unit"]
    )

    oecd = IOSystem(
        Z=Z,
        Y=Y,
        unit=unit,
        meta=meta_rec,
        factor_inputs={
            "name": "factor_inputs",
            "unit": F_unit,
            "F": F_factor_input,
            "F_Y": F_Y_factor_input,
        },
    )

    return oecd


def parse_eora26(path, year=None, price="bp", country_names="eora"):
    """Parse the Eora26 database

    Note
    ----

    This parser deletes the statistical discrepancy columns from
    the parsed Eora system (reports the amount of loss in the
    meta records).

    Eora does not provide any information on the unit of the
    monetary values. Based on personal communication the unit
    is set to Mill USD manually.


    Parameters
    ----------

    path : string or pathlib.Path
       Path to the Eora raw storage folder or a specific eora zip file to
       parse.  There are several options to specify the data for parsing:

       1) Pass the name of Eora zip file. In this case the parameters 'year'
          and 'price' will not be used
       2) Pass a folder which either contains Eora zip files or unpacked Eora
          data. In that case, a year must be given
       3) Pass a folder which contains subfolders in the format 'YYYY', e.g.
          '1998'. This subfolder can either contain an Eora zip file or an
          unpacked Eora system

    year : int or str
        4 digit year spec. This will not be used if a zip file
        is specified in 'path'

    price : str, optional
        'bp' or 'pp'

    country_names: str, optional
        Which country names to use:
        'eora' = Eora flavoured ISO 3 variant
        'full' = Full country names as provided by Eora
        Passing the first letter suffices.

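    Examples
    --------
    Usage sketches (paths and file names are hypothetical):

    >>> eora = parse_eora26("/tmp/eora/Eora26_2010_bp.zip")  # doctest: +SKIP
    >>> eora = parse_eora26("/tmp/eora", year=2010, price="bp")  # doctest: +SKIP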
    """
1758
    path = os.path.abspath(os.path.normpath(str(path)))
1✔
1759

1760
    if country_names[0].lower() == "e":
1✔
1761
        country_names = "eora"
1✔
1762
    elif country_names[0].lower() == "f":
1✔
1763
        country_names = "full"
1✔
1764
    else:
1765
        raise ParserError("Parameter country_names must be Eora or full")
1✔
1766

1767
    row_name = "ROW"
1✔
1768
    eora_zip_ext = ".zip"
1✔
1769
    is_zip = False
1✔
1770

1771
    # determine which eora file to be parsed
1772
    if os.path.splitext(path)[1] == eora_zip_ext:
1✔
1773
        # case direct pass of eora zipfile
1774
        year = re.search(r"\d\d\d\d", os.path.basename(path)).group(0)
×
1775
        price = re.search(r"bp|pp", os.path.basename(path)).group(0)
×
1776
        eora_loc = path
×
1777
        root_path = os.path.split(path)[0]
×
1778
        is_zip = True
×
1779
    else:
1780
        root_path = path
1✔
1781
        if str(year) in os.listdir(path):
1✔
1782
            path = os.path.join(path, str(year))
×
1783

1784
        eora_file_list = [
1✔
1785
            fl
1786
            for fl in os.listdir(path)
1787
            if os.path.splitext(fl)[1] == eora_zip_ext
1788
            and str(year) in fl
1789
            and str(price) in fl
1790
        ]
1791

1792
        if len(eora_file_list) > 1:
1✔
1793
            raise ParserError(
×
1794
                "Multiple files for a given year "
1795
                "found (specify a specific file in parameters)"
1796
            )
1797
        elif len(eora_file_list) == 1:
1✔
1798
            eora_loc = os.path.join(path, eora_file_list[0])
×
1799
            is_zip = True
×
1800
        else:
1801
            # Just a path was given, no zip file found,
1802
            # continue with only the path information - assumed an
1803
            # unpacked zip file
1804
            eora_loc = path
1✔
1805
            is_zip = False
1✔
1806

1807
    meta_rec = MRIOMetaData(location=root_path)
1✔
1808

1809
    # Eora file specs
1810
    eora_sep = "\t"
1✔
1811
    ZY_col = namedtuple("ZY", "full eora system name")(0, 1, 2, 3)
1✔
1812

1813
    eora_files = {
1✔
1814
        "Z": "Eora26_{year}_{price}_T.txt".format(year=str(year), price=price),
1815
        "Q": "Eora26_{year}_{price}_Q.txt".format(year=str(year), price=price),
1816
        "QY": "Eora26_{year}_{price}_QY.txt".format(year=str(year), price=price),
1817
        "VA": "Eora26_{year}_{price}_VA.txt".format(year=str(year), price=price),
1818
        "Y": "Eora26_{year}_{price}_FD.txt".format(year=str(year), price=price),
1819
        "labels_Z": "labels_T.txt",
1820
        "labels_Y": "labels_FD.txt",
1821
        "labels_Q": "labels_Q.txt",
1822
        "labels_VA": "labels_VA.txt",
1823
    }
1824

1825
    header = namedtuple("header", "index columns index_names, column_names")
1✔
1826

1827
    eora_header_spec = {
1✔
1828
        "Z": header(
1829
            index="labels_Z",
1830
            columns="labels_Z",
1831
            index_names=IDX_NAMES["Z_row"],
1832
            column_names=IDX_NAMES["Z_col"],
1833
        ),
1834
        "Q": header(
1835
            index="labels_Q",
1836
            columns="labels_Z",
1837
            index_names=IDX_NAMES["F_row_src"],
1838
            column_names=IDX_NAMES["F_col"],
1839
        ),
1840
        "QY": header(
1841
            index="labels_Q",
1842
            columns="labels_Y",
1843
            index_names=IDX_NAMES["F_row_src"],
1844
            column_names=IDX_NAMES["Y_col2"],
1845
        ),
1846
        "VA": header(
1847
            index="labels_VA",
1848
            columns="labels_Z",
1849
            index_names=IDX_NAMES["VA_row_unit_cat"],
1850
            column_names=IDX_NAMES["F_col"],
1851
        ),
1852
        "Y": header(
1853
            index="labels_Z",
1854
            columns="labels_Y",
1855
            index_names=IDX_NAMES["Y_row"],
1856
            column_names=IDX_NAMES["Y_col2"],
1857
        ),
1858
    }
1859

1860
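    # The data files are read without headers; each labels_* file listed in
    # eora_files provides the matching row/column index. labels_T.txt, for
    # instance, has one row per Z entry with four columns (full country name,
    # Eora country code, system, sector name - see ZY_col above), of which
    # the chosen country-name column and the sector name become the
    # (region, sector) MultiIndex.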
    if is_zip:
        zip_file = zipfile.ZipFile(eora_loc)
        indices_file = None
        for key, filename in eora_files.items():
            if filename not in zip_file.namelist() and filename.startswith("labels"):
                try:
                    indices_loc = os.path.join(path, "indices.zip")
                    indices_file = zipfile.ZipFile(indices_loc)
                except (OSError, zipfile.BadZipFile):
                    raise ValueError(
                        f"{filename} is not available in the zip file and no "
                        "indices.zip file is available in the directory provided"
                    )

        eora_data = {
            key: pd.read_csv(
                zip_file.open(filename),
                sep=eora_sep,
                header=None,
            )
            if filename in zip_file.namelist()
            else pd.read_csv(
                indices_file.open(filename),
                sep=eora_sep,
                header=None,
            )
            for key, filename in eora_files.items()
        }
        zip_file.close()
    else:
        eora_data = {
            key: pd.read_csv(
                os.path.join(eora_loc, filename),
                sep=eora_sep,
                header=None,
            )
            for key, filename in eora_files.items()
        }
    meta_rec._add_fileio(
        "Eora26 for {year}-{price} data parsed from {loc}".format(
            year=year, price=price, loc=eora_loc
        )
    )

    eora_data["labels_Z"] = eora_data["labels_Z"].loc[
        :, [getattr(ZY_col, country_names), ZY_col.name]
    ]
    eora_data["labels_Y"] = eora_data["labels_Y"].loc[
        :, [getattr(ZY_col, country_names), ZY_col.name]
    ]
    eora_data["labels_VA"] = eora_data["labels_VA"].iloc[
        :, : len(eora_header_spec["VA"].column_names)
    ]
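    # Eora stressor labels carry the unit in trailing parentheses (pattern
    # "name (unit)", e.g. "CO2 (Gg)" - an illustrative example); the regexes
    # below move the unit into its own dataframe and strip it from the label.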
    labQ = eora_data["labels_Q"].iloc[:, : len(eora_header_spec["Q"].column_names)]
    labQ.columns = IDX_NAMES["F_row_src"]
    Q_unit = pd.DataFrame(labQ["stressor"].str.extract(r"\((.*)\)", expand=False))
    Q_unit.columns = IDX_NAMES["unit"]

    labQ["stressor"] = labQ["stressor"].str.replace(r"\s\((.*)\)", "", regex=True)
    eora_data["labels_Q"] = labQ

    for key in eora_header_spec.keys():
        eora_data[key].columns = (
            eora_data[eora_header_spec[key].columns]
            .set_index(list(eora_data[eora_header_spec[key].columns]))
            .index
        )
        eora_data[key].columns.names = eora_header_spec[key].column_names
        eora_data[key].index = (
            eora_data[eora_header_spec[key].index]
            .set_index(list(eora_data[eora_header_spec[key].index]))
            .index
        )
        eora_data[key].index.names = eora_header_spec[key].index_names

        try:
            meta_rec._add_modify(
                "Remove Rest of the World ({name}) "
                "column from {table} - losing {amount}".format(
                    name=row_name,
                    table=key,
                    amount=eora_data[key].loc[:, row_name].sum().values[0],
                )
            )
            eora_data[key].drop(row_name, axis=1, inplace=True)
        except KeyError:
            pass

        try:
            meta_rec._add_modify(
                "Remove Rest of the World ({name}) row "
                "from {table} - losing {amount}".format(
                    name=row_name,
                    table=key,
                    amount=eora_data[key].loc[row_name, :].sum().values[0],
                )
            )
            eora_data[key].drop(row_name, axis=0, inplace=True)
        except KeyError:
            pass

    Q_unit.index = eora_data["Q"].index

    meta_rec.note("Set Eora monetary units to Mill USD manually")
    Z_unit = pd.DataFrame(
        data=["Mill USD"] * len(eora_data["Z"].index),
        index=eora_data["Z"].index,
        columns=["unit"],
    )
    VA_unit = pd.DataFrame(
        data=["Mill USD"] * len(eora_data["VA"].index),
        index=eora_data["VA"].index,
        columns=["unit"],
    )

    eora = IOSystem(
        Z=eora_data["Z"],
        Y=eora_data["Y"],
        unit=Z_unit,
        Q={"name": "Q", "unit": Q_unit, "F": eora_data["Q"], "F_Y": eora_data["QY"]},
        VA={
            "name": "VA",
            "F": eora_data["VA"],
            "unit": VA_unit,
        },
        meta=meta_rec,
    )

    return eora