rero / rero-ils / 16137642231

08 Jul 2025 08:09AM UTC coverage: 92.182% (-0.006%) from 92.188%

Pull Request #3877: import: DNB, SLSP subject mef link
Merge f0328e268 into 43159ef46

1 of 1 new or added line in 1 file covered. (100.0%)
3 existing lines in 1 file now uncovered.
23406 of 25391 relevant lines covered (92.18%)
0.92 hits per line

Source File: /rero_ils/dojson/utils.py (92.18% covered)

# -*- coding: utf-8 -*-
#
# RERO ILS
# Copyright (C) 2019-2022 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Dojson utils."""


import contextlib
import re
import sys
import traceback
from copy import deepcopy

import click
import jsonref
import requests
import xmltodict
from dojson import Overdo, utils
from flask import current_app
from pkg_resources import resource_string

_UNIMARC_LANGUAGES_SCRIPTS = {
    "ba": "latn",  # Latin
    "ca": "cyrl",  # Cyrillic
    "da": "jpan",  # Japanese - undefined writing
    "db": "hani",  # Japanese - Kanji
    "dc": "hrkt",  # Japanese - Kana
    "ea": "hani",  # Chinese characters (Chinese, Japanese, Korean)
    "fa": "arab",  # Arabic
    "ga": "grek",  # Greek
    "ha": "hebr",  # Hebrew
    "ia": "thai",  # Thai
    "ja": "deva",  # Devanagari
    "ka": "kore",  # Korean
    "la": "taml",  # Tamil
    "ma": "geor",  # Georgian
    "mb": "armn",  # Armenian
    "zz": "zyyy",  # other
}

_LANGUAGES_SCRIPTS = {
    "armn": ("arm",),
    "arab": ("ara", "per"),
    "cyrl": ("bel", "chu", "mac", "rus", "srp", "ukr"),
    "deva": (
        "awa",
        "bho",
        "bra",
        "doi",
        "hin",
        "kas",
        "kok",
        "mag",
        "mai",
        "mar",
        "mun",
        "nep",
        "pli",
        "pra",
        "raj",
        "san",
        "sat",
        "snd",
    ),
    "geor": ("geo",),
    "grek": ("grc", "gre"),
    "hani": ("chi", "jpn"),
    "hebr": ("heb", "lad", "yid"),
    "hrkt": ("jpn",),
    "jpan": ("jpn",),
    "kore": ("kor",),
    "taml": ("tam",),
    "thai": ("tha",),
    "zyyy": ("chi",),
}

_SCRIPT_PER_LANG_ASIA = {"jpn": "jpan", "kor": "kore", "chi": "hani"}

_SCRIPT_PER_LANG_NOT_ASIA = {
    "arm": "armn",
    "geo": "geor",
    "gre": "grek",
    "grc": "grek",
    "ara": "arab",
    "per": "arab",
    "bel": "cyrl",
    "rus": "cyrl",
    "mac": "cyrl",
    "srp": "cyrl",
    "tha": "thai",
    "ukr": "cyrl",
    "chu": "cyrl",
    "yid": "hebr",
    "heb": "hebr",
    "lad": "hebr",
    "chi": "hani",
}

_SCRIPT_PER_CODE = {
    "(S": "grek",
    "(3": "arab",
    "(B": "latn",
    "(N": "cyrl",
    "(2": "hebr",
}

_ILLUSTRATIVE_CONTENT_REGEXP = {
    "illustrations": re.compile(
        r"ill?(\.|\s|:|,|;|s\.|us.*)|ill$|iil|^il$|^il(\.)|"
        r"fig(\.|\s|,|ur|s)|fig$|abb(\.|\s|,|ild)|abb$|bild|zeichn|"
        r"front(\.|is|esp|\s|,|s)|front$|dessin",
        re.IGNORECASE,
    ),
    "maps": re.compile(
        r"cartes?|cartogra|cartin|cart\.|carta(\s|s)|carta$|maps?|kart", re.IGNORECASE
    ),
    "portraits": re.compile(r"port(\.|r|\s|s)|portr$|ritr", re.IGNORECASE),
    "graphs": re.compile(r"gra(ph|f)(\.)|^gra(ph|f)|\sgra(ph|f)|diag", re.IGNORECASE),
    "photographs": re.compile(r"(f|ph)oto(g|s|\s|,|typ|\.)|(f|ph)oto^", re.IGNORECASE),
    "facsimiles": re.compile(r"fa(c|k)", re.IGNORECASE),
    "coats of arms": re.compile(r"armoirie|arms|wappe|stemm", re.IGNORECASE),
    "genealogical tables": re.compile(r"genea|généa", re.IGNORECASE),
    "plans": re.compile(r"plan[^c]|plan$|piant", re.IGNORECASE),
    "forms": re.compile(r"form[^a|e]|modul", re.IGNORECASE),
    "illuminations": re.compile(r"enlum|illum|miniatur|buchmale", re.IGNORECASE),
    "samples": re.compile(r"sample|échant|muster|campion", re.IGNORECASE),
}

_PRODUCTION_METHOD_FROM_EXTENT_AND_PHYSICAL_DETAILS = {
    "rdapm:1001": re.compile(r"blueline", re.IGNORECASE),
    "rdapm:1002": re.compile(r"cyano|blaudr|bluepr", re.IGNORECASE),
    "rdapm:1003": re.compile(r"collot|lichtdr|(ph|f)otot", re.IGNORECASE),
    "rdapm:1004": re.compile(r"daguerr", re.IGNORECASE),
    "rdapm:1005": re.compile(r"stich|engrav|grav", re.IGNORECASE),
    "rdapm:1006": re.compile(r"eauforte|radier|etch", re.IGNORECASE),
    "rdapm:1007": re.compile(r"litho", re.IGNORECASE),
    "rdapm:1008": re.compile(r"(ph|f)oto[ck]o", re.IGNORECASE),
    "rdapm:1009": re.compile(r"photograv|fotograv|photoengrav", re.IGNORECASE),
    # The rdapm:1010 extraction is done only from PHYSICAL_DETAILS by the code:
    # 'rdapm:1010': r'impr|druck|print|offset|s[ée]riegr'
    "rdapm:1011": re.compile(r"white print", re.IGNORECASE),
    "rdapm:1012": re.compile(r"grav.+?sur bois|holzschn|woodc", re.IGNORECASE),
    "rdapm:1014": re.compile(r"hélio|helio", re.IGNORECASE),
    "rdapm:1015": re.compile(r"brûl|einbren|burn", re.IGNORECASE),
    "rdapm:1016": re.compile(r"inscript|inscrib", re.IGNORECASE),
    "rdapm:1017": re.compile(r"estamp|stempel|stamping|lino", re.IGNORECASE),
    "rdapm:1018": re.compile(r"emboss|präg", re.IGNORECASE),
    "rdapm:1019": re.compile(r"point rigide|solid dot", re.IGNORECASE),
    "rdapm:1020": re.compile(r"thermog|schwell|swell|minolta", re.IGNORECASE),
    "rdapm:1021": re.compile(r"thermof|va[ck]uum|moul.+?vide", re.IGNORECASE),
}

_COLOR_CONTENT_REGEXP = {
    # monochrome
    "rdacc:1002": re.compile(
        r"noir|black|schwarz|nero|n\.\set|schw|b\&w|" r"b\/n|s\/w|^n\set\sb|\sn\set\sb",
        re.IGNORECASE,
    ),
    # polychrome
    "rdacc:1003": re.compile(r"cou?l(\.|,|eur|ou?r|\s)|cou?l$|farb", re.IGNORECASE),
}

_CANTON = [
    "ag",
    "ai",
    "ar",
    "be",
    "bl",
    "bs",
    "fr",
    "ge",
    "gl",
    "gr",
    "ju",
    "lu",
    "ne",
    "nw",
    "ow",
    "sg",
    "sh",
    "so",
    "sz",
    "tg",
    "ti",
    "ur",
    "vd",
    "vs",
    "zg",
    "zh",
]

_OBSOLETE_COUNTRIES_MAPPING = {
    "cn": "xxc",
    "err": "er",
    "lir": "li",
    "lvr": "lv",
    "uk": "xxk",
    "unr": "un",
    "us": "xxu",
    "ur": "xxr",
    "ys": "ye",
}

# field 336 mapping
_CONTENT_TYPE_MAPPING = {
    "cri": "rdaco:1002",
    "crm": "rdaco:1003",
    "crt": "rdaco:1004",
    "crn": "rdaco:1005",
    "cod": "rdaco:1007",
    "crd": "rdaco:1001",
    "crf": "rdaco:1006",
    "tdi": "rdaco:1023",
    "tdm": "rdaco:1022",
    "sti": "rdaco:1014",
    "tci": "rdaco:1015",
    "prm": "rdaco:1011",
    "ntv": "rdaco:1009",
    "tcn": "rdaco:1017",
    "tdf": "rdaco:1021",
    "tcf": "rdaco:1019",
    "ntm": "rdaco:1010",
    "tcm": "rdaco:1016",
    "cop": "rdaco:1008",
    "snd": "rdaco:1012",
    "txt": "rdaco:1020",
    "tct": "rdaco:1018",
    "spw": "rdaco:1013",
    "xxx": "other",
}

# field 337 $b and field 338 (first char of $b) mapping
_MEDIA_TYPE_MAPPING = {
    "s": "rdamt:1001",
    "h": "rdamt:1002",
    "c": "rdamt:1003",
    "p": "rdamt:1004",
    "g": "rdamt:1005",
    "m": "rdamt:1005",  # only in 338 (first char of $b)
    "e": "rdamt:1006",
    "n": "rdamt:1007",
    "v": "rdamt:1008",
    "x": "other",
    "z": "other",  # only in 338 (first char of $b)
}

# field 338 mapping
_CARRIER_TYPE_MAPPING = {
    "zu": "unspecified",
    "sg": "rdact:1002",
    "se": "rdact:1003",
    "sd": "rdact:1004",
    "si": "rdact:1005",
    "sq": "rdact:1006",
    "ss": "rdact:1007",
    "st": "rdact:1008",
    "sw": "rdact:1071",
    "sz": "other",
    "ha": "rdact:1021",
    "he": "rdact:1022",
    "hf": "rdact:1023",
    "hb": "rdact:1024",
    "hc": "rdact:1025",
    "hd": "rdact:1026",
    "hh": "rdact:1027",
    "hg": "rdact:1028",
    "hj": "rdact:1056",
    "hz": "other",
    "ck": "rdact:1011",
    "cb": "rdact:1012",
    "cd": "rdact:1013",
    "ce": "rdact:1014",
    "ca": "rdact:1015",
    "cf": "rdact:1016",
    "ch": "rdact:1017",
    "cr": "rdact:1018",
    "cz": "other",
    "pp": "rdact:1030",
    "pz": "other",
    "mc": "rdact:1032",
    "mf": "rdact:1033",
    "mr": "rdact:1034",
    "gd": "rdact:1035",
    "gf": "rdact:1036",
    "gc": "rdact:1037",
    "gt": "rdact:1039",
    "gs": "rdact:1040",
    "mo": "rdact:1069",
    "mz": "other",
    "eh": "rdact:1042",
    "es": "rdact:1043",
    "ez": "other",
    "no": "rdact:1045",
    "nn": "rdact:1046",
    "na": "rdact:1047",
    "nb": "rdact:1048",
    "nc": "rdact:1049",
    "nr": "rdact:1059",
    "nz": "other",
    "vc": "rdact:1051",
    "vf": "rdact:1052",
    "vr": "rdact:1053",
    "vd": "rdact:1060",
    "vz": "other",
}


_ENCODING_LEVEL_MAPPING = {
    " ": "Full level",
    "1": "Full level, material not examined",
    "2": "Less-than-full level, material not examined",
    "3": "Abbreviated level",
    "4": "Core level",
    "5": "Partial (preliminary) level",
    "7": "Minimal level",
    "8": "Prepublication level",
    "u": "Unknown",
    "z": "Not applicable",
}

_CONTRIBUTION_TAGS = [
    "100",
    "600",
    "610",
    "611",
    "630",
    "650",
    "651",
    "655",
    "700",
    "701",
    "702",
    "703",
    "710",
    "711",
    "712",
]

schema_in_bytes = resource_string(
    "rero_ils.jsonschemas", "common/languages-v0.0.1.json"
)
schema = jsonref.loads(schema_in_bytes.decode("utf8"))
_LANGUAGES = schema["language"]["enum"]

schema_in_bytes = resource_string(
    "rero_ils.jsonschemas", "common/countries-v0.0.1.json"
)
schema = jsonref.loads(schema_in_bytes.decode("utf8"))
_COUNTRIES = schema["country"]["enum"]

re_identified = re.compile(r"\((.*)\)(.*)")
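
# Example (derived from the pattern above, hypothetical $0 value):
# re_identified.match("(RERO)A012327677").groups() == ("RERO", "A012327677")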


def error_print(*args):
    """Error printing to stdout."""
    msg = "".join(str(arg) + "\t" for arg in args)
    msg = msg.strip()
    click.echo(msg)
    sys.stdout.flush()


def make_year(date):
    """Return the date string as an int if 1000 <= date < 9999, else None."""
    with contextlib.suppress(Exception):
        int_date = int(date)
        if 1000 <= int_date < 9999:
            return int_date
    return None
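
# Examples (derived from the code above): make_year("1988") -> 1988,
# make_year("19uu") -> None, make_year("0999") -> None.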


def not_repetitive(bibid, reroid, key, value, subfield, default=None):
    """Get the first value if the value is a list or tuple."""
    data = value.get(subfield, default)
    if isinstance(data, (list, tuple)):
        error_print("WARNING NOT REPETITIVE:", bibid, reroid, key, subfield, value)
        data = data[0]
    return data


def get_field_link_data(value):
    """Get field link data from subfield $6."""
    subfield_6 = value.get("6", "")
    tag_link = subfield_6.split("-")
    link = tag_link[1] if len(tag_link) == 2 else ""
    return tag_link, link
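
# Examples (derived from the code above, hypothetical subfield $6 value):
# get_field_link_data({"6": "880-02"}) -> (["880", "02"], "02")
# get_field_link_data({}) -> ([""], "")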


def get_field_items(value):
    """Get field items."""
    if isinstance(value, utils.GroupableOrderedDict):
        return value.iteritems(repeated=True)
    else:
        return utils.iteritems(value)


def build_string_from_subfields(value, subfield_selection, separator=" "):
    """Build a string parsing the selected subfields in order."""
    items = get_field_items(value)
    return separator.join(
        [
            remove_special_characters(value)
            for key, value in items
            if key in subfield_selection
        ]
    )
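
# Rough example (hypothetical subfields; assumes the value behaves like a
# plain dict): build_string_from_subfields({"a": "Genève", "b": "Droz"},
# {"a", "b"}) would yield "Genève Droz".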


def remove_trailing_punctuation(data, punctuation=",", spaced_punctuation=":;/-"):
    """Remove trailing punctuation from data.

    :param data: string to process
    :type data: str
    :param punctuation: punctuation characters to be removed
        (preceded by a space or not)
    :type punctuation: str
    :param spaced_punctuation: punctuation characters needing
        one or more preceding space(s) in order to be removed.
    :type spaced_punctuation: str

    :return: the data string with specific trailing punctuation removed
    :rtype: str
    """
    # escape chars: .[]^-
    if punctuation:
        punctuation = re.sub(r"([\.\[\]\^\\-])", r"\\\1", punctuation)
    if spaced_punctuation:
        spaced_punctuation = re.sub(r"([\.\[\]\^\\-])", r"\\\1", spaced_punctuation)

    return re.sub(
        rf"([{punctuation}]|\s+[{spaced_punctuation}])$", "", data.rstrip()
    ).rstrip()
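
# Examples (derived from the code above, default arguments):
# remove_trailing_punctuation("Lausanne :") -> "Lausanne"
# remove_trailing_punctuation("Lausanne,") -> "Lausanne"
# remove_trailing_punctuation("op. 27, no 2") -> "op. 27, no 2" (kept)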


def remove_special_characters(value, chars=["\u0098", "\u009c"]):
    """Remove special characters from a string.

    :param value: string to clean.
    :returns: a cleaned string.
    """
    for char in chars:
        value = value.replace(char, "")
    return value
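
# Example (derived from the code above): the defaults strip the C1 control
# characters U+0098 and U+009C, e.g.
# remove_special_characters("\u0098Der \u009cTitel") -> "Der Titel".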


def get_mef_link(bibid, reroid, entity_type, ids, key):
    """Get MEF contribution link.

    :param bibid: Bib id from the record.
    :param reroid: RERO id from the record.
    :param entity_type: Entity type.
    :param ids: list of $0 values from the marc field.
    :param key: Tag from the marc field.
    :returns: MEF url.
    """
    from rero_ils.modules.utils import requests_retry_session

    # In dojson we don't have an app. mef_url should be the same as
    # RERO_ILS_MEF_APP_BASE_URL in config.py
    # https://mef.rero.ch/api/agents/mef/?q=rero.rero_pid:A012327677
    if not ids:
        return
    try:
        # Try to get RERO_ILS_ENTITY_TYPES and RERO_ILS_MEF_CONFIG from the
        # current app. In the dojson CLI there is no current app, so the
        # values are taken directly from config.py.
        entity_types = current_app.config.get("RERO_ILS_ENTITY_TYPES", {})
        mef_config = current_app.config.get("RERO_ILS_MEF_CONFIG")
    except Exception:
        from rero_ils.config import RERO_ILS_ENTITY_TYPES as entity_types
        from rero_ils.config import RERO_ILS_MEF_CONFIG as mef_config
    entity_type = entity_types.get(entity_type)
    mef_url = mef_config.get(entity_type, {}).get("base_url")
    if not mef_url:
        return
    sources = mef_config.get(entity_type, {}).get("sources")
    has_no_de_101 = True
    for id_ in ids:
        # see if we have a $0 with (DE-101)
        if match := re_identified.match(id_):
            with contextlib.suppress(IndexError):
                if match.group(1).lower() == "de-101":
                    has_no_de_101 = False
                    break
    for id_ in ids:
        if type(id_) is str:
            match = re_identified.search(id_)
        else:
            match = re_identified.search(id_[0])
        if match and len(match.groups()) == 2 and key[:3] in _CONTRIBUTION_TAGS:
            match_type = match.group(1).lower()
            match_value = match.group(2)
            if match_type == "de-101":
                match_type = "gnd"
            elif match_type == "de-588" and has_no_de_101:
                match_type = "gnd"
                match_value = get_gnd_de_101(match_value)
            if match_type and match_type in sources:
                url = f"{mef_url}/mef/latest/{match_type}:{match_value}"
                response = requests_retry_session().get(url)
                status_code = response.status_code
                total = 0
                if status_code == requests.codes.ok:
                    if value := response.json().get(match_type, {}).get("pid"):
                        if match_value != value:
                            error_print(
                                f"INFO GET MEF {entity_type}:",
                                bibid,
                                reroid,
                                key,
                                id_,
                                "NEW",
                                f"({match_type.upper()}){value}",
                            )
                        return f"{mef_url}/{match_type}/{value}"
                error_print(
                    "WARNING GET MEF CONTRIBUTION:",
                    bibid,
                    reroid,
                    key,
                    id_,
                    url,
                    status_code,
                    total,
                )
            # if we have a viaf id, look for the contributor in MEF
            elif match_type == "viaf":
                url = f"{mef_url}/mef?q=viaf_pid:{match_value}"
                response = requests_retry_session().get(url)
                status_code = response.status_code
                if status_code == requests.codes.ok:
                    resp = response.json()
                    with contextlib.suppress(IndexError, KeyError):
                        mdata = resp["hits"]["hits"][0]["metadata"]
                        for source in ["idref", "gnd"]:
                            if match_value := mdata.get(source, {}).get("pid"):
                                match_type = source
                                break
        elif match:
            error_print("ERROR GET MEF CONTRIBUTION:", bibid, reroid, key, id_)
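
# Sketch of the lookup flow (hypothetical identifier, assuming "gnd" is among
# the configured sources): for key "700__" and ids ["(DE-101)040279081"], the
# function GETs f"{mef_url}/mef/latest/gnd:040279081" and, if the response
# payload contains {"gnd": {"pid": "040279081"}}, returns
# f"{mef_url}/gnd/040279081".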


def add_note(new_note, data):
    """Add a new note to the data avoiding duplicate notes.

    :param new_note: the note object to add
    :type new_note: object
    :param data: the object data on which the new note will be added
    :type data: object
    """
    if new_note and new_note.get("label") and new_note.get("noteType"):
        notes = data.get("note", [])
        if new_note not in notes:
            notes.append(new_note)
            data["note"] = notes


def add_data_and_sort_list(key, new_data, data):
    """Add strings to the data[key] list avoiding duplicates (the list is sorted).

    :param key: the key of object to add
    :type key: str
    :param new_data: the new_data (list of string) to add to data[key]
    :type new_data: list
    :param data: the object data on which the new data will be added
    :type data: object
    """
    existing_data = data.get(key, [])
    if new_data:
        new_set = set(existing_data)
        for value_data in new_data:
            new_set.add(value_data)
        data[key] = sorted(list(new_set))
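
# Example (derived from the code above): with data == {"note": ["b"]},
# add_data_and_sort_list("note", ["a", "b"], data) leaves
# data["note"] == ["a", "b"] (deduplicated and sorted).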


def join_alternate_graphic_data(alt_gr_1, alt_gr_2, join_str):
    """
    Build the alternate graphical data by joining the alt_gr strings.

    The given join_str is used for joining the strings.

    :param alt_gr_1: the alternate graphic 1
    :type alt_gr_1: list
    :param alt_gr_2: the alternate graphic 2
    :type alt_gr_2: list
    :param join_str: the string used as separator of concatenated strings
    :type join_str: str
    :return: alt_gr structure with joined strings from alt_gr_1 and alt_gr_2
    :rtype: list
    """
    new_alt_gr_data = []
    for idx, data in enumerate(alt_gr_1):
        new_data = deepcopy(data)
        with contextlib.suppress(Exception):
            if str_to_join := alt_gr_2[idx]["value"]:
                new_data["value"] = join_str.join((new_data["value"], str_to_join))
        new_alt_gr_data.append(new_data)
    return new_alt_gr_data
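
# Example (derived from the code above):
# join_alternate_graphic_data([{"value": "vol. 1"}], [{"value": "fasc. 2"}], ", ")
# -> [{"value": "vol. 1, fasc. 2"}]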


class BookFormatExtraction(object):
    """Extract book formats from a marc subfield data.

    The regular expression patterns needed to extract book formats are built
    by the constructor.
    The method 'extract_book_formats_from' is provided to extract book formats
    from a given marc subfield data.
    """

    def __init__(self):
        """Constructor method.

        The regular expression patterns needed to extract book formats are
        built by this constructor.
        """
        self._format_values = (1, 2, 4, 8, 12, 16, 18, 24, 32, 36, 48, 64, 72, 96, 128)
        self._book_format_code_and_regexp = {}
        self._specific_for_1248 = {
            1: "plano",
            2: r"fol[i.\s°⁰)]|fol",
            4: "quarto",
            8: "octavo",
        }

        def _build_regexp(value):
            """Build regular expression pattern for the given value.

            :param value: format (1,2,4,8,12,16,18,24,32,36,48,64,72,96,128)
            :type value: int
            :return: an expression pattern according to the given value
            :rtype: str
            """
            # generic regexp valid for all values
            regexp = rf"(^|[^\d]){value}\s?[°⁰º]|in(-|-gr\.)*\s*{value}($|[^\d])"
            # add specific value regexp
            if value in self._specific_for_1248:
                regexp = "|".join([regexp, self._specific_for_1248[value]])
            else:
                additional = rf"[^\d]{value}mo|^{value}mo"
                regexp = "|".join([regexp, additional])
            return f"({regexp})"

        def _populate_regexp():
            """Populate all the expression patterns."""
            for value in self._format_values:
                self._book_format_code_and_regexp[value] = {}
                format_code = "in-plano"
                if value > 1:
                    # {value}ᵒ (U+1D52 MODIFIER LETTER SMALL O)
                    format_code = f"{value}ᵒ"
                self._book_format_code_and_regexp[value]["code"] = format_code
                self._book_format_code_and_regexp[value]["regexp"] = re.compile(
                    _build_regexp(value), re.IGNORECASE
                )

        _populate_regexp()

    def extract_book_formats_from(self, subfield_data):
        """Extract book formats from a marc subfield data.

        :param subfield_data: marc subfield data source for format extraction
        :type subfield_data: str
        :return: a list of book formats
        :rtype: list
        """
        book_formats = []
        for value in self._format_values:
            regexp = self._book_format_code_and_regexp[value]["regexp"]
            if regexp.search(subfield_data):
                book_formats.append(self._book_format_code_and_regexp[value]["code"])
        return book_formats
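
# Examples (derived from the regexps above, hypothetical $c values):
# BookFormatExtraction().extract_book_formats_from("18 cm (in-8°)") -> ["8ᵒ"]
# BookFormatExtraction().extract_book_formats_from("plano") -> ["in-plano"]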


class ReroIlsOverdo(Overdo):
    """Specialized Overdo.

    The purpose of this class is to store the blob record in order to
    have access to all the record fields during the Overdo processing.
    This class also provides record field manipulation functions.
    """

    _blob_record = None
    leader = None
    record_type = ""  # LDR 06
    bib_level = "?"  # LDR 07
    extract_description_subfield = None
    extract_series_statement_subfield = None

    def __init__(self, bases=None, entry_point_group=None):
        """ReroIlsOverdo init."""
        super().__init__(bases=bases, entry_point_group=entry_point_group)

    def do(self, blob, ignore_missing=True, exception_handlers=None):
        """Translate blob values and instantiate new model instance."""
        self._blob_record = blob
        self.leader = blob.get("leader", "")
        if self.leader:
            self.record_type = self.leader[6]  # LDR 06
            self.bib_level = self.leader[7]  # LDR 07

        result = super().do(
            blob, ignore_missing=ignore_missing, exception_handlers=exception_handlers
        )
        if not result.get("provisionActivity"):
            self.default_provision_activity(result)
            error_print(
                "WARNING PROVISION ACTIVITY SET TO DEFAULT:", self.bib_id, self.rero_id
            )

        return result

    def build_place(self):
        """Build place data for provisionActivity."""
        place = {}
        if self.cantons:
            place["canton"] = self.cantons[0]
        if self.country:
            place["country"] = self.country
        if self.links_from_752:
            place["identifiedBy"] = self.links_from_752[0]
        return place

    def default_provision_activity(self, result):
        """Create default provisionActivity."""
        places = []
        publication = {"type": "bf:Publication"}
        if place := self.build_place():
            places.append(place)
        # parse the links, skipping the first (already used by build_place)
        for i in range(1, len(self.links_from_752)):
            place = {"country": "xx", "identifiedBy": self.links_from_752[i]}
            places.append(place)

        if places:
            publication["place"] = places
        result["provisionActivity"] = [publication]

        if self.date_type_from_008 in ["q", "n"]:
            result["provisionActivity"][0]["note"] = "Date(s) uncertain or unknown"
        start_date = make_year(self.date1_from_008)
        if not start_date or start_date > 2050:
            error_print(
                "WARNING START DATE 008:",
                self.bib_id,
                self.rero_id,
                f'"{self.date1_from_008}"',
            )
            start_date = 2050
            result["provisionActivity"][0][
                "note"
            ] = "Date not available and automatically set to 2050"
        result["provisionActivity"][0]["startDate"] = start_date
        if end_date := make_year(self.date2_from_008):
            if end_date > 2050:
                error_print(
                    "WARNING END DATE 008:",
                    self.bib_id,
                    self.rero_id,
                    f'"{self.date2_from_008}"',
                )
            else:
                result["provisionActivity"][0]["endDate"] = end_date
        if original_date := make_year(self.original_date_from_008):
            if original_date > 2050:
                error_print(
                    "WARNING ORIGINAL DATE 008:",
                    self.bib_id,
                    self.rero_id,
                    f'"{self.original_date_from_008}"',
                )
            else:
                result["provisionActivity"][0]["original_date"] = original_date

    def get_fields(self, tag=None):
        """Get all fields having the given tag value."""
        fields = []
        items = get_field_items(self._blob_record)
        for blob_key, blob_value in items:
            tag_value = blob_key[:3]
            if (tag_value == tag) or not tag:
                field_data = {"tag": tag_value}
                if len(blob_key) == 3:  # if control field
                    field_data["data"] = blob_value.rstrip()
                else:
                    field_data["ind1"] = blob_key[3:4]
                    field_data["ind2"] = blob_key[4:5]
                    field_data["subfields"] = blob_value
                fields.append(field_data)
        return fields

    def get_control_field_data(self, field):
        """Get control fields data."""
        field_data = None
        if int(field["tag"]) < 10:
            field_data = field["data"]
        else:
            raise ValueError("control field expected (tag < 01x)")
        return field_data

    def get_subfields(self, field, code=None):
        """Get all subfields having the given subfield code value."""
        if int(field["tag"]) < 10:
            raise ValueError("data field expected (tag >= 01x)")
        items = get_field_items(field.get("subfields", {}))
        return [
            subfield_data
            for subfield_code, subfield_data in items
            if (subfield_code == code) or not code
        ]

    def build_value_with_alternate_graphic(
        self, tag, code, label, index, link, punct=None, spaced_punct=None
    ):
        """
        Build the data structure for alternate graphical representation.

        :param tag: the marc field tag
        :param code: contains the subfield code. Used for debug only
        :param label: the subfield data value
        :param index: the subfield index position in the field
        :param link: the link code to the alternate graphic field 880
        :param punct: punctuation chars to remove i.e. ',.'
        :param spaced_punct: punctuation chars preceded by a space to remove
        :return: a list of 1 value, or 2 values if an alternate graphic exists
        :rtype: list

        Example of return value:
        [
            {
                "value": "B.I. Bursov"
            },
            {
                "value": "Б.И. Бурсов",
                "language": "rus-cyrl"
            }
        ]
        """

        def clean_punctuation(value, punct, spaced_punct):
            return remove_trailing_punctuation(
                value, punctuation=punct, spaced_punctuation=spaced_punct
            )

        # build_value_with_alternate_graphic starts here

        data = []
        value = clean_punctuation(label, punct, spaced_punct).strip()
        if value:
            value = remove_special_characters(value)
            data = [{"value": value}]
        else:
            error_print(
                "WARNING NO VALUE:", self.bib_id, self.rero_id, tag, code, label
            )
        with contextlib.suppress(Exception):
            alt_gr = self.alternate_graphic[tag][link]
            subfield = self.get_subfields(alt_gr["field"])[index]
            value = clean_punctuation(subfield, punct, spaced_punct)
            if value:
                data.append(
                    {
                        "value": value,
                        "language": self.get_language_script(alt_gr["script"]),
                    }
                )
        return data or None

    def extract_description_from_marc_field(self, key, value, data):
        """Extract the physical descriptions data from marc field data.

        This function automatically selects the subfield codes according to
        the Marc21 or Unimarc format. The extracted data are:
        - productionMethod
        - extent
        - bookFormat
        - dimensions
        - physical_detail
        - colorContent
        - duration
        - illustrativeContent
        - otherPhysicalDetails and accompanyingMaterial note

        :param key: the field tag and indicators
        :type key: str
        :param value: the subfields data
        :type value: object
        :param data: the object data on which the extracted data will be added
        :type data: object
        """
        # extract production_method from extent and physical_details
        extent_and_physical_detail_data = []
        extent = []
        physical_details = []
        physical_details_str = ""
        if value.get("a"):
            extent = utils.force_list(value.get("a", []))[0]
            extent_and_physical_detail_data.append(extent)
            data["extent"] = remove_trailing_punctuation(
                data=extent, punctuation=":;", spaced_punctuation=":;"
            )
            if not data["extent"]:
                data.pop("extent")
            # extract the duration
            circa_env = r"\s*(ca\.?|env\.?)?\s*\d+"
            hour_min = r"(h|St(d|\.|u)|[mM]in)"
            regexp = re.compile(
                rf"(\((\[?{circa_env}\]?\s*{hour_min}.*?)\))|"
                rf"(\[({circa_env}\s*{hour_min}.*?)\])",
                re.IGNORECASE,
            )
            match = regexp.search(extent)
            if match and match.group(1):
                duration = match.group(1).strip("()")
                add_data_and_sort_list("duration", [duration], data)

        subfield_code = self.extract_description_subfield["physical_detail"]
        for physical_detail in utils.force_list(value.get(subfield_code, [])):
            physical_detail = remove_trailing_punctuation(physical_detail, ":;", ":;")
            physical_details.append(physical_detail)
            extent_and_physical_detail_data.append(physical_detail)
            # to avoid an empty note after removing punctuation
            if physical_detail:
                add_note(
                    dict(noteType="otherPhysicalDetails", label=physical_detail), data
                )

        physical_details_str = "|".join(physical_details)
        extent_and_physical_detail_str = "|".join(extent_and_physical_detail_data)

        color_content_set = set()
        for key in _COLOR_CONTENT_REGEXP:
            regexp = _COLOR_CONTENT_REGEXP[key]
            if regexp.search(physical_details_str):
                color_content_set.add(key)
        add_data_and_sort_list("colorContent", color_content_set, data)

        production_method_set = set()
        for key in _PRODUCTION_METHOD_FROM_EXTENT_AND_PHYSICAL_DETAILS:
            regexp = _PRODUCTION_METHOD_FROM_EXTENT_AND_PHYSICAL_DETAILS[key]
            if regexp.search(extent_and_physical_detail_str):
                production_method_set.add(key)

        # build illustrativeContent data
        # remove 'couv. ill' and the extra '|' resulting from the removal
        physical_detail_ill_str = re.sub(r"couv\. ill", "", physical_details_str)
        physical_detail_ill_str = re.sub(r"\|\||^\||\|$", "", physical_detail_ill_str)

        illustration_set = set()
        for key in _ILLUSTRATIVE_CONTENT_REGEXP:
            regexp = _ILLUSTRATIVE_CONTENT_REGEXP[key]
            if regexp.search(physical_detail_ill_str):
                illustration_set.add(key)
        add_data_and_sort_list("illustrativeContent", illustration_set, data)

        # remove 'rdapm:1005' if a more specific production_method exists
        if "rdapm:1005" in production_method_set:
            del_set = set(("rdapm:1009", "rdapm:1012", "rdapm:1014", "rdapm:1016"))
            if production_method_set.intersection(del_set):
                production_method_set.remove("rdapm:1005")

        # extract production_method from physical_details only
        if re.search(
            r"impr|druck|print|offset|s[ée]riegr", physical_details_str, re.IGNORECASE
        ):
            production_method_set.add("rdapm:1010")

        # build productionMethod data
        add_data_and_sort_list("productionMethod", production_method_set, data)

        # extract book_format from $c
        book_formats = []
        tool = BookFormatExtraction()
        subfield_code = self.extract_description_subfield["book_format"]
        for dimension in utils.force_list(value.get(subfield_code, [])):
            formats = tool.extract_book_formats_from(dimension)
            for book_format in formats:
                book_formats.append(book_format)
            dim = remove_trailing_punctuation(
                data=dimension.rstrip(), punctuation="+,:;&."
            )
            if dim:
                add_data_and_sort_list("dimensions", utils.force_list(dim), data)
        add_data_and_sort_list("bookFormat", book_formats, data)

        # extract accompanyingMaterial note from $e
        if value.get("e"):
            material_notes = []
            if isinstance(self, ReroIlsMarc21Overdo):
                material_note = utils.force_list(value.get("e", []))[0]
                material_notes = material_note.split("+")
            elif isinstance(self, ReroIlsUnimarcOverdo):
                material_notes = utils.force_list(value.get("e", []))
            for material_note in material_notes:
                if material_note:
                    add_note(
                        dict(
                            noteType="accompanyingMaterial", label=material_note.strip()
                        ),
                        data,
                    )

    def extract_series_statement_from_marc_field(self, key, value, data):
        """Extract the seriesStatement data from marc field data.

        This function automatically selects the subfield codes according to
        the field tag in the Marc21 or Unimarc format. The extracted data are:
        - seriesTitle
        - seriesEnumeration

        :param key: the field tag and indicators
        :type key: str
        :param value: the subfields data
        :type value: object
        :param data: the object data on which the extracted data will be added
        :type data: object
        """
        # extract the series title and enumeration subfields, together with
        # their alternate graphic representations
        tag_link, link = get_field_link_data(value)
        items = get_field_items(value)
        index = 1
        series = {}
        subseries = []
        count = 0
        tag = key[:3]
        series_title_subfield_code = self.extract_series_statement_subfield[tag][
            "series_title"
        ]
        series_enumeration_subfield_code = self.extract_series_statement_subfield[tag][
            "series_enumeration"
        ]
        subfield_selection = {
            series_title_subfield_code,
            series_enumeration_subfield_code,
        }
        subfield_visited = ""
        for blob_key, blob_value in items:
            if blob_key in subfield_selection:
                subfield_visited += blob_key
                value_data = self.build_value_with_alternate_graphic(
                    tag, blob_key, blob_value, index, link, ",.", ":;/-="
                )
                if blob_key == series_title_subfield_code:
                    count += 1
                    if count == 1:
                        series["seriesTitle"] = value_data
                    else:
                        subseries.append({"subseriesTitle": value_data})
                elif blob_key == series_enumeration_subfield_code:
                    if count == 1:
                        if "seriesEnumeration" in series:
                            series["seriesEnumeration"] = join_alternate_graphic_data(
                                alt_gr_1=series["seriesEnumeration"],
                                alt_gr_2=value_data,
                                join_str=", ",
                            )
                        else:
                            series["seriesEnumeration"] = value_data
                    elif count > 1:
                        if "subseriesEnumeration" in subseries[count - 2]:
                            alt_gr_1 = subseries[count - 2]["subseriesEnumeration"]
                            subseries[count - 2]["subseriesEnumeration"] = (
                                join_alternate_graphic_data(
                                    alt_gr_1=alt_gr_1,
                                    alt_gr_2=value_data,
                                    join_str=", ",
                                )
                            )
                        else:
                            subseries[count - 2]["subseriesEnumeration"] = value_data
            if blob_key != "__order__":
                index += 1

        error_msg = ""
        regexp = re.compile(rf"^[^{series_title_subfield_code}]")
        if regexp.search(subfield_visited):
            error_msg = (
                f"missing leading subfield ${series_title_subfield_code} "
                f"in field {tag}"
            )
            error_print("ERROR BAD FIELD FORMAT:", self.bib_id, self.rero_id, error_msg)
        else:
            if subseries:
                series["subseriesStatement"] = subseries
            series_statement = data.get("seriesStatement", [])
            if series:
                series_statement.append(series)
                data["seriesStatement"] = series_statement


class ReroIlsMarc21Overdo(ReroIlsOverdo):
    """Specialized Overdo for Marc21.

    This class adds RERO Marc21 properties and functions to the ReroIlsOverdo.
    """

    bib_id = ""
    field_008_data = ""
    lang_from_008 = None
    date1_from_008 = None
    date2_from_008 = None
    original_date_from_008 = None
    date = {"start_date": None}
    date_type_from_008 = ""
    serial_type = ""  # 008 pos 21
    langs_from_041_a = []
    langs_from_041_h = []
    alternate_graphic = {}
    is_top_level_record = False  # has 019 $a Niveau supérieur
    has_field_490 = False
    has_field_580 = False
    content_media_carrier_type = None
    links_from_752 = []

    def __init__(self, bases=None, entry_point_group=None):
        """ReroIlsMarc21Overdo init."""
        super().__init__(bases=bases, entry_point_group=entry_point_group)
        self.count = 0
        self.extract_description_subfield = {"physical_detail": "b", "book_format": "c"}
        self.extract_series_statement_subfield = {
            "490": {"series_title": "a", "series_enumeration": "v"},
            "773": {"series_title": "t", "series_enumeration": "g"},
            "800": {"series_title": "t", "series_enumeration": "v"},
            "830": {"series_title": "a", "series_enumeration": "v"},
        }

    def do(self, blob, ignore_missing=True, exception_handlers=None):
        """Translate blob values and instantiate new model instance."""
        self.count += 1
        result = None
        try:
            # extract record leader
            self._blob_record = blob
            self.leader = blob.get("leader", "")
            try:
                self.bib_id = self.get_fields(tag="001")[0]["data"]
            except Exception:
                self.bib_id = "???"
            try:
                fields_035 = self.get_fields(tag="035")
                self.rero_id = self.get_subfields(fields_035[0], "a")[0]
            except Exception:
                self.rero_id = "???"
            self.field_008_data = ""
            self.date1_from_008 = None
            self.date2_from_008 = None
            self.original_date_from_008 = None
            self.date_type_from_008 = ""
            self.date = {"start_date": None}
            self.serial_type = ""
            self.is_top_level_record = False
            fields_008 = self.get_fields(tag="008")
            if fields_008:
                self.field_008_data = self.get_control_field_data(
                    fields_008[0]
                ).rstrip()
                try:
                    self.serial_type = self.field_008_data[21]
                except Exception as err:
                    error_print("ERROR SERIAL TYPE:", self.bib_id, self.rero_id, err)
                self.date1_from_008 = self.field_008_data[7:11]
                self.date2_from_008 = self.field_008_data[11:15]
                self.date_type_from_008 = self.field_008_data[6]
                if self.date_type_from_008 == "r":
                    self.original_date_from_008 = self.date2_from_008
            self.admin_meta_data = {}

            enc_level = ""
            if self.leader:
                enc_level = self.leader[17]  # LDR 17
            if enc_level in _ENCODING_LEVEL_MAPPING:
                encoding_level = _ENCODING_LEVEL_MAPPING[enc_level]
            else:
                encoding_level = _ENCODING_LEVEL_MAPPING["u"]
            self.admin_meta_data["encodingLevel"] = encoding_level

            self.init_lang()
            self.init_country()
            self.init_alternate_graphic()
            self.init_date()
            self.init_content_media_carrier_type()

            # get notes from 019 $a or $b and
            # identify a top level record (has 019 $a Niveau supérieur)
            regexp = re.compile(r"Niveau sup[eé]rieur", re.IGNORECASE)
            fields_019 = self.get_fields(tag="019")
            notes_from_019_and_351 = []
            for field_019 in fields_019:
                note = ""
                for subfield_a in self.get_subfields(field_019, "a"):
                    note += " | " + subfield_a
                    if regexp.search(subfield_a):
                        self.is_top_level_record = True
                for subfield_b in self.get_subfields(field_019, "b"):
                    note += " | " + subfield_b
                for subfield_9 in self.get_subfields(field_019, "9"):
                    note += " (" + subfield_9 + ")"
                    break
                if note:
                    notes_from_019_and_351.append(note[3:])

            fields_351 = self.get_fields(tag="351")
            for field_351 in fields_351:
                note = " | ".join(self.get_subfields(field_351, "c"))
                if note:
                    notes_from_019_and_351.append(note)

            if notes_from_019_and_351:
                self.admin_meta_data["note"] = notes_from_019_and_351

            fields_040 = self.get_fields(tag="040")
            for field_040 in fields_040:
                for subfield_a in self.get_subfields(field_040, "a"):
                    self.admin_meta_data["source"] = subfield_a
                for subfield_b in self.get_subfields(field_040, "b"):
                    if subfield_b in _LANGUAGES:
                        self.admin_meta_data["descriptionLanguage"] = subfield_b
                    else:
                        error_print(
                            "WARNING NOT A LANGUAGE 040:",
                            self.bib_id,
                            self.rero_id,
                            subfield_b,
                        )
                description_modifier = []
                for subfield_d in self.get_subfields(field_040, "d"):
                    description_modifier.append(subfield_d)
                if description_modifier:
                    self.admin_meta_data["descriptionModifier"] = description_modifier
                description_conventions = []
                for subfield_e in self.get_subfields(field_040, "e"):
                    description_conventions.append(subfield_e)
                if description_conventions:
                    self.admin_meta_data["descriptionConventions"] = (
                        description_conventions
                    )

            # build the list of links from field 752
            self.links_from_752 = []
            fields_752 = self.get_fields(tag="752")
            for field_752 in fields_752:
                subfields_d = self.get_subfields(field_752, "d")
                if subfields_d:
                    identifier = build_identifier(field_752["subfields"])
                    if identifier:
                        self.links_from_752.append(identifier)

            # check presence of specific fields
            self.has_field_490 = len(self.get_fields(tag="490")) > 0
            self.has_field_580 = len(self.get_fields(tag="580")) > 0
            result = super().do(
                blob,
                ignore_missing=ignore_missing,
                exception_handlers=exception_handlers,
            )
        except Exception as err:
            error_print(
                "ERROR DO:",
                self.bib_id,
                self.rero_id,
                self.count,
                f"{err} {traceback.format_exception_only(type(err), err)}",
            )
            traceback.print_exc()
            raise Exception(err)
        return result

    def get_link_data(self, subfields_6_data):
        """Extract link and script data from subfields $6 data."""
        link = None
        script_code = "latn"
        script_dir = ""
        tag, extra_data = subfields_6_data.split("-")
        if extra_data:
            link_and_script_data = extra_data.split("/")
            link = link_and_script_data[0]
            try:
                script_code = link_and_script_data[1]
            except Exception:
                script_code = "latn"
            try:
                script_dir = link_and_script_data[2]
            except Exception:
                script_dir = ""
        return tag, link, script_code, script_dir
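
    # Examples (derived from the code above, hypothetical $6 values):
    # self.get_link_data("880-05/(2/r") -> ("880", "05", "(2", "r")
    # self.get_link_data("880-01") -> ("880", "01", "latn", "")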

    def init_country(self):
        """Initialize country (008 and 044)."""
        self.country = None
        self.cantons = []
        if fields_044 := self.get_fields(tag="044"):
            field_044 = fields_044[0]
            for cantons_code in self.get_subfields(field_044, "c"):
                try:
                    if canton := cantons_code.split("-")[1].strip():
                        if canton in _CANTON:
                            self.cantons.append(canton)
                        else:
                            error_print(
                                "WARNING INIT CANTONS:",
                                self.bib_id,
                                self.rero_id,
                                f'"{cantons_code}"',
                            )
                except Exception:
                    error_print(
                        "WARNING INIT CANTONS:",
                        self.bib_id,
                        self.rero_id,
                        f'"{cantons_code}"',
                    )
            if self.cantons:
                self.country = "sz"
        # We did not find a country in 044, try 008.
        if not self.country:
            with contextlib.suppress(Exception):
                self.country = self.field_008_data[15:17].rstrip().lower()
        # Use the equivalent if the country code is obsolete.
        if self.country in _OBSOLETE_COUNTRIES_MAPPING:
            self.country = _OBSOLETE_COUNTRIES_MAPPING[self.country]
        # We did not find a valid country, set it to 'xx'.
        if self.country not in _COUNTRIES:
            if self.country not in ["", "||"]:
                error_print(
                    "WARNING NOT A COUNTRY:",
                    self.bib_id,
                    self.rero_id,
                    f'"{self.country}"',
                )
            self.country = "xx"

1341
    def init_lang(self):
1✔
1342
        """Initialization languages (008 and 041)."""
1343

1344
        def init_lang_from(fields_041, code):
1✔
1345
            """Construct list of language codes from data."""
1346
            langs_from_041 = []
1✔
1347
            for field_041 in fields_041:
1✔
1348
                lang_codes = self.get_subfields(field_041, code)
1✔
1349
                for lang_from_041 in lang_codes:
1✔
1350
                    if lang_from_041 not in langs_from_041:
1✔
1351
                        if lang_from_041 in _LANGUAGES:
1✔
1352
                            langs_from_041.append(lang_from_041)
1✔
1353
                        else:
1354
                            error_print(
×
1355
                                "WARNING NOT A LANGUAGE 041:",
1356
                                self.bib_id,
1357
                                self.rero_id,
1358
                                f'${code} "{lang_from_041}"',
1359
                            )
1360
            return langs_from_041
1✔
1361

1362
        self.lang_from_008 = None
1✔
1363
        self.langs_from_041_a = []
1✔
1364
        self.langs_from_041_h = []
1✔
1365
        try:
1✔
1366
            self.lang_from_008 = self.field_008_data[35:38]
1✔
1367
            if self.lang_from_008 not in _LANGUAGES:
1✔
1368
                error_print(
1✔
1369
                    "WARNING NOT A LANGUAGE 008:",
1370
                    self.bib_id,
1371
                    self.rero_id,
1372
                    f'"{self.lang_from_008}"',
1373
                )
1374
                self.lang_from_008 = "und"
1✔
1375
        except Exception:
×
1376
            self.lang_from_008 = "und"
×
1377
            error_print(
×
1378
                "WARNING NOT A LANGUAGE 008:",
1379
                self.bib_id,
1380
                self.rero_id,
1381
                f'"{self.field_008_data}"',
1382
            )
1383

1384
        fields_041 = self.get_fields(tag="041")
1✔
1385
        self.langs_from_041_a = init_lang_from(fields_041, code="a")
1✔
1386
        self.langs_from_041_h = init_lang_from(fields_041, code="h")
1✔
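
    # Illustrative sketch (hypothetical record): with "fre" in 008 positions
    # 35-37 and a 041 carrying $a "ger" and $a "ita", this yields
    # lang_from_008 == "fre" and langs_from_041_a == ["ger", "ita"],
    # assuming all three codes are in _LANGUAGES.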

    def init_date(self):
        """Initialize the start and end dates.

        1. get dates from 008
        2. get dates from 264 ind2 1,0,2,4,3 $c
        3. get dates from 773 $g
        4. otherwise set start_date to 2050
        """
        if self.date_type_from_008 in ["q", "n"]:
            self.date["note"] = "Date(s) uncertain or unknown"
        start_date = make_year(self.date1_from_008)
        if not (start_date and start_date >= -9999 and start_date <= 2050):
            start_date = None
        if not start_date:
            fields_264 = self.get_fields("264")
            for ind2 in ["1", "0", "2", "4", "3"]:
                for field_264 in fields_264:
                    if ind2 == field_264["ind2"]:
                        if subfields_c := self.get_subfields(field_264, "c"):
                            year = re.search(r"(-?\d{1,4})", subfields_c[0])
                            if year:
                                year = int(year.group(0))
                            if year and year >= -9999 and year <= 2050:
                                start_date = year
                                break
                else:
                    # Continue if the inner loop wasn't broken.
                    continue
                # Inner loop was broken, break the outer.
                break
        if not start_date:
            fields_773 = self.get_fields("773")
            for field_773 in fields_773:
                if subfields_g := self.get_subfields(field_773, "g"):
                    year = re.search(r"(-?\d{4})", subfields_g[0])
                    if year:
                        year = int(year.group(0))
                    if year and year >= -9999 and year <= 2050:
                        start_date = year
        if not start_date:
            start_date = 2050
            self.date["note"] = "Date not available and automatically set to 2050"
            error_print(
                "INFO NO START DATE IN 264, 773, 008:",
                self.bib_id,
                self.rero_id,
            )
        self.date["start_date"] = start_date

        end_date = make_year(self.date2_from_008)
        if end_date and end_date >= -9999 and end_date <= 2050:
            self.date["end_date"] = end_date
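
    # Illustrative sketch (hypothetical record): with no usable date in 008,
    # a 264 with ind2 "1" and $c "cop. 2019" yields
    # self.date["start_date"] == 2019; if 008, 264 and 773 all fail, the
    # start date falls back to 2050 and a note is recorded.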

    def init_alternate_graphic(self):
        """Initialize the alternate graphic representation.

        Parse all the 880 fields and populate a dictionary having as first
        level keys the tag of the linked_data field and as second level key
        the link code (from $6) of the linked_data field. The language script
        is extracted from $6 and used to qualify the alternate graphic value.
        """

        def get_script_from_lang(asian=False):
            """Get the script code from the 008 and 041 language codes."""
            default_script = "zyyy"
            script_per_lang = _SCRIPT_PER_LANG_NOT_ASIA
            if asian:
                default_script = "hani"
                script_per_lang = _SCRIPT_PER_LANG_ASIA
            script = script_per_lang.get(self.lang_from_008)
            if not script:
                for lang in self.langs_from_041_a:
                    if lang in script_per_lang:
                        script = script_per_lang[lang]
                        break
                if not script:
                    script = default_script
            return script

        # The body of init_alternate_graphic starts here.
        self.alternate_graphic = {}
        fields_880 = self.get_fields(tag="880")
        for field_880 in fields_880:
            try:
                subfields_6 = self.get_subfields(field_880, code="6")
                for subfield_6 in subfields_6:
                    tag, link, script_code, script_dir = self.get_link_data(subfield_6)
                    tag_data = self.alternate_graphic.get(tag, {})
                    link_data = tag_data.get(link, {})
                    if script_code == "$1":
                        script = get_script_from_lang(asian=True)
                    elif script_code in _SCRIPT_PER_CODE:
                        script = _SCRIPT_PER_CODE[script_code]
                    else:
                        script = get_script_from_lang()
                    link_data["script"] = script
                    link_data["field"] = field_880
                    if script_dir == "r":
                        link_data["right_to_left"] = True
                    tag_data[link] = link_data
                    self.alternate_graphic[tag] = tag_data
            except Exception as error:
                click.secho(f"Error in init_alternate_graphic: {error}", fg="red")
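
    # Illustrative sketch (hypothetical record): an 880 with $6 "245-01/$1"
    # on a record whose 008 language is "jpn" ends up as
    # self.alternate_graphic["245"]["01"] == {"script": "jpan", "field": ...},
    # because "$1" triggers the Asian script lookup in _SCRIPT_PER_LANG_ASIA.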

    def get_language_script(self, script_code):
        """Build the `language-script` code.

        This code is built according to the format
        <lang_code>-<script_code>, for example: chi-hani;
        the <lang_code> is retrieved from fields 008 and 041,
        the <script_code> is received as parameter.

        :param script_code: the script code
        :type script_code: str
        :return: language script code in the format `<lang_code>-<script_code>`
        :rtype: str
        """
        if script_code in _LANGUAGES_SCRIPTS:
            languages = (
                [self.lang_from_008] + self.langs_from_041_a + self.langs_from_041_h
            )
            for lang in languages:
                if lang in _LANGUAGES_SCRIPTS[script_code]:
                    return "-".join([lang, script_code])
            error_print(
                "WARNING LANGUAGE SCRIPTS:",
                self.bib_id,
                self.rero_id,
                script_code,
                "008:",
                f'"{self.lang_from_008}"',
                "041$a:",
                self.langs_from_041_a,
                "041$h:",
                self.langs_from_041_h,
            )
        return "-".join(["und", script_code])
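
    # Illustrative sketch: with lang_from_008 == "rus",
    # get_language_script("cyrl") returns "rus-cyrl"; if none of the record's
    # languages match the script, "und-cyrl" is returned after a warning.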

    def build_variant_title_data(self, string_set):
        """Build variant title data from fields 246.

        :param string_set: a set of title strings used to discard variant
            titles that duplicate existing ones
        :type string_set: set
        :return: a list of variant_title objects
        :rtype: list
        """
        variant_list = []
        fields_246 = self.get_fields(tag="246")
        for field_246 in fields_246:
            variant_data = {}
            subfield_246_a = ""
            if subfields_246_a := self.get_subfields(field_246, "a"):
                subfield_246_a = subfields_246_a[0]
            subfield_246_a_cleaned = remove_trailing_punctuation(
                subfield_246_a, ",.", ":;/-="
            )
            if subfield_246_a_cleaned not in string_set:
                # parse all subfields in order
                index = 1
                items = get_field_items(field_246["subfields"])
                tag_link, link = get_field_link_data(field_246)
                part_list = TitlePartList(part_number_code="n", part_name_code="p")

                subfield_selection = {"a", "n", "p"}
                for blob_key, blob_value in items:
                    if blob_key in subfield_selection:
                        if blob_key == "a":
                            subfield_a_parts = blob_value.split(":")
                            part_index = 0
                            for subfield_a_part in subfield_a_parts:
                                value_data = self.build_value_with_alternate_graphic(
                                    "246",
                                    blob_key,
                                    subfield_a_part,
                                    index,
                                    link,
                                    ",.",
                                    ":;/-=",
                                )
                                if value_data:
                                    if part_index == 0:
                                        variant_data["type"] = "bf:VariantTitle"
                                        variant_data["mainTitle"] = value_data
                                    else:
                                        variant_data["subtitle"] = value_data
                                    part_index += 1
                        elif blob_key in ["n", "p"]:
                            value_data = self.build_value_with_alternate_graphic(
                                "246",
                                blob_key,
                                blob_value,
                                index,
                                link,
                                ",.",
                                ":;/-=",
                            )
                            if value_data:
                                part_list.update_part(value_data, blob_key, blob_value)
                    if blob_key != "__order__":
                        index += 1
                if the_part_list := part_list.get_part_list():
                    variant_data["part"] = the_part_list
                if variant_data:
                    variant_list.append(variant_data)
        return variant_list
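
    # Illustrative sketch (hypothetical field): a 246 with $a "Variant title :
    # a subtitle" that is not in `string_set` yields an object like
    # {"type": "bf:VariantTitle", "mainTitle": [...], "subtitle": [...]},
    # the values being built by build_value_with_alternate_graphic.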

    def init_content_media_carrier_type(self):
        """Initialize the content/media/carrier types (336, 337 and 338)."""
        content_media_carrier_type_per_tag = {
            "336": "contentType",
            "337": "mediaType",
            "338": "carrierType",
        }
        content_media_carrier_map_per_tag = {
            "336": _CONTENT_TYPE_MAPPING,
            "337": _MEDIA_TYPE_MAPPING,
            "338": _CARRIER_TYPE_MAPPING,
        }

        content_media_carrier_type = {}
        media_type_from_unlinked_337 = ""
        for tag in ["336", "337", "338"]:  # parse the tags in the right order
            type_key = content_media_carrier_type_per_tag[tag]
            fields = self.get_fields(tag=tag)
            for field in fields:
                subfields_8 = self.get_subfields(field, "8") or ["0"]
                for subfield_b in self.get_subfields(field, "b"):
                    type_found = False
                    for link in subfields_8:
                        linked_data = content_media_carrier_type.get(link, {})
                        if tag == "336":
                            linked_data_type_value = linked_data.get(type_key, [])
                            type_value = content_media_carrier_map_per_tag[tag].get(
                                subfield_b, None
                            )
                            if type_value and type_value not in linked_data_type_value:
                                linked_data_type_value.append(type_value)
                                linked_data[type_key] = linked_data_type_value
                                type_found = True
                        else:
                            if link == "0" and tag == "337":
                                media_type_from_unlinked_337 = (
                                    content_media_carrier_map_per_tag[tag].get(
                                        subfield_b, None
                                    )
                                )
                            linked_data_type_value = linked_data.get(type_key, "")
                            if type_value := content_media_carrier_map_per_tag[tag].get(
                                subfield_b, None
                            ):
                                linked_data_type_value = type_value
                                linked_data[type_key] = linked_data_type_value
                                type_found = True
                                if tag == "338":
                                    media_type_from_338 = _MEDIA_TYPE_MAPPING.get(
                                        subfield_b[0]
                                    )
                                    if media_type_from_338:
                                        linked_data["mediaTypeFrom338"] = (
                                            media_type_from_338
                                        )
                        if type_found:
                            content_media_carrier_type[link] = linked_data
                    break  # subfield $b is not repeatable
        self.content_media_carrier_type = []
        for link, value in content_media_carrier_type.items():
            media_type = value.get("mediaType", None)
            media_type_from_338 = value.get("mediaTypeFrom338", None)
            # Set mediaType from 338 if it was not found in 337.
            if media_type_from_338:
                if not media_type:
                    value["mediaType"] = media_type_from_338
                elif media_type_from_338 != media_type:
                    value["mediaType"] = media_type_from_338
                    error_print(
                        "WARNING MEDIA TYPE:", self.bib_id, self.rero_id, media_type
                    )
            value.pop("mediaTypeFrom338", None)
            if "contentType" in value:
                if media_type_from_unlinked_337 and "mediaType" not in value:
                    value["mediaType"] = media_type_from_unlinked_337
                self.content_media_carrier_type.append(value)
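
    # Illustrative sketch (hypothetical values; the actual results depend on
    # _CONTENT_TYPE_MAPPING, _MEDIA_TYPE_MAPPING and _CARRIER_TYPE_MAPPING):
    # 336 $b "txt" $8 "1", 337 $b "n" $8 "1" and 338 $b "nc" $8 "1" would be
    # merged under link "1" into a single object carrying contentType,
    # mediaType and carrierType.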


class ReroIlsUnimarcOverdo(ReroIlsOverdo):
    """Specialized Overdo for UNIMARC.

    This class adds UNIMARC properties and functions to the ReroIlsOverdo.
    """

    bib_id = ""
    rero_id = "unimarc"
    lang_from_101 = None
    alternate_graphic = {}
    serial_type = ""

    def __init__(self, bases=None, entry_point_group=None):
        """Constructor."""
        super().__init__(bases=bases, entry_point_group=entry_point_group)
        self.count = 0
        self.extract_description_subfield = {
            "physical_detail": "c",
            "book_format": "d",
        }
        self.extract_series_statement_subfield = {
            "225": {"series_title": "a", "series_enumeration": "v"}
        }

    def do(self, blob, ignore_missing=True, exception_handlers=None):
        """Translate blob values and instantiate a new model instance."""
        self.count += 1
        result = None
        try:
            self._blob_record = blob
            try:
                self.bib_id = self.get_fields(tag="001")[0]["data"]
            except Exception:
                self.bib_id = "???"

            if fields_101 := self.get_fields(tag="101"):
                field_101_a = self.get_subfields(fields_101[0], "a")
                field_101_g = self.get_subfields(fields_101[0], "g")
                if field_101_a:
                    self.lang_from_101 = field_101_a[0]
                if field_101_g:
                    self.lang_from_101 = field_101_g[0]

            if fields_110 := self.get_fields(tag="110"):
                field_110_a = self.get_subfields(fields_110[0], "a")
                if field_110_a and len(field_110_a[0]) > 0:
                    self.serial_type = field_110_a[0][0]

            enc_level = self.leader[17] if self.leader else ""
            encoding_level = (
                _ENCODING_LEVEL_MAPPING[enc_level]
                if enc_level in _ENCODING_LEVEL_MAPPING
                else _ENCODING_LEVEL_MAPPING["u"]
            )
            self.admin_meta_data = {"encodingLevel": encoding_level}
            result = super().do(
                blob,
                ignore_missing=ignore_missing,
                exception_handlers=exception_handlers,
            )
        except Exception as err:
            error_print("ERROR:", self.bib_id, self.rero_id, self.count, err)
            traceback.print_exc()
        return result
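
    # Note: when field 101 carries both $a and $g, the value of $g (language
    # of the title proper) wins for self.lang_from_101, since it is read last.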

    def default_provision_activity(self, result):
        """Default provision activity."""

    def get_language_script(self, unimarc_script_code):
        """Build the `language-script` code.

        This code is built according to the format
        <lang_code>-<script_code>, for example: chi-hani;
        the <lang_code> is retrieved from field 101,
        the <script_code> is built from the given <unimarc_script_code>.

        :param unimarc_script_code: the UNIMARC script code
        :type unimarc_script_code: str
        :return: language script code in the format `<lang_code>-<script_code>`
        :rtype: str
        """
        # Default to "zyyy" (other) so the final return is always defined,
        # even for unknown UNIMARC script codes.
        script_code = "zyyy"
        if unimarc_script_code in _UNIMARC_LANGUAGES_SCRIPTS:
            script_code = _UNIMARC_LANGUAGES_SCRIPTS[unimarc_script_code]
            if script_code in _LANGUAGES_SCRIPTS:
                lang = self.lang_from_101
                if lang in _LANGUAGES_SCRIPTS[script_code]:
                    return "-".join([self.lang_from_101, script_code])
                error_print(
                    "WARNING LANGUAGE SCRIPTS:",
                    self.bib_id,
                    self.rero_id,
                    script_code,
                    "101 $a or $g:",
                    f'"{self.lang_from_101}"',
                )
        return "-".join(["und", script_code])
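
    # Illustrative sketch: with lang_from_101 == "per",
    # get_language_script("fa") maps "fa" to "arab" and returns "per-arab";
    # an unknown UNIMARC script code falls back to "und-zyyy".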

    def get_alt_graphic_fields(self, tag=None):
        """Get all alternate graphic fields having the given tag value.

        :param tag: the field tag to filter on (all fields if None)
        :type tag: str
        :return: a list of alternate graphic fields
        :rtype: list
        """
        fields = []
        items = get_field_items(self._blob_record)
        for blob_key, blob_value in items:
            field_data = {}
            tag_value = blob_key[:3]
            if (tag_value == tag) or not tag:
                field_data["tag"] = tag_value
                if len(blob_key) == 3:  # if control field
                    field_data["data"] = blob_value.rstrip()
                else:
                    field_data["ind1"] = blob_key[3:4]
                    field_data["ind2"] = blob_key[4:5]
                    field_data["subfields"] = blob_value
                subfields_6 = self.get_subfields(field_data, "6")
                subfields_7 = self.get_subfields(field_data, "7")
                # alternate graphic link codes start with 'a'
                if (
                    subfields_6
                    and subfields_6[0][0] == "a"
                    and subfields_7
                    and subfields_7[0] != "ba"
                ):  # ba=latin
                    tag_data = self.alternate_graphic.get(tag, {})
                    tag_data[subfields_6[0]] = {}
                    tag_data[subfields_6[0]]["field"] = field_data
                    tag_data[subfields_6[0]]["script"] = subfields_7[0]
                    self.alternate_graphic[tag] = tag_data
                else:
                    fields.append(field_data)
        return fields
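
    # Illustrative sketch (hypothetical field): a UNIMARC field with $6 "a01"
    # and $7 "ea" (Chinese characters) is routed into
    # self.alternate_graphic[tag]["a01"] with script "ea", while fields with
    # $7 "ba" (Latin) or no alternate-graphic link are returned in `fields`.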


class TitlePartList(object):
    """The purpose of this class is to build the title part list.

    The title part list is built by parsing the subfields $n, $p of fields
    245 or 246. Each title part is built from a pair of consecutive $n, $p.
    As some $n or $p can be missing, some parts are built from only a $p or
    an $n.

    To build a list of parts, you must create an instance of this class for
    a given field 245 or 246. Then you have to parse the subfields $n and $p
    in the order of appearance in the field (245 or 246) and call the
    `update_part` method for each of these subfields. At the end of the
    parsing, the method `get_part_list` provides the list of constructed
    parts.

    :param part_number_code: the specific subfield code
    :type part_number_code: char
    :param part_name_code: the specific subfield code
    :type part_name_code: char
    """

    def __init__(self, part_number_code, part_name_code):
        """Constructor method."""
        self.part_number_waiting_name = {}
        self.part_list = []
        self.part_number_code = part_number_code
        self.part_name_code = part_name_code

    def update_part(self, value_data, subfield_code, subfield_data):
        """Update part data.

        The part object is progressively built with the data collected by
        the successive calls of the method `update_part`.

        :param subfield_code: specific subfield code for part number or name
        :type subfield_code: char
        :param subfield_data: part number or name depending on `subfield_code`
        :type subfield_data: str
        """

        def remove_last_dot(value):
            """Remove the last dot from value if there are no other dots."""
            if value.count(".") == 1:
                value = value.rstrip(".")
            return value

        value_data = remove_last_dot(value_data)
        if self.part_number_waiting_name:
            if subfield_code == self.part_name_code:
                self.part_list.append(
                    dict(partNumber=self.part_number_waiting_name, partName=value_data)
                )
                self.part_number_waiting_name = {}
            else:
                self.part_list.append(dict(partNumber=self.part_number_waiting_name))
                self.part_number_waiting_name = value_data
        else:
            if subfield_code == self.part_number_code:
                self.part_number_waiting_name = value_data
            else:
                self.part_list.append(dict(partName=value_data))

    def get_part_list(self):
        """Get the part list.

        This method returns a list of part objects built by the successive
        calls of the method `update_part`. If a part with only the
        `part_number` is pending, it is appended to the list because it will
        never receive a `part_name`.

        :return: a part list
        :rtype: list
        """
        if self.part_number_waiting_name:
            self.part_list.append(dict(partNumber=self.part_number_waiting_name))
        return self.part_list
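
    # Minimal usage sketch (hypothetical values; `value_data` is normally the
    # list built by build_value_with_alternate_graphic):
    #
    #     part_list = TitlePartList(part_number_code="n", part_name_code="p")
    #     part_list.update_part([{"value": "Part 1"}], "n", "Part 1")
    #     part_list.update_part([{"value": "Introduction"}], "p", "Introduction")
    #     part_list.get_part_list()
    #     # -> [{'partNumber': [{'value': 'Part 1'}],
    #     #      'partName': [{'value': 'Introduction'}]}]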


def extract_subtitle_and_parallel_titles_from_field_245_b(
    parallel_title_data, field_245_a_end_with_equal
):
    """Extract the subtitle and the parallel titles from field 245 $b.

    This function retrieves the subtitle and the parallel title list
    from field 245 $b. It also constructs a set containing the parallel
    titles and the same titles without their initial article.
    This set can be used when processing the 246 fields to discard those
    fields that match one of the elements in the set.
    It should be noted that the deletion of the article is achieved simply
    by deleting the first word (max 4 chars) of the parallel title.
    It is not a problem if this word is not a real article, because the
    string produced will not correspond to one of the 246 fields to be
    discarded.

    :param parallel_title_data: list of parallel_title objects
    :param field_245_a_end_with_equal: boolean flag
    :return: a tuple containing
        - the main subtitle
        - a list of parallel titles
        - a set of parallel title strings
    :rtype: tuple
    """

    def remove_leading_article(string, max_article_len=4):
        """Remove the leading article.

        - Any leading word up to `max_article_len` chars is considered
          as an article to remove.
        - The separator is an apostrophe or a space.

        :param string: the string to process
        :param max_article_len: the maximum length of the article to remove
            (default: 4)
        :return: the given string without its first word; an empty string
            if no leading word was removed.
        """
        last_rest = ""
        for sep in ("'", " "):
            first, sep, rest = string.partition(sep)
            len_rest = len(rest)
            if (
                len(first) <= max_article_len
                and len_rest > 0
                and len_rest > len(last_rest)
            ):
                last_rest = rest
        return last_rest
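
    # Illustrative behaviour of remove_leading_article:
    #     remove_leading_article("The title")  # -> "title"
    #     remove_leading_article("L'amour")    # -> "amour"
    #     remove_leading_article("Extraordinary tales")  # -> "" (first word
    #     # is longer than max_article_len, so nothing is removed)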

    data_std = ""
    data_lang = ""
    lang = ""
    main_subtitle = []
    parallel_titles = []
    parallel_title_string_set = set()
    for parallel_title_value in parallel_title_data:
        value = parallel_title_value.get("value", "")
        lang = parallel_title_value.get("language", "")
        if lang:
            data_lang = value
        else:
            data_std = value

    data_std_items = data_std.split("=")
    data_lang_items = []
    if data_lang:
        data_lang_items = data_lang.split("=")
    index = 0

    for data_std in data_std_items:
        out_data_dict = {}  # reset so a stale dict is never re-appended
        if index == 0 and not field_245_a_end_with_equal:
            if data_std.rstrip():
                main_subtitle.append({"value": data_std.rstrip()})
                if (
                    lang
                    and index < len(data_lang_items)
                    and data_lang_items[index].rstrip()
                ):
                    main_subtitle.append(
                        {"value": data_lang_items[index].rstrip(), "language": lang}
                    )
        else:
            main_title = []
            subtitle = []
            data_value = remove_trailing_punctuation(data_std.lstrip(), ",.", ":;/-=")
            parallel_title_str, sep, subtitle_str = data_value.partition(":")
            parallel_title_str = parallel_title_str.strip()
            subtitle_str = subtitle_str.strip()
            data_lang_value = ""
            parallel_title_altgr_str = ""
            subtitle_altgr_str = ""
            if parallel_title_str:
                out_data_dict = {"type": "bf:ParallelTitle"}
                main_title.append({"value": parallel_title_str})
                if lang:
                    try:
                        data_lang_value = remove_trailing_punctuation(
                            data_lang_items[index].lstrip(), ",.", ":;/-="
                        )
                    except Exception:
                        data_lang_value = "[missing data]"
                    parallel_title_altgr_str, sep, subtitle_altgr_str = (
                        data_lang_value.partition(":")
                    )
                    if parallel_title_altgr_str:
                        main_title.append(
                            {
                                "value": parallel_title_altgr_str.strip(),
                                "language": lang,
                            }
                        )
                parallel_title_without_article = remove_leading_article(
                    parallel_title_str
                )
                if parallel_title_without_article:
                    parallel_title_string_set.add(parallel_title_without_article)
                parallel_title_string_set.add(parallel_title_str)

                if subtitle_str:
                    subtitle.append({"value": subtitle_str})
                    if lang and subtitle_altgr_str:
                        subtitle.append(
                            {
                                "value": subtitle_altgr_str.strip(),
                                "language": lang,
                            }
                        )
                if main_title:
                    out_data_dict["mainTitle"] = main_title
                if subtitle:
                    out_data_dict["subtitle"] = subtitle
        index += 1
        if out_data_dict:
            parallel_titles.append(out_data_dict)
    return main_subtitle, parallel_titles, parallel_title_string_set
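
# Illustrative sketch (hypothetical input, no trailing "=" in 245 $a):
#     extract_subtitle_and_parallel_titles_from_field_245_b(
#         [{"value": "a subtitle = Titre parallèle : sous-titre"}], False)
# is expected to return roughly:
#     ([{"value": "a subtitle"}],
#      [{"type": "bf:ParallelTitle",
#        "mainTitle": [{"value": "Titre parallèle"}],
#        "subtitle": [{"value": "sous-titre"}]}],
#      {"Titre parallèle"})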


def build_responsibility_data(responsibility_data):
    """Build the responsibility data from subfield $c of field 245.

    :param responsibility_data: list of responsibility_data
    :return: a list of responsibilities
    :rtype: list
    """
    data_std = ""
    data_lang = ""
    lang = ""
    responsibilities = []
    for responsibility_value in responsibility_data:
        value = responsibility_value.get("value", "")
        lang = responsibility_value.get("language", "")
        if lang:
            data_lang = value
        else:
            data_std = value

    data_std_items = data_std.split(";")
    data_lang_items = []
    if data_lang:
        data_lang_items = data_lang.split(";")
    index = 0
    for data_std in data_std_items:
        out_data = []
        data_value = remove_trailing_punctuation(data_std.lstrip(), ",.", ":;/-=")
        if data_value:
            out_data.append({"value": data_value})
            if lang:
                try:
                    data_lang_value = remove_trailing_punctuation(
                        data_lang_items[index].lstrip(), ",.", ":;/-="
                    )
                    if not data_lang_value:
                        raise Exception("missing data")
                except Exception:
                    data_lang_value = "[missing data]"
                out_data.append({"value": data_lang_value, "language": lang})
            index += 1
            responsibilities.append(out_data)
    return responsibilities
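
# Illustrative sketch:
#     build_responsibility_data([{"value": "by Jane Doe; ill. by John Doe"}])
# is expected to return roughly
#     [[{'value': 'by Jane Doe'}], [{'value': 'ill. by John Doe'}]];
# when a language variant is present, each entry also receives a
# language-tagged value.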


def get_gnd_de_101(de_588):
    """Get the DE-101 identifier from a GND DE-588 value.

    GND documentation:
    https://www.dnb.de/DE/Service/Hilfe/Katalog/kataloghilfe.html?nn=587750
    https://services.dnb.de/sru/authorities?version=1.1
        &operation=searchRetrieve
        &query=identifier%3D{DE-588}
        &recordSchema=oai_dc

    :param de_588: DE-588 value
    :returns: DE-101 value
    """
    from rero_ils.modules.utils import requests_retry_session

    url = (
        "https://services.dnb.de/sru/authorities?version=1.1"
        f"&operation=searchRetrieve&query=identifier%3D{de_588}"
        "&recordSchema=oai_dc"
    )
    try:
        response = requests_retry_session().get(url)
        if response.status_code == requests.codes.ok:
            result = xmltodict.parse(response.text)
            with contextlib.suppress(Exception):
                return result["searchRetrieveResponse"]["records"]["record"][
                    "recordData"
                ]["dc"]["dc:identifier"]["#text"]
    except Exception as err:
        current_app.logger.warning(f"get_gnd_de_101 de_588: {de_588} | {err}")


def build_identifier(data):
    """Build identifiedBy for document_identifier-v0.0.1.json from $0.

    :param data: data to build the identifiedBy from.
    :returns: identifiedBy from $0 or None.
    """
    sources_mapping = {
        "RERO": "RERO",
        "RERO-RAMEAU": "RERO",
        "IDREF": "IdRef",
        "GND": "GND",
        "DE-101": "GND",
    }
    result = {}
    if datas_0 := utils.force_list(data.get("0")):
        has_no_de_101 = True
        # Check whether one of the $0 values carries a (DE-101) identifier.
        for data_0 in datas_0:
            if match := re_identified.match(data_0):
                with contextlib.suppress(IndexError):
                    if match.group(1).upper() == "DE-101":
                        has_no_de_101 = False
                        break
        for data_0 in datas_0:
            if match := re_identified.match(data_0):
                with contextlib.suppress(IndexError):
                    result["value"] = match.group(2)
                    source = match.group(1)
                    if identifier_type := sources_mapping.get(source.upper()):
                        result["type"] = identifier_type
                        return result
                    elif source.upper() == "DE-588" and has_no_de_101:
                        if idn := get_gnd_de_101(match.group(2)):
                            result["value"] = idn
                            result["type"] = "GND"
                            return result