• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

inspirehep / inspire-next / 12566

pending completion
12566

Pull #3460

travis-ci

web-flow
disambiguation: add signature pairs sampling

Since INSPIRE has ~3M curated signatures it would take too much time
to train on all possible pairs, so we sample 1M pairs in such a way
that they are representative of the known clusters structure.

Signed-off-by: Jacopo Notarstefano <jacopo.notarstefano@gmail.com>
Pull Request #3460: disambiguation: add signature pairs sampling

60 of 60 new or added lines in 3 files covered. (100.0%)

7576 of 9664 relevant lines covered (78.39%)

2.4 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.47
/inspirehep/modules/records/receivers.py
1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of INSPIRE.
4
# Copyright (C) 2014-2017 CERN.
5
#
6
# INSPIRE is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# INSPIRE is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
18
#
19
# In applying this license, CERN does not waive the privileges and immunities
20
# granted to it by virtue of its status as an Intergovernmental Organization
21
# or submit itself to any jurisdiction.
22

23
"""Records receivers."""
5✔
24

25
from __future__ import absolute_import, division, print_function
5✔
26

27
import uuid
5✔
28
from itertools import chain
5✔
29
from unicodedata import normalize
5✔
30

31
import six
5✔
32
from celery import Task
5✔
33
from flask import current_app
5✔
34
from flask_sqlalchemy import models_committed
5✔
35

36
from invenio_indexer.api import RecordIndexer
5✔
37
from invenio_indexer.signals import before_record_index
5✔
38
from invenio_records.api import Record
5✔
39
from invenio_records.models import RecordMetadata
5✔
40
from invenio_records.signals import (
5✔
41
    after_record_update,
42
    before_record_insert,
43
    before_record_update,
44
)
45

46
from inspire_dojson.utils import get_recid_from_ref
5✔
47
from inspire_utils.date import earliest_date
5✔
48
from inspire_utils.helpers import force_list
5✔
49
from inspire_utils.name import generate_name_variations
5✔
50
from inspire_utils.record import get_value
5✔
51
from inspirehep.modules.authors.utils import phonetic_blocks
5✔
52
from inspirehep.modules.orcid.utils import (
5✔
53
    get_push_access_tokens,
54
    get_orcids_for_push,
55
)
56

57

58
def is_hep(record):
    """Tell whether ``record`` is a Literature (HEP) record.

    A record counts as HEP when its ``$schema`` value contains ``hep.json``.

    Args:
        record: a mapping that may carry a ``$schema`` key.

    Returns:
        bool: ``True`` if ``$schema`` contains ``hep.json``; ``False``
        otherwise, including when ``$schema`` is missing or ``None``.
    """
    # Fall back to an empty string so that a missing (or null) ``$schema``
    # yields ``False`` instead of ``TypeError`` from ``'...' in None``.
    return 'hep.json' in (record.get('$schema') or '')
3✔
60

61

62
#
63
# before_record_insert & before_record_update
64
#
65

66
@before_record_insert.connect
@before_record_update.connect
def assign_phonetic_block(sender, record, *args, **kwargs):
    """Assign a phonetic block to each signature of a Literature record.

    The full names found under ``authors.full_name`` are handed to
    ``phonetic_blocks``; every author whose full name maps to a non-empty
    block gets it stored under ``signature_block``. Authors without a
    block (or without a ``full_name``) are left untouched.

    If ``phonetic_blocks`` raises, the error is logged and the record is
    left unchanged. Non-Literature records are ignored.

    Args:
        sender: the signal sender (unused).
        record: the record about to be inserted or updated; modified in place.
    """
    if not is_hep(record):
        return

    author_names = get_value(record, 'authors.full_name', default=[])

    try:
        signature_blocks = phonetic_blocks(author_names)
    except Exception as err:
        # ``%s`` rather than ``%d``: the control number may be absent (the
        # lookup returns ``None``), and ``%d`` would then make the log call
        # itself fail to format, hiding the original error.
        current_app.logger.error(
            'Cannot extract phonetic blocks for record %s: %s',
            record.get('control_number'), err)
        return

    for author in record.get('authors', []):
        # ``.get`` so that a signature lacking ``full_name`` is skipped
        # instead of aborting the whole receiver with ``KeyError``.
        full_name = author.get('full_name')
        if full_name in signature_blocks and signature_blocks[full_name]:
            author['signature_block'] = signature_blocks[full_name]
3✔
91

92

93
@before_record_insert.connect
@before_record_update.connect
def assign_uuid(sender, record, *args, **kwargs):
    """Give each signature of a Literature record a UUID, if it lacks one.

    Signatures that already have a ``uuid`` key keep it; the others get a
    freshly generated random UUID, stored as a string. Non-Literature
    records are ignored.
    """
    if is_hep(record):
        for signature in record.get('authors', []):
            if 'uuid' in signature:
                continue
            signature['uuid'] = str(uuid.uuid4())
3✔
105

106

107
#
108
# after_record_update
109
#
110

111
@after_record_update.connect
def push_to_orcid(sender, record, *args, **kwargs):
    """Queue the push of an updated Literature record to ORCID, if needed.

    Nothing is queued when the record is not a Literature record, when the
    ``FEATURE_FLAG_ENABLE_ORCID_PUSH`` configuration flag is falsy, or when
    the record has no ``control_number``. Otherwise one task, named after
    the ``ORCID_PUSH_TASK_ENDPOINT`` configuration value, is sent to the
    ``orcid_push`` queue for every (ORCID, access token) pair returned by
    ``get_push_access_tokens``.
    """
    if not is_hep(record):
        return
    if not current_app.config['FEATURE_FLAG_ENABLE_ORCID_PUSH']:
        return

    # A control number is not guaranteed to be present because of a broken
    # store_record, so bail out rather than push without one.
    if 'control_number' not in record:
        return

    endpoint = current_app.config['ORCID_PUSH_TASK_ENDPOINT']
    pairs = get_push_access_tokens(get_orcids_for_push(record))

    for orcid, access_token in pairs:
        task = Task()
        task.name = endpoint
        payload = {
            'orcid': orcid,
            'rec_id': record['control_number'],
            'oauth_token': access_token,
        }
        task.apply_async(queue='orcid_push', kwargs=payload)
136

137

138
#
139
# models_committed
140
#
141

142
@models_committed.connect
def index_after_commit(sender, changes):
    """Mirror committed ``RecordMetadata`` changes to the search index.

    This is done here and not in an ``after_record_commit`` receiver from
    Invenio-Records because, despite the name, at that point it is not yet
    certain that the record has really been committed to the DB.

    For each ``(model_instance, change)`` pair in ``changes``, instances
    that are not ``RecordMetadata`` are skipped; ``insert`` and ``update``
    changes index the record, any other change deletes it from the index.
    """
    indexer = RecordIndexer()

    for model_instance, change in changes:
        if not isinstance(model_instance, RecordMetadata):
            continue

        record = Record(model_instance.json, model_instance)
        if change in ('insert', 'update'):
            indexer.index(record)
        else:
            indexer.delete(record)
4✔
158

159

160
#
161
# before_record_index
162
#
163

164
@before_record_index.connect
def enhance_after_index(sender, json, *args, **kwargs):
    """Run, in a fixed order, every receiver that enhances a record for ES.

    .. note::

       ``populate_recid_from_ref`` **MUST** come before
       ``populate_bookautocomplete`` because the latter puts a JSON
       reference in a completion payload, which would be expanded to an
       incorrect ``payload_recid`` by the former.

    """
    # The order of this tuple is the execution order; see the note above
    # before moving the first two entries.
    receivers = (
        populate_recid_from_ref,
        populate_bookautocomplete,
        populate_abstract_source_suggest,
        populate_affiliation_suggest,
        populate_author_count,
        populate_authors_full_name_unicode_normalized,
        populate_earliest_date,
        populate_inspire_document_type,
        populate_name_variations,
        populate_title_suggest,
    )

    for receiver in receivers:
        receiver(sender, json, *args, **kwargs)
2✔
185

186

187
def populate_bookautocomplete(sender, json, *args, **kwargs):
    """Populate the ``bookautocomplete`` field of Literature records.

    Only applies to Literature records whose ``document_type`` contains
    ``book``. The completion input is made of the non-empty imprint dates,
    imprint publishers, ISBN values, author full names and titles; the
    payload carries the authors, the titles and the record's own ``$ref``.
    """
    if not is_hep(json):
        return
    if 'book' not in json.get('document_type', []):
        return

    authors = force_list(get_value(json, 'authors.full_name', default=[]))
    titles = force_list(get_value(json, 'titles.title', default=[]))

    candidates = []
    for path in ('imprints.date', 'imprints.publisher', 'isbns.value'):
        candidates.extend(force_list(get_value(json, path, default=[])))
    candidates.extend(authors)
    candidates.extend(titles)

    # Drop empty values so they never reach the completion input.
    non_empty = [candidate for candidate in candidates if candidate]

    self_ref = get_value(json, 'self.$ref')

    json.update(bookautocomplete={
        'input': non_empty,
        'payload': {
            'authors': authors,
            'id': self_ref,
            'title': titles,
        },
    })
222

223

224
def populate_inspire_document_type(sender, json, *args, **kwargs):
    """Populate the ``facet_inspire_doc_type`` field of Literature records.

    The facet is the concatenation of the record's ``document_type`` and
    ``publication_type`` values, followed by ``peer reviewed`` when the
    record has a truthy ``refereed`` value.
    """
    if not is_hep(json):
        return

    facet = list(json.get('document_type', []))
    facet += json.get('publication_type', [])
    if json.get('refereed'):
        facet.append('peer reviewed')

    json['facet_inspire_doc_type'] = facet
3✔
237

238

239
def populate_recid_from_ref(sender, json, *args, **kwargs):
    """Extract recids from all JSON reference fields and add them to ES.

    For every field that has as a value a JSON reference, adds a sibling
    after extracting the record identifier. Siblings are named by removing
    ``record`` occurrences and appending ``_recid`` without doubling or
    prepending underscores to the original name.

    Example::

        {'record': {'$ref': 'http://x/y/2'}}

    is transformed to::

        {
            'recid': 2,
            'record': {'$ref': 'http://x/y/2'},
        }

    For the known lists of references (currently only ``deleted_records``)
    adds a new list with the corresponding recids, under the translated
    name (``deleted_recids``).

    Example::

        {
            'deleted_records': [
                {'$ref': 'http://x/y/1'},
                {'$ref': 'http://x/y/2'},
            ],
        }

    is transformed to::

        {
            'deleted_recids': [1, 2],
            'deleted_records': [
                {'$ref': 'http://x/y/1'},
                {'$ref': 'http://x/y/2'},
            ],
        }

    Every other value is traversed recursively. The record is modified in
    place and nothing is returned.
    """
    list_ref_fields_translations = {
        'deleted_records': 'deleted_recids'
    }

    def _recursive_find_refs(json_root):
        if isinstance(json_root, list):
            items = enumerate(json_root)
        elif isinstance(json_root, dict):
            # Snapshot the items before the loop: the loop inserts new keys
            # into ``json_root``, and on Python 3 ``items()`` is a live view
            # that raises ``RuntimeError`` when the dict changes size while
            # being iterated.
            items = list(json_root.items())
        else:
            items = []

        for key, value in items:
            if (isinstance(json_root, dict) and isinstance(value, dict) and
                    '$ref' in value):
                # Append '_recid' and remove 'record' from the key name.
                key_basename = key.replace('record', '').rstrip('_')
                new_key = '{}_recid'.format(key_basename).lstrip('_')
                json_root[new_key] = get_recid_from_ref(value)
            elif (isinstance(json_root, dict) and isinstance(value, list) and
                    key in list_ref_fields_translations):
                new_list = [get_recid_from_ref(v) for v in value]
                new_key = list_ref_fields_translations[key]
                json_root[new_key] = new_list
            else:
                _recursive_find_refs(value)

    _recursive_find_refs(json)
3✔
311

312

313
def populate_abstract_source_suggest(sender, json, *args, **kwargs):
    """Populate ``abstract_source_suggest`` in the abstracts of Literature records.

    Each abstract with a truthy ``source`` gets an ``abstract_source_suggest``
    entry whose ``input`` and ``output`` are both that source. Abstracts
    without a source are left unchanged.
    """
    if not is_hep(json):
        return

    for abstract in json.get('abstracts', []):
        source = abstract.get('source')
        if not source:
            continue
        abstract.update(abstract_source_suggest={
            'input': source,
            'output': source,
        })
329

330

331
def populate_title_suggest(sender, json, *args, **kwargs):
    """Populate the ``title_suggest`` field of Journals records.

    Records whose ``$schema`` does not contain ``journals.json`` (including
    records with no ``$schema`` at all) are left untouched.

    The completion input is made of the non-empty values among the journal
    title, the short title and the title variants; the output is the short
    title and the payload carries the full journal title, both falling back
    to an empty string.

    Args:
        sender: the signal sender (unused).
        json: the record about to be indexed; modified in place.
    """
    # ``or ''`` keeps a missing/null ``$schema`` from raising ``TypeError``
    # in the membership test.
    if 'journals.json' not in (json.get('$schema') or ''):
        return

    journal_title = get_value(json, 'journal_title.title', default='')
    short_title = json.get('short_title', '')
    title_variants = json.get('title_variants', [])

    candidates = [journal_title, short_title]
    candidates.extend(title_variants)

    json.update({
        'title_suggest': {
            'input': [el for el in candidates if el],
            'output': short_title or '',
            'payload': {
                'full_title': journal_title or '',
            },
        }
    })
355

356

357
def populate_affiliation_suggest(sender, json, *args, **kwargs):
    """Populate the ``affiliation_suggest`` field of Institution records.

    Records whose ``$schema`` does not contain ``institutions.json``
    (including records with no ``$schema`` at all) are left untouched.

    The completion input is made of the non-empty values among the ICNs,
    the hierarchy acronyms and names, the legacy ICN, the name variants and
    the postal codes. The output is the legacy ICN, and the payload carries
    the record's own ``$ref`` together with the ICNs, acronyms, names and
    legacy ICN.

    Args:
        sender: the signal sender (unused).
        json: the record about to be indexed; modified in place.
    """
    # ``or ''`` keeps a missing/null ``$schema`` from raising ``TypeError``
    # in the membership test.
    if 'institutions.json' not in (json.get('$schema') or ''):
        return

    ICN = json.get('ICN', [])
    institution_acronyms = get_value(json, 'institution_hierarchy.acronym', default=[])
    institution_names = get_value(json, 'institution_hierarchy.name', default=[])
    legacy_ICN = json.get('legacy_ICN', '')
    name_variants = force_list(get_value(json, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(json, 'addresses.postal_code', default=[]))

    input_values = []
    input_values.extend(ICN)
    input_values.extend(institution_acronyms)
    input_values.extend(institution_names)
    input_values.append(legacy_ICN)
    input_values.extend(name_variants)
    input_values.extend(postal_codes)
    # Empty values are useless as completion inputs.
    input_values = [el for el in input_values if el]

    json.update({
        'affiliation_suggest': {
            'input': input_values,
            'output': legacy_ICN,
            'payload': {
                '$ref': get_value(json, 'self.$ref'),
                'ICN': ICN,
                'institution_acronyms': institution_acronyms,
                'institution_names': institution_names,
                'legacy_ICN': legacy_ICN,
            },
        },
    })
391

392

393
def populate_earliest_date(sender, json, *args, **kwargs):
    """Populate the ``earliest_date`` field of Literature records.

    Collects, as strings, every value found under the known date paths and
    stores the result of ``earliest_date`` on them in ``earliest_date``.
    The field is not set when no date is found or when the result is falsy.
    """
    if not is_hep(json):
        return

    date_paths = (
        'preprint_date',
        'thesis_info.date',
        'thesis_info.defense_date',
        'publication_info.year',
        'legacy_creation_date',
        'imprints.date',
    )

    found = [force_list(get_value(json, path)) for path in date_paths]
    dates = []
    for values in found:
        for value in values:
            dates.append(str(value))

    if not dates:
        return

    earliest = earliest_date(dates)
    if earliest:
        json['earliest_date'] = earliest
3✔
414

415

416
def populate_name_variations(sender, json, *args, **kwargs):
    """Generate name variations for each signature of a Literature record.

    Every author with a truthy ``full_name`` gets a ``name_variations``
    list and a ``name_suggest`` completion entry whose payload carries the
    author's first ``INSPIRE BAI`` identifier, or ``None`` if there is none.
    Authors without a full name are left unchanged.
    """
    if not is_hep(json):
        return

    for author in json.get('authors', []):
        full_name = author.get('full_name')
        if not full_name:
            continue

        bais = [
            identifier['value'] for identifier in author.get('ids', [])
            if identifier['schema'] == 'INSPIRE BAI'
        ]
        first_bai = bais[0] if bais else None
        variations = generate_name_variations(full_name)

        author.update({
            'name_variations': variations,
            'name_suggest': {
                'input': variations,
                'output': full_name,
                'payload': {'bai': first_bai},
            },
        })
438

439

440
def populate_author_count(sender, json, *args, **kwargs):
    """Populate the ``author_count`` field of Literature records.

    The count is the number of authors whose ``inspire_roles`` do not
    include ``supervisor``.
    """
    if not is_hep(json):
        return

    json['author_count'] = sum(
        1 for author in json.get('authors', [])
        if 'supervisor' not in author.get('inspire_roles', [])
    )
3✔
452

453

454
def populate_authors_full_name_unicode_normalized(sender, json, *args, **kwargs):
    """Populate ``authors.full_name_unicode_normalized`` of Literature records.

    For each author, stores the lowercased NFKC normalization of the
    author's ``full_name`` under ``full_name_unicode_normalized``.
    """
    if not is_hep(json):
        return

    for author in json.get('authors', []):
        full_name = six.text_type(author['full_name'])
        normalized = normalize('NFKC', full_name).lower()
        author.update(full_name_unicode_normalized=normalized)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc