12566

pending completion

Build # 12566

Build Type

Pull #3460

travis-ci

Committed by

web-flow

Commit Message

disambiguation: add signature pairs sampling

Since INSPIRE has ~3M curated signatures, it would take too much time
to train on all possible pairs, so we sample 1M pairs in such a way
that they are representative of the known cluster structure.

Signed-off-by: Jacopo Notarstefano <jacopo.notarstefano@gmail.com>

Pull Request #3460: disambiguation: add signature pairs sampling

Run Details

60 of 60 new or added lines in 3 files covered. (100.0%)

7576 of 9664 relevant lines covered (78.39%)

2.4 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.47

/inspirehep/modules/records/receivers.py

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Records receivers."""

from __future__ import absolute_import, division, print_function

import uuid
from itertools import chain
from unicodedata import normalize

import six
from celery import Task
from flask import current_app
from flask_sqlalchemy import models_committed

from invenio_indexer.api import RecordIndexer
from invenio_indexer.signals import before_record_index
from invenio_records.api import Record
from invenio_records.models import RecordMetadata
from invenio_records.signals import (
    after_record_update,
    before_record_insert,
    before_record_update,
)

from inspire_dojson.utils import get_recid_from_ref
from inspire_utils.date import earliest_date
from inspire_utils.helpers import force_list
from inspire_utils.name import generate_name_variations
from inspire_utils.record import get_value
from inspirehep.modules.authors.utils import phonetic_blocks
from inspirehep.modules.orcid.utils import (
    get_push_access_tokens,
    get_orcids_for_push,
)


def is_hep(record):
    """Tell whether ``record`` is a Literature (HEP) record.

    A record counts as Literature when its ``$schema`` value contains
    ``hep.json``. A missing or empty ``$schema`` yields ``False`` instead
    of raising a ``TypeError`` on the membership test.
    """
    # ``or ''`` covers both an absent key and an explicit ``None`` value.
    return 'hep.json' in (record.get('$schema') or '')


#
# before_record_insert & before_record_update
#

@before_record_insert.connect
@before_record_update.connect
def assign_phonetic_block(sender, record, *args, **kwargs):
    """Assign a phonetic block to each signature of a Literature record.

    The blocks for all ``authors.full_name`` values are computed through
    ``phonetic_blocks``; every author whose name maps to a non-empty block
    gets it stored under ``signature_block``. If the computation raises,
    the error is logged and the record is left untouched.
    """
    if not is_hep(record):
        return

    author_names = get_value(record, 'authors.full_name', default=[])

    try:
        signature_blocks = phonetic_blocks(author_names)
    except Exception as err:
        # The control number may be absent at this point; ``%s`` keeps the
        # message printable in that case, whereas ``%d`` would make the
        # logging call itself fail to format and hide the real error.
        current_app.logger.error(
            'Cannot extract phonetic blocks for record %s: %s',
            record.get('control_number'), err)
        return

    for author in record.get('authors', []):
        full_name = author['full_name']
        if full_name in signature_blocks and signature_blocks[full_name]:
            author['signature_block'] = signature_blocks[full_name]


@before_record_insert.connect
@before_record_update.connect
def assign_uuid(sender, record, *args, **kwargs):
    """Give every signature of a Literature record a UUID.

    Signatures that already carry a ``uuid`` key are left as they are.
    """
    if not is_hep(record):
        return

    for signature in record.get('authors', []):
        if 'uuid' in signature:
            continue
        signature['uuid'] = str(uuid.uuid4())


#
# after_record_update
#

@after_record_update.connect
def push_to_orcid(sender, record, *args, **kwargs):
    """Queue one ORCID push task per (ORCID, access token) pair, if enabled.

    Nothing is queued when the record is not a Literature record, when the
    ``FEATURE_FLAG_ENABLE_ORCID_PUSH`` setting is falsy, or when the record
    has no ``control_number``.
    """
    if not is_hep(record):
        return

    if not current_app.config['FEATURE_FLAG_ENABLE_ORCID_PUSH']:
        return

    # Ensure there is a control number. This is not always the case because of broken store_record.
    if 'control_number' not in record:
        return

    task_name = current_app.config['ORCID_PUSH_TASK_ENDPOINT']
    tokens_by_orcid = get_push_access_tokens(get_orcids_for_push(record))

    for orcid, access_token in tokens_by_orcid:
        # The task is addressed by name only; its implementation is not
        # imported here.
        task = Task()
        task.name = task_name
        task_kwargs = {
            'orcid': orcid,
            'rec_id': record['control_number'],
            'oauth_token': access_token,
        }
        task.apply_async(queue='orcid_push', kwargs=task_kwargs)


#
# models_committed
#

@models_committed.connect
def index_after_commit(sender, changes):
    """Mirror committed ``RecordMetadata`` changes into ES.

    Inserts and updates are indexed; any other kind of change removes the
    record from the index. Other model types are ignored.

    This cannot happen in an ``after_record_commit`` receiver from Invenio-Records
    because, despite the name, at that point we are not yet sure whether the record
    has been really committed to the DB.
    """
    indexer = RecordIndexer()

    for model_instance, change in changes:
        if not isinstance(model_instance, RecordMetadata):
            continue

        record = Record(model_instance.json, model_instance)
        if change in ('insert', 'update'):
            indexer.index(record)
        else:
            indexer.delete(record)


#
# before_record_index
#

@before_record_index.connect
def enhance_after_index(sender, json, *args, **kwargs):
    """Apply, in a fixed order, every receiver that enhances the record for ES.

    .. note::

       ``populate_recid_from_ref`` **MUST** come before ``populate_bookautocomplete``
       because the latter puts a JSON reference in a completion payload, which
       would be expanded to an incorrect ``payload_recid`` by the former.

    """
    # The tuple order is the execution order; see the note above before
    # reordering.
    enhancers = (
        populate_recid_from_ref,
        populate_bookautocomplete,
        populate_abstract_source_suggest,
        populate_affiliation_suggest,
        populate_author_count,
        populate_authors_full_name_unicode_normalized,
        populate_earliest_date,
        populate_inspire_document_type,
        populate_name_variations,
        populate_title_suggest,
    )

    for enhance in enhancers:
        enhance(sender, json, *args, **kwargs)


def populate_bookautocomplete(sender, json, *args, **kwargs):
    """Populate the ``bookautocomplete`` field of Literature records.

    Only records whose ``document_type`` contains ``book`` are affected.
    The completion input gathers imprint dates, publishers, ISBNs, author
    names and titles, with falsy values dropped.
    """
    if not is_hep(json) or 'book' not in json.get('document_type', []):
        return

    authors = force_list(get_value(json, 'authors.full_name', default=[]))
    titles = force_list(get_value(json, 'titles.title', default=[]))

    candidates = []
    for path in ('imprints.date', 'imprints.publisher', 'isbns.value'):
        candidates.extend(force_list(get_value(json, path, default=[])))
    candidates.extend(authors)
    candidates.extend(titles)

    payload = {
        'authors': authors,
        'id': get_value(json, 'self.$ref'),
        'title': titles,
    }

    json.update(bookautocomplete={
        'input': [el for el in candidates if el],
        'payload': payload,
    })


def populate_inspire_document_type(sender, json, *args, **kwargs):
    """Populate the ``facet_inspire_doc_type`` field of Literature records.

    The facet lists the document types, then the publication types, then
    ``peer reviewed`` when ``refereed`` is truthy.
    """
    if not is_hep(json):
        return

    facet_values = list(json.get('document_type', []))
    facet_values.extend(json.get('publication_type', []))

    if json.get('refereed'):
        facet_values.append('peer reviewed')

    json['facet_inspire_doc_type'] = facet_values


def populate_recid_from_ref(sender, json, *args, **kwargs):
    """Extract recids from all JSON reference fields and add them to ES.

    For every field that has as a value a JSON reference, adds a sibling
    after extracting the record identifier. Siblings are named by removing
    ``record`` occurrences and appending ``_recid`` without doubling or
    prepending underscores to the original name.

    Example::

        {'record': {'$ref': 'http://x/y/2'}}

    is transformed to::

        {
            'recid': 2,
            'record': {'$ref': 'http://x/y/2'},
        }

    For the list fields named in ``list_ref_fields_translations`` (currently
    only ``deleted_records``) adds a new list with the corresponding recids
    under the translated name.

    Example::

        {
            'deleted_records': [
                {'$ref': 'http://x/y/1'},
                {'$ref': 'http://x/y/2'},
            ],
        }

    is transformed to::

        {
            'deleted_recids': [1, 2],
            'deleted_records': [
                {'$ref': 'http://x/y/1'},
                {'$ref': 'http://x/y/2'},
            ],
        }

    The record is modified in place; nothing is returned.
    """
    list_ref_fields_translations = {
        'deleted_records': 'deleted_recids'
    }

    def _recursive_find_refs(json_root):
        if isinstance(json_root, list):
            items = enumerate(json_root)
        elif isinstance(json_root, dict):
            # Keys are added to ``json_root`` inside the loop below, so we
            # must iterate over a snapshot: on Python 3 ``items()`` is a live
            # view and would raise "dictionary changed size during iteration".
            items = list(json_root.items())
        else:
            items = []

        for key, value in items:
            if (isinstance(json_root, dict) and isinstance(value, dict) and
                    '$ref' in value):
                # Append '_recid' and remove 'record' from the key name.
                key_basename = key.replace('record', '').rstrip('_')
                new_key = '{}_recid'.format(key_basename).lstrip('_')
                json_root[new_key] = get_recid_from_ref(value)
            elif (isinstance(json_root, dict) and isinstance(value, list) and
                    key in list_ref_fields_translations):
                new_list = [get_recid_from_ref(v) for v in value]
                new_key = list_ref_fields_translations[key]
                json_root[new_key] = new_list
            else:
                _recursive_find_refs(value)

    _recursive_find_refs(json)


def populate_abstract_source_suggest(sender, json, *args, **kwargs):
    """Populate the ``abstract_source_suggest`` field in Literature records.

    Each abstract with a truthy ``source`` gets a suggester entry whose
    input and output are both that source.
    """
    if not is_hep(json):
        return

    for abstract in json.get('abstracts', []):
        source = abstract.get('source')
        if not source:
            continue

        abstract['abstract_source_suggest'] = {
            'input': source,
            'output': source,
        }


def populate_title_suggest(sender, json, *args, **kwargs):
    """Populate the ``title_suggest`` field of Journals records.

    Records whose ``$schema`` does not contain ``journals.json`` are left
    untouched, including records with no ``$schema`` at all. The completion
    input is the journal title, the short title and the title variants,
    with falsy values dropped.
    """
    # ``or ''`` keeps the membership test from raising a TypeError when the
    # schema is missing or None.
    if 'journals.json' not in (json.get('$schema') or ''):
        return

    journal_title = get_value(json, 'journal_title.title', default='')
    short_title = json.get('short_title', '')
    title_variants = json.get('title_variants', [])

    input_values = [journal_title, short_title]
    input_values.extend(title_variants)
    input_values = [el for el in input_values if el]

    json.update({
        'title_suggest': {
            'input': input_values,
            'output': short_title if short_title else '',
            'payload': {
                'full_title': journal_title if journal_title else '',
            },
        }
    })


def populate_affiliation_suggest(sender, json, *args, **kwargs):
    """Populate the ``affiliation_suggest`` field of Institution records.

    Records whose ``$schema`` does not contain ``institutions.json`` are
    left untouched, including records with no ``$schema`` at all. The
    completion input gathers ICNs, hierarchy acronyms and names, the legacy
    ICN, name variants and postal codes, with falsy values dropped.
    """
    # ``or ''`` keeps the membership test from raising a TypeError when the
    # schema is missing or None.
    if 'institutions.json' not in (json.get('$schema') or ''):
        return

    ICN = json.get('ICN', [])
    institution_acronyms = get_value(json, 'institution_hierarchy.acronym', default=[])
    institution_names = get_value(json, 'institution_hierarchy.name', default=[])
    legacy_ICN = json.get('legacy_ICN', '')
    name_variants = force_list(get_value(json, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(json, 'addresses.postal_code', default=[]))

    input_values = []
    input_values.extend(ICN)
    input_values.extend(institution_acronyms)
    input_values.extend(institution_names)
    input_values.append(legacy_ICN)
    input_values.extend(name_variants)
    input_values.extend(postal_codes)
    input_values = [el for el in input_values if el]

    json.update({
        'affiliation_suggest': {
            'input': input_values,
            'output': legacy_ICN,
            'payload': {
                '$ref': get_value(json, 'self.$ref'),
                'ICN': ICN,
                'institution_acronyms': institution_acronyms,
                'institution_names': institution_names,
                'legacy_ICN': legacy_ICN,
            },
        },
    })


def populate_earliest_date(sender, json, *args, **kwargs):
    """Populate the ``earliest_date`` field of Literature records.

    Every value found under the known date paths is stringified and handed
    to ``earliest_date``; the field is only set when that call returns a
    truthy result.
    """
    if not is_hep(json):
        return

    date_paths = (
        'preprint_date',
        'thesis_info.date',
        'thesis_info.defense_date',
        'publication_info.year',
        'legacy_creation_date',
        'imprints.date',
    )

    found_per_path = [force_list(get_value(json, path)) for path in date_paths]

    dates = []
    for found in found_per_path:
        for el in found:
            dates.append(str(el))

    if not dates:
        return

    result = earliest_date(dates)
    if result:
        json['earliest_date'] = result


def populate_name_variations(sender, json, *args, **kwargs):
    """Generate name variations for each signature of a Literature record.

    Authors with a truthy ``full_name`` receive ``name_variations`` and a
    ``name_suggest`` entry whose payload carries the first ``INSPIRE BAI``
    identifier of the author, or ``None`` when there is none.
    """
    if not is_hep(json):
        return

    for author in json.get('authors', []):
        full_name = author.get('full_name')
        if not full_name:
            continue

        bais = [
            el['value'] for el in author.get('ids', [])
            if el['schema'] == 'INSPIRE BAI'
        ]
        first_bai = bais[0] if bais else None
        name_variations = generate_name_variations(full_name)

        author.update({
            'name_variations': name_variations,
            'name_suggest': {
                'input': name_variations,
                'output': full_name,
                'payload': {'bai': first_bai},
            },
        })


def populate_author_count(sender, json, *args, **kwargs):
    """Populate the ``author_count`` field of Literature records.

    Authors whose ``inspire_roles`` include ``supervisor`` are not counted.
    """
    if not is_hep(json):
        return

    json['author_count'] = sum(
        1 for author in json.get('authors', [])
        if 'supervisor' not in author.get('inspire_roles', [])
    )


def populate_authors_full_name_unicode_normalized(sender, json, *args, **kwargs):
    """Populate ``authors.full_name_unicode_normalized`` in Literature records.

    The value is the author's ``full_name`` converted to text, normalized
    to NFKC and lowercased.
    """
    if not is_hep(json):
        return

    for author in json.get('authors', []):
        as_text = six.text_type(author['full_name'])
        author.update({
            'full_name_unicode_normalized': normalize('NFKC', as_text).lower()
        })

1	# -*- coding: utf-8 -*-
2	#
3	# This file is part of INSPIRE.
4	# Copyright (C) 2014-2017 CERN.
5	#
6	# INSPIRE is free software: you can redistribute it and/or modify
7	# it under the terms of the GNU General Public License as published by
8	# the Free Software Foundation, either version 3 of the License, or
9	# (at your option) any later version.
10	#
11	# INSPIRE is distributed in the hope that it will be useful,
12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	# GNU General Public License for more details.
15	#
16	# You should have received a copy of the GNU General Public License
17	# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
18	#
19	# In applying this license, CERN does not waive the privileges and immunities
20	# granted to it by virtue of its status as an Intergovernmental Organization
21	# or submit itself to any jurisdiction.
22
23	"""Records receivers."""	5✔
24
25	from __future__ import absolute_import, division, print_function	5✔
26
27	import uuid	5✔
28	from itertools import chain	5✔
29	from unicodedata import normalize	5✔
30
31	import six	5✔
32	from celery import Task	5✔
33	from flask import current_app	5✔
34	from flask_sqlalchemy import models_committed	5✔
35
36	from invenio_indexer.api import RecordIndexer	5✔
37	from invenio_indexer.signals import before_record_index	5✔
38	from invenio_records.api import Record	5✔
39	from invenio_records.models import RecordMetadata	5✔
40	from invenio_records.signals import (	5✔
41	after_record_update,
42	before_record_insert,
43	before_record_update,
44	)
45
46	from inspire_dojson.utils import get_recid_from_ref	5✔
47	from inspire_utils.date import earliest_date	5✔
48	from inspire_utils.helpers import force_list	5✔
49	from inspire_utils.name import generate_name_variations	5✔
50	from inspire_utils.record import get_value	5✔
51	from inspirehep.modules.authors.utils import phonetic_blocks	5✔
52	from inspirehep.modules.orcid.utils import (	5✔
53	get_push_access_tokens,
54	get_orcids_for_push,
55	)
56
57
58	def is_hep(record):	5✔
59	return 'hep.json' in record.get('$schema')	3✔
60
61
62	#
63	# before_record_insert & before_record_update
64	#
65
66	@before_record_insert.connect	5✔
67	@before_record_update.connect	5✔
68	def assign_phonetic_block(sender, record, *args, **kwargs):
69	"""Assign a phonetic block to each signature of a Literature record.
70
71	Uses the NYSIIS algorithm to compute a phonetic block from each
72	signature's full name, skipping those that are not recognized
73	as real names, but logging an error when that happens.
74	"""
75	if not is_hep(record):	3✔
76	return	2✔
77
78	author_names = get_value(record, 'authors.full_name', default=[])	3✔
79
80	try:	3✔
81	signature_blocks = phonetic_blocks(author_names)	3✔
82	except Exception as err:	1✔
83	current_app.logger.error(	1✔
84	'Cannot extract phonetic blocks for record %d: %s',
85	record.get('control_number'), err)
86	return	1✔
87
88	for author in record.get('authors', []):	3✔
89	if author['full_name'] in signature_blocks and signature_blocks[author['full_name']]:	3✔
90	author['signature_block'] = signature_blocks[author['full_name']]	3✔
91
92
93	@before_record_insert.connect	5✔
94	@before_record_update.connect	5✔
95	def assign_uuid(sender, record, *args, **kwargs):
96	"""Assign a UUID to each signature of a Literature record."""
97	if not is_hep(record):	3✔
98	return	2✔
99
100	authors = record.get('authors', [])	3✔
101
102	for author in authors:	3✔
103	if 'uuid' not in author:	3✔
104	author['uuid'] = str(uuid.uuid4())	3✔
105
106
107	#
108	# after_record_update
109	#
110
111	@after_record_update.connect	5✔
112	def push_to_orcid(sender, record, *args, **kwargs):
113	"""If needed, queue the push of the new changes to ORCID."""
114	if not is_hep(record) or not current_app.config['FEATURE_FLAG_ENABLE_ORCID_PUSH']:	2✔
115	return	2✔
116
117	# Ensure there is a control number. This is not always the case because of broken store_record.
118	if 'control_number' not in record:	2✔
119	return	×
120
121	task_name = current_app.config['ORCID_PUSH_TASK_ENDPOINT']	2✔
122
123	orcids = get_orcids_for_push(record)	2✔
124	orcids_and_tokens = get_push_access_tokens(orcids)	2✔
125	for orcid, access_token in orcids_and_tokens:	2✔
126	push_to_orcid_task = Task()	1✔
127	push_to_orcid_task.name = task_name	1✔
128	push_to_orcid_task.apply_async(	1✔
129	queue='orcid_push',
130	kwargs={
131	'orcid': orcid,
132	'rec_id': record['control_number'],
133	'oauth_token': access_token,
134	},
135	)
136
137
138	#
139	# models_committed
140	#
141
142	@models_committed.connect	5✔
143	def index_after_commit(sender, changes):
144	"""Index a record in ES after it was committed to the DB.
145
146	This cannot happen in an ``after_record_commit`` receiver from Invenio-Records
147	because, despite the name, at that point we are not yet sure whether the record
148	has been really committed to the DB.
149	"""
150	indexer = RecordIndexer()	4✔
151
152	for model_instance, change in changes:	4✔
153	if isinstance(model_instance, RecordMetadata):	4✔
154	if change in ('insert', 'update'):	4✔
155	indexer.index(Record(model_instance.json, model_instance))	2✔
156	else:
157	indexer.delete(Record(model_instance.json, model_instance))	4✔
158
159
160	#
161	# before_record_index
162	#
163
164	@before_record_index.connect	5✔
165	def enhance_after_index(sender, json, *args, **kwargs):
166	"""Run all the receivers that enhance the record for ES in the right order.
167
168	.. note::
169
170	``populate_recid_from_ref`` MUST come before ``populate_bookautocomplete``
171	because the latter puts a JSON reference in a completion payload, which
172	would be expanded to an incorrect ``payload_recid`` by the former.
173
174	"""
175	populate_recid_from_ref(sender, json, *args, **kwargs)	2✔
176	populate_bookautocomplete(sender, json, *args, **kwargs)	2✔
177	populate_abstract_source_suggest(sender, json, *args, **kwargs)	2✔
178	populate_affiliation_suggest(sender, json, *args, **kwargs)	2✔
179	populate_author_count(sender, json, *args, **kwargs)	2✔
180	populate_authors_full_name_unicode_normalized(sender, json, *args, **kwargs)	2✔
181	populate_earliest_date(sender, json, *args, **kwargs)	2✔
182	populate_inspire_document_type(sender, json, *args, **kwargs)	2✔
183	populate_name_variations(sender, json, *args, **kwargs)	2✔
184	populate_title_suggest(sender, json, *args, **kwargs)	2✔
185
186
187	def populate_bookautocomplete(sender, json, *args, **kwargs):	5✔
188	"""Populate the ```bookautocomplete`` field of Literature records."""
189	if not is_hep(json):	3✔
190	return	3✔
191
192	if 'book' not in json.get('document_type', []):	3✔
193	return	3✔
194
195	paths = [	3✔
196	'imprints.date',
197	'imprints.publisher',
198	'isbns.value',
199	]
200
201	authors = force_list(get_value(json, 'authors.full_name', default=[]))	3✔
202	titles = force_list(get_value(json, 'titles.title', default=[]))	3✔
203
204	input_values = list(chain.from_iterable(	3✔
205	force_list(get_value(json, path, default=[])) for path in paths))
206	input_values.extend(authors)	3✔
207	input_values.extend(titles)	3✔
208	input_values = [el for el in input_values if el]	3✔
209
210	ref = get_value(json, 'self.$ref')	3✔
211
212	json.update({	3✔
213	'bookautocomplete': {
214	'input': input_values,
215	'payload': {
216	'authors': authors,
217	'id': ref,
218	'title': titles,
219	},
220	},
221	})
222
223
224	def populate_inspire_document_type(sender, json, *args, **kwargs):	5✔
225	"""Populate the ``facet_inspire_doc_type`` field of Literature records."""
226	if not is_hep(json):	3✔
227	return	3✔
228
229	result = []	3✔
230
231	result.extend(json.get('document_type', []))	3✔
232	result.extend(json.get('publication_type', []))	3✔
233	if 'refereed' in json and json['refereed']:	3✔
234	result.append('peer reviewed')	3✔
235
236	json['facet_inspire_doc_type'] = result	3✔
237
238
239	def populate_recid_from_ref(sender, json, *args, **kwargs):	5✔
240	"""Extract recids from all JSON reference fields and add them to ES.
241
242	For every field that has as a value a JSON reference, adds a sibling
243	after extracting the record identifier. Siblings are named by removing
244	``record`` occurrences and appending ``_recid`` without doubling or
245	prepending underscores to the original name.
246
247	Example::
248
249	{'record': {'$ref': 'http://x/y/2}}
250
251	is transformed to::
252
253	{
254	'recid': 2,
255	'record': {'$ref': 'http://x/y/2},
256	}
257
258	For every list of object references adds a new list with the
259	corresponding recids, whose name is similarly computed.
260
261	Example::
262
263	{
264	'records': [
265	{'$ref': 'http://x/y/1'},
266	{'$ref': 'http://x/y/2'},
267	],
268	}
269
270	is transformed to::
271
272	{
273	'recids': [1, 2],
274	'records': [
275	{'$ref': 'http://x/y/1'},
276	{'$ref': 'http://x/y/2'},
277	],
278	}
279
280	"""
281	list_ref_fields_translations = {	3✔
282	'deleted_records': 'deleted_recids'
283	}
284
285	def _recursive_find_refs(json_root):	3✔
286	if isinstance(json_root, list):	3✔
287	items = enumerate(json_root)	3✔
288	elif isinstance(json_root, dict):	3✔
289	# Note that items have to be generated before altering the dict.
290	# In this case, iteritems might break during iteration.
291	items = json_root.items()	3✔
292	else:
293	items = []	2✔
294
295	for key, value in items:	3✔
296	if (isinstance(json_root, dict) and isinstance(value, dict) and	3✔
297	'$ref' in value):
298	# Append '_recid' and remove 'record' from the key name.
299	key_basename = key.replace('record', '').rstrip('_')	3✔
300	new_key = '{}_recid'.format(key_basename).lstrip('_')	3✔
301	json_root[new_key] = get_recid_from_ref(value)	3✔
302	elif (isinstance(json_root, dict) and isinstance(value, list) and	3✔
303	key in list_ref_fields_translations):
304	new_list = [get_recid_from_ref(v) for v in value]	3✔
305	new_key = list_ref_fields_translations[key]	3✔
306	json_root[new_key] = new_list	3✔
307	else:
308	_recursive_find_refs(value)	3✔
309
310	_recursive_find_refs(json)	3✔
311
312
313	def populate_abstract_source_suggest(sender, json, *args, **kwargs):	5✔
314	"""Populate the ``abstract_source_suggest`` field in Literature records."""
315	if not is_hep(json):	3✔
316	return	3✔
317
318	abstracts = json.get('abstracts', [])	3✔
319
320	for abstract in abstracts:	3✔
321	source = abstract.get('source')	3✔
322	if source:	3✔
323	abstract.update({	3✔
324	'abstract_source_suggest': {
325	'input': source,
326	'output': source,
327	},
328	})
329
330
331	def populate_title_suggest(sender, json, *args, **kwargs):	5✔
332	"""Populate the ``title_suggest`` field of Journals records."""
333	if 'journals.json' not in json.get('$schema'):	3✔
334	return	3✔
335
336	journal_title = get_value(json, 'journal_title.title', default='')	3✔
337	short_title = json.get('short_title', '')	3✔
338	title_variants = json.get('title_variants', [])	3✔
339
340	input_values = []	3✔
341	input_values.append(journal_title)	3✔
342	input_values.append(short_title)	3✔
343	input_values.extend(title_variants)	3✔
344	input_values = [el for el in input_values if el]	3✔
345
346	json.update({	3✔
347	'title_suggest': {
348	'input': input_values,
349	'output': short_title if short_title else '',
350	'payload': {
351	'full_title': journal_title if journal_title else '',
352	},
353	}
354	})
355
356
357	def populate_affiliation_suggest(sender, json, *args, **kwargs):	5✔
358	"""Populate the ``affiliation_suggest`` field of Institution records."""
359	if 'institutions.json' not in json.get('$schema'):	3✔
360	return	3✔
361
362	ICN = json.get('ICN', [])	3✔
363	institution_acronyms = get_value(json, 'institution_hierarchy.acronym', default=[])	3✔
364	institution_names = get_value(json, 'institution_hierarchy.name', default=[])	3✔
365	legacy_ICN = json.get('legacy_ICN', '')	3✔
366	name_variants = force_list(get_value(json, 'name_variants.value', default=[]))	3✔
367	postal_codes = force_list(get_value(json, 'addresses.postal_code', default=[]))	3✔
368
369	input_values = []	3✔
370	input_values.extend(ICN)	3✔
371	input_values.extend(institution_acronyms)	3✔
372	input_values.extend(institution_names)	3✔
373	input_values.append(legacy_ICN)	3✔
374	input_values.extend(name_variants)	3✔
375	input_values.extend(postal_codes)	3✔
376	input_values = [el for el in input_values if el]	3✔
377
378	json.update({	3✔
379	'affiliation_suggest': {
380	'input': input_values,
381	'output': legacy_ICN,
382	'payload': {
383	'$ref': get_value(json, 'self.$ref'),
384	'ICN': ICN,
385	'institution_acronyms': institution_acronyms,
386	'institution_names': institution_names,
387	'legacy_ICN': legacy_ICN,
388	},
389	},
390	})
391
392
393	def populate_earliest_date(sender, json, *args, **kwargs):	5✔
394	"""Populate the ``earliest_date`` field of Literature records."""
395	if not is_hep(json):	3✔
396	return	3✔
397
398	date_paths = [	3✔
399	'preprint_date',
400	'thesis_info.date',
401	'thesis_info.defense_date',
402	'publication_info.year',
403	'legacy_creation_date',
404	'imprints.date',
405	]
406
407	dates = [str(el) for el in chain.from_iterable(	3✔
408	[force_list(get_value(json, path)) for path in date_paths])]
409
410	if dates:	3✔
411	result = earliest_date(dates)	3✔
412	if result:	3✔
413	json['earliest_date'] = result	3✔
414
415
416	def populate_name_variations(sender, json, *args, **kwargs):	5✔
417	"""Generate name variations for each signature of a Literature record."""
418	if not is_hep(json):	2✔
419	return	2✔
420
421	authors = json.get('authors', [])	2✔
422
423	for author in authors:	2✔
424	full_name = author.get('full_name')	2✔
425	if full_name:	2✔
426	bais = [	2✔
427	el['value'] for el in author.get('ids', [])
428	if el['schema'] == 'INSPIRE BAI'
429	]
430	name_variations = generate_name_variations(full_name)	2✔
431
432	author.update({'name_variations': name_variations})	2✔
433	author.update({'name_suggest': {	2✔
434	'input': name_variations,
435	'output': full_name,
436	'payload': {'bai': bais[0] if bais else None}
437	}})
438
439
440	def populate_author_count(sender, json, *args, **kwargs):	5✔
441	"""Populate the ``author_count`` field of Literature records."""
442	if not is_hep(json):	3✔
443	return	3✔
444
445	authors = json.get('authors', [])	3✔
446
447	authors_excluding_supervisors = [	3✔
448	author for author in authors
449	if 'supervisor' not in author.get('inspire_roles', [])
450	]
451	json['author_count'] = len(authors_excluding_supervisors)	3✔
452
453
454	def populate_authors_full_name_unicode_normalized(sender, json, *args, **kwargs):	5✔
455	"""Populate the ``authors.full_name_normalized`` field of Literature records."""
456	if not is_hep(json):	3✔
457	return	3✔
458
459	authors = json.get('authors', [])	3✔
460
461	for index, author in enumerate(authors):	3✔
462	full_name = six.text_type(author['full_name'])	3✔
463	json['authors'][index].update({	3✔
464	'full_name_unicode_normalized': normalize('NFKC', full_name).lower()
465	})

inspirehep / inspire-next / 12566

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous