12163548050

Committed 04 Dec 2024 03:58PM UTC coverage: 83.629% (+0.2%) from 83.45%

Build # 12163548050

Build Type

push

github

Committed by

ItIsJordan

Commit Message

Fix bug in search_test.py

Fixes a bug where testing for MadAnalysis was checking against the database entry and not the document dictionary (as it should have been)

Run Details

4572 of 5467 relevant lines covered (83.63%)

0.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.78

/hepdata/modules/inspire_api/parser.py

# This file is part of HEPData.
# Copyright (C) 2016 CERN.
#
# HEPData is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# HEPData is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HEPData; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Functions for parsing the new INSPIRE JSON metadata."""

from copy import deepcopy

# Fallback value for every parsed field. Each get_* function in this module
# deep-copies its entry from here, so the mutable list defaults are never
# shared between calls.
parsed_content_defaults = {
    'title': None,
    'doi': None,
    'authors': None,
    'type': [],
    'abstract': 'None',  # NOTE: the literal string 'None', not the None singleton
    'creation_date': None,
    'arxiv_id': None,
    'collaborations': [],
    'keywords': [],
    'journal_info': 'No Journal Information',
    'year': None,
    'subject_area': [],
}


def get_title(metadata):
    """Get the title of the publication.

    The first English entry in 'title_translations' is preferred (if applicable). Otherwise the first
    arXiv-sourced entry in 'titles' is used, falling back to the first entry in 'titles'.

    :param metadata: dictionary of INSPIRE metadata for a record.
    :return: the title, or the default title if none of the above is available.
    """
    # Stop at the first English translation, as documented, and skip
    # translations that lack a 'language' or 'title' key instead of failing.
    for translation in metadata.get('title_translations') or []:
        if translation.get('language') == 'en' and 'title' in translation:
            return translation['title']

    titles = metadata.get('titles') or []
    # Prefer the arXiv title over whatever happens to be listed first.
    for entry in titles:
        if entry.get('source') == 'arXiv' and 'title' in entry:
            return entry['title']
    if titles and 'title' in titles[0]:
        return titles[0]['title']

    return deepcopy(parsed_content_defaults['title'])


def get_doi(metadata):
    """Return the DOI of the journal publication: the 'value' of the first entry in 'dois', or the default if there is none."""
    fallback = deepcopy(parsed_content_defaults['doi'])
    if 'dois' not in metadata:
        return fallback
    doi_entries = metadata['dois']
    return doi_entries[0]['value'] if len(doi_entries) > 0 else fallback


def _first_affiliation(author):
    """Return the 'value' of the author's first affiliation, or '' if the author has none."""
    # An absent key, an empty list and an entry without 'value' all map to ''.
    affiliations = author.get('affiliations') or [{}]
    return affiliations[0].get('value', '')


def get_authors(metadata):
    """Get the authors of the publication as a list of dictionaries with keys 'affiliation' and 'full_name'.

    Only the first affiliation of each author is kept; authors without any affiliation get ''.

    :param metadata: dictionary of INSPIRE metadata for a record.
    :return: list of author dictionaries, or the default if 'authors' is absent.
    """
    if 'authors' not in metadata:
        return deepcopy(parsed_content_defaults['authors'])
    return [{'affiliation': _first_affiliation(author), 'full_name': author['full_name']}
            for author in metadata['authors']]


def get_type(metadata):
    """Return the 'document_type' entry of the metadata unchanged, or the default type when it is absent."""
    fallback = deepcopy(parsed_content_defaults['type'])
    return metadata['document_type'] if 'document_type' in metadata else fallback


def get_abstract(metadata):
    """Get the abstract of the publication, ideally the one from the arXiv version, otherwise the first one.

    :param metadata: dictionary of INSPIRE metadata for a record.
    :return: the abstract text, or the default abstract if 'abstracts' is absent, empty,
        or offers no usable 'value'.
    """
    abstracts = metadata.get('abstracts') or []
    # Prefer the arXiv abstract wherever it appears in the list.
    for entry in abstracts:
        if entry.get('source') == 'arXiv' and 'value' in entry:
            return entry['value']
    # Otherwise use the first abstract; an empty list or a first entry without
    # 'value' falls through to the default instead of raising.
    if abstracts and 'value' in abstracts[0]:
        return abstracts[0]['value']
    return deepcopy(parsed_content_defaults['abstract'])


def get_creation_date(metadata):
    """Return the creation date of the publication: the expanded 'preprint_date' if present, else 'legacy_creation_date' as is, else the default."""
    fallback = deepcopy(parsed_content_defaults['creation_date'])
    if 'preprint_date' in metadata:
        # preprint dates may be partial (year or year-month), so pad them out
        return expand_date(metadata['preprint_date'])
    if 'legacy_creation_date' in metadata:
        return metadata['legacy_creation_date']
    return fallback


def get_arxiv_id(metadata):
    """Get the arxiv id of the publication from the last value in the list of arxiv eprints.

    :param metadata: dictionary of INSPIRE metadata for a record.
    :return: the identifier prefixed with 'arXiv:', or the default if 'arxiv_eprints' is absent,
        empty, or its last entry has no 'value'.
    """
    eprints = metadata.get('arxiv_eprints') or []
    # Guard against an empty list / missing 'value' rather than raising.
    if eprints and 'value' in eprints[-1]:
        return 'arXiv:' + eprints[-1]['value']
    return deepcopy(parsed_content_defaults['arxiv_id'])


def get_collaborations(metadata):
    """Return the list of collaboration names (the 'value' of each 'collaborations' entry), or the default when the key is absent."""
    fallback = deepcopy(parsed_content_defaults['collaborations'])
    if 'collaborations' not in metadata:
        return fallback
    return [entry['value'] for entry in metadata['collaborations']]


def get_keywords(metadata):
    """Return the 'keywords' entry of the metadata unchanged, or the default when it is absent."""
    fallback = deepcopy(parsed_content_defaults['keywords'])
    return metadata['keywords'] if 'keywords' in metadata else fallback


def get_journal_info(metadata):
    """
    Get the journal information of the publication. Format is 'title volume (year) article page_start-page_end' if at least one of these information is available
    in the first 'publication_info' entry, otherwise attempt to obtain it from 'pubinfo_freetext' of that entry or 'report_numbers' or 'public_notes'
    (first note containing 'Submitted to '). Defaults to 'No Journal Information'.

    :param metadata: dictionary of INSPIRE metadata for a record.
    :return: the journal information string.
    """
    # Only the first publication_info entry is considered. An empty list or a
    # non-dict entry is treated as "no information" instead of raising.
    publication_info = metadata.get('publication_info') or []
    first_pub = publication_info[0] if publication_info and isinstance(publication_info[0], dict) else {}

    parts = []
    if 'journal_title' in first_pub:
        parts.append(first_pub['journal_title'])
    if 'journal_volume' in first_pub:
        parts.append(first_pub['journal_volume'])
    if 'year' in first_pub:
        parts.append('(' + str(first_pub['year']) + ')')
    if 'artid' in first_pub:
        parts.append(first_pub['artid'])
    # Pages are only reported as a complete range.
    if 'page_start' in first_pub and 'page_end' in first_pub:
        parts.append(first_pub['page_start'] + "-" + first_pub['page_end'])
    if parts:
        return ' '.join(parts).strip()  # trim to remove whitespace

    # No structured information: try the free-text alternatives in order.
    journal_info = ''
    if 'pubinfo_freetext' in first_pub:
        journal_info = first_pub['pubinfo_freetext']
    elif metadata.get('report_numbers'):
        journal_info = metadata['report_numbers'][0]['value']
    else:
        journal_info = next(
            (public_note['value'].replace("Submitted to ", "")
             for public_note in metadata.get('public_notes') or []
             if "Submitted to " in public_note.get('value', '')),
            '')

    journal_info = journal_info.replace('. All figures', '')
    if journal_info != '':
        return journal_info
    return deepcopy(parsed_content_defaults['journal_info'])


def get_year(metadata):
    """Get the year of the publication. Try first 'imprints/date', then 'publication_info/year', then 'preprint_date', and finally 'legacy_creation_date'.

    :param metadata: dictionary of INSPIRE metadata for a record.
    :return: the year as a string, or the default if none of the sources is available.
    """
    # Only an imprint date that is exactly four characters long is accepted as a year.
    imprint_year = next(
        (imprint['date'] for imprint in metadata.get('imprints') or []
         if len(imprint.get('date', '')) == 4),
        None)
    if imprint_year is not None:
        return imprint_year

    # Only the first publication_info entry is consulted; an empty list is skipped.
    publication_info = metadata.get('publication_info') or []
    if publication_info and 'year' in publication_info[0]:
        return str(publication_info[0]['year'])

    # Both remaining dates start with the year, separated by '-'.
    if 'preprint_date' in metadata:
        return metadata['preprint_date'].split("-")[0]
    if 'legacy_creation_date' in metadata:
        return metadata['legacy_creation_date'].split("-")[0]
    return deepcopy(parsed_content_defaults['year'])


def get_subject_area(metadata):
    """Get the subject areas of the publication as a list without duplicates.

    Combines the 'categories' of the last entry in 'arxiv_eprints' with the 'term' of every
    'inspire_categories' entry (except 'Other'), after mapping the INSPIRE terms
    'Experiment-HEP', 'Experiment-Nucl' and 'Theory-Nucl' to 'hep-ex', 'nucl-ex' and 'nucl-th'.

    :param metadata: dictionary of INSPIRE metadata for a record.
    :return: list of subject areas in order of first appearance (arXiv categories first).
    """
    subject_area = deepcopy(parsed_content_defaults['subject_area'])

    # An empty eprint list or an eprint without 'categories' contributes nothing.
    eprints = metadata.get('arxiv_eprints') or []
    if eprints:
        subject_area += eprints[-1].get('categories', [])

    for entry in metadata.get('inspire_categories') or []:
        term = entry.get('term')
        if term is None or term == 'Other':
            continue
        subject_area.append(
            term.replace('Experiment-HEP', 'hep-ex').replace('Experiment-Nucl', 'nucl-ex').replace('Theory-Nucl', 'nucl-th'))

    # dict.fromkeys removes duplicates while keeping a reproducible order,
    # unlike a set whose iteration order can differ from run to run.
    return list(dict.fromkeys(subject_area))


def updated_parsed_content_for_thesis(content, parsed_content):
    """Add thesis-specific information to the parsed content of a record.

    Stores a tidied copy of content['metadata']['thesis_info'] under parsed_content['dissertation']
    and, if the thesis has a date, overrides parsed_content['year'] and parsed_content['creation_date'].

    :param content: dictionary with a 'metadata' entry that contains 'thesis_info'; it is left unmodified.
    :param parsed_content: dictionary of already parsed fields; it is updated in place.
    :return: the updated parsed_content.
    """
    thesis_info = content['metadata']['thesis_info']
    # Work on a copy: the renames below pop keys, which would otherwise
    # silently modify the caller's content['metadata']['thesis_info'].
    dissertation = deepcopy(thesis_info)
    parsed_content['dissertation'] = dissertation

    # fix dissertation/institutions -> dissertation/institution if there is only one
    if ('institutions' in dissertation and
            len(dissertation['institutions']) == 1 and
            'name' in dissertation['institutions'][0]):
        dissertation['institution'] = dissertation.pop('institutions')[0]['name']

    # update year with thesis info
    # NOTE(review): 'year' receives the thesis date verbatim, so it may hold more
    # than a four-digit year if the thesis date is a full date - confirm this is intended.
    if 'date' in thesis_info:
        parsed_content['year'] = thesis_info['date']
        if parsed_content['year'] is not None:
            if ('legacy_creation_date' in content['metadata'] and
                    content['metadata']['legacy_creation_date'][:4] == parsed_content['year']):
                # the legacy date is kept when it starts with the thesis year
                parsed_content['creation_date'] = content['metadata']['legacy_creation_date']
            else:
                parsed_content['creation_date'] = expand_date(parsed_content['year'])

    # fix capitals in dissertation/type
    if 'degree_type' in dissertation:
        dissertation['type'] = dissertation.pop('degree_type').title()
        if dissertation['type'] == "Phd":
            dissertation['type'] = "PhD"

    # rename dissertation/date -> dissertation/defense_date
    if 'date' in dissertation:
        dissertation['defense_date'] = dissertation.pop('date')
    return parsed_content


def expand_date(value):
    """
    Pad a partial '-'-separated date with '01' for
    every missing component, up to three components:
    2012-08 becomes 2012-08-01
    and 2012 becomes 2012-01-01.
    An empty string is returned unchanged.
    """
    if value == '':
        return value

    pieces = value.split('-')
    missing = 3 - len(pieces)
    if missing > 0:
        pieces.extend(['01'] * missing)
    return "-".join(pieces)

1	# This file is part of HEPData.
2	# Copyright (C) 2016 CERN.
3	#
4	# HEPData is free software; you can redistribute it
5	# and/or modify it under the terms of the GNU General Public License as
6	# published by the Free Software Foundation; either version 2 of the
7	# License, or (at your option) any later version.
8	#
9	# HEPData is distributed in the hope that it will be
10	# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12	# General Public License for more details.
13	#
14	# You should have received a copy of the GNU General Public License
15	# along with HEPData; if not, write to the
16	# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
17	# MA 02111-1307, USA.
18	#
19	# In applying this license, CERN does not
20	# waive the privileges and immunities granted to it by virtue of its status
21	# as an Intergovernmental Organization or submit itself to any jurisdiction.
22
23	"""Functions for parsing the new INSPIRE JSON metadata."""
24
25	from copy import deepcopy	1✔
26
27	parsed_content_defaults = {	1✔
28	'title': None,
29	'doi': None,
30	'authors': None,
31	'type': [],
32	'abstract': 'None',
33	'creation_date': None,
34	'arxiv_id': None,
35	'collaborations': [],
36	'keywords': [],
37	'journal_info': 'No Journal Information',
38	'year': None,
39	'subject_area': [],
40	}
41
42
43	def get_title(metadata):	1✔
44	"""Get the title of the publication from the first value in list of english translations (if applicable)
45	otherwise the first title in the list of titles or preferably the first arXiv title in the list."""
46	title = deepcopy(parsed_content_defaults['title'])	1✔
47	if 'title_translations' in metadata.keys():	1✔
48	for title_translation in metadata['title_translations']:	×
49	if title_translation['language'] == 'en':	×
50	title = title_translation['title']	×
51	if title is parsed_content_defaults['title'] and 'titles' in metadata.keys() and len(metadata['titles']) > 0:	1✔
52	title = metadata['titles'][0]['title']	1✔
53	for _title in metadata['titles']:	1✔
54	if 'title' in _title.keys() and 'source' in _title.keys() and _title['source'] == 'arXiv':	1✔
55	title = _title['title']	1✔
56	break	1✔
57	return title	1✔
58
59
60	def get_doi(metadata):	1✔
61	"""Get the DOI of the journal publication from the first value in the list of DOIs."""
62	doi = deepcopy(parsed_content_defaults['doi'])	1✔
63	if 'dois' in metadata and len(metadata['dois']) > 0:	1✔
64	doi = metadata['dois'][0]['value']	1✔
65	return doi	1✔
66
67
68	def get_authors(metadata):	1✔
69	"""Get the authors of the publication as a list of dictionaries with keys 'affiliation' and 'full_name'."""
70	authors = deepcopy(parsed_content_defaults['authors'])	1✔
71	if 'authors' in metadata.keys():	1✔
72	authors = [{'affiliation': (author['affiliations'][0]['value'] if 'affiliations' in author.keys() else ''),	1✔
73	'full_name': author['full_name']}
74	for author in metadata['authors']]
75	return authors	1✔
76
77
78	def get_type(metadata):	1✔
79	"""Get the type of the publication."""
80	_type = deepcopy(parsed_content_defaults['type'])	1✔
81	if 'document_type' in metadata.keys():	1✔
82	_type = metadata['document_type']	1✔
83	return _type	1✔
84
85
86	def get_abstract(metadata):	1✔
87	"""Get the abstract of the publication, ideally the one from the arXiv version, otherwise the first one."""
88	abstract = deepcopy(parsed_content_defaults['abstract'])	1✔
89	if 'abstracts' in metadata.keys():	1✔
90	abstract = metadata['abstracts'][0]['value']	1✔
91	for _abstract in metadata['abstracts']:	1✔
92	if 'value' in _abstract.keys() and 'source' in _abstract.keys() and _abstract['source'] == 'arXiv':	1✔
93	abstract = _abstract['value']	1✔
94	break	1✔
95	return abstract	1✔
96
97
98	def get_creation_date(metadata):	1✔
99	"""Get the creation date of the publication, first try to expand the preprint_date, otherwise try legacy_creation_date."""
100	creation_date = deepcopy(parsed_content_defaults['creation_date'])	1✔
101	if 'preprint_date' in metadata.keys():	1✔
102	creation_date = expand_date(metadata['preprint_date'])	1✔
103	elif 'legacy_creation_date' in metadata:	1✔
104	creation_date = metadata['legacy_creation_date']	1✔
105	return creation_date	1✔
106
107
108	def get_arxiv_id(metadata):	1✔
109	"""Get the arxiv id of the publication from the last value in the list of arxiv eprints."""
110	arxiv_id = deepcopy(parsed_content_defaults['arxiv_id'])	1✔
111	if 'arxiv_eprints' in metadata.keys():	1✔
112	arxiv_id = 'arXiv:' + metadata['arxiv_eprints'][-1]['value']	1✔
113	return arxiv_id	1✔
114
115
116	def get_collaborations(metadata):	1✔
117	"""Get the collaborations of the publication as a list."""
118	collaborations = deepcopy(parsed_content_defaults['collaborations'])	1✔
119	if 'collaborations' in metadata:	1✔
120	collaborations = [collaboration['value'] for collaboration in metadata['collaborations']]	1✔
121	return collaborations	1✔
122
123
124	def get_keywords(metadata):	1✔
125	"""Get the keywords of the publication."""
126	keywords = deepcopy(parsed_content_defaults['keywords'])	1✔
127	if 'keywords' in metadata.keys():	1✔
128	keywords = metadata['keywords']	1✔
129	return keywords	1✔
130
131
132	def get_journal_info(metadata):	1✔
133	"""
134	Get the journal information of the publication. Format is 'title volume (year) article page_start-page_end' if at least one of these information is available,
135	otherwise attempt to obtain it from 'pubinfo_freetext' or 'publication_info' or 'report_numbers' or 'public_notes'. Defaults to 'No Journal Information'.
136	"""
137	default_journal_info, journal_info = deepcopy(parsed_content_defaults['journal_info']), ''	1✔
138	if 'publication_info' in metadata:	1✔
139	if 'journal_title' in metadata['publication_info'][0].keys():	1✔
140	journal_info += metadata['publication_info'][0]['journal_title'] + ' '	1✔
141	if 'journal_volume' in metadata['publication_info'][0].keys():	1✔
142	journal_info += metadata['publication_info'][0]['journal_volume'] + ' '	1✔
143	if 'year' in metadata['publication_info'][0].keys():	1✔
144	journal_info += '(' + str(metadata['publication_info'][0]['year']) + ') '	1✔
145	if 'artid' in metadata['publication_info'][0].keys():	1✔
146	journal_info += metadata['publication_info'][0]['artid'] + ' '	1✔
147	if 'page_start' in metadata['publication_info'][0].keys() and 'page_end' in metadata['publication_info'][0].keys():	1✔
148	journal_info += metadata['publication_info'][0]['page_start'] + "-" + metadata['publication_info'][0]['page_end']	1✔
149	if journal_info != '':	1✔
150	journal_info = journal_info.strip() # trim to remove whitespace	1✔
151	return journal_info	1✔
152	if ('publication_info' in metadata and len(metadata['publication_info']) > 0 and type(metadata['publication_info'][0]) is dict and	1✔
153	'pubinfo_freetext' in metadata['publication_info'][0].keys()):
154	journal_info = metadata['publication_info'][0]['pubinfo_freetext']	×
155	elif 'report_numbers' in metadata and len(metadata['report_numbers']) > 0:	1✔
156	journal_info = metadata['report_numbers'][0]['value']	1✔
157	elif ('public_notes' in metadata.keys() and any(['value' in public_note.keys() and "Submitted to " in public_note['value'] for public_note in metadata['public_notes']])):	1✔
158	journal_info = [public_note['value'].replace("Submitted to ", "") for public_note in metadata['public_notes'] if	×
159	('value' in public_note.keys() and "Submitted to " in public_note['value'])][0]
160	if '. All figures' in journal_info:	1✔
161	journal_info = journal_info.replace('. All figures', '')	×
162	if journal_info != '':	1✔
163	return journal_info	1✔
164	else:
165	return default_journal_info	1✔
166
167
168	def get_year(metadata):	1✔
169	"""Get the year of the publication. Try first 'imprints/date', then 'publication_info/year', then 'preprint_date', and finally 'legacy_creation_date'."""
170	year = deepcopy(parsed_content_defaults['year'])	1✔
171	if 'imprints' in metadata.keys() and any(['date' in imprint.keys() and len(imprint['date']) == 4 for imprint in metadata['imprints']]):	1✔
172	year = [imprint['date'] for imprint in metadata['imprints'] if 'date' in imprint.keys() and len(imprint['date']) == 4][0]	1✔
173	elif ('publication_info' in metadata and 'year' in metadata['publication_info'][0].keys()):	1✔
174	year = str(metadata['publication_info'][0]['year'])	1✔
175	elif 'preprint_date' in metadata.keys():	1✔
176	year = metadata['preprint_date'].split("-")[0]	1✔
177	elif 'legacy_creation_date' in metadata:	1✔
178	year = metadata['legacy_creation_date'].split("-")[0]	1✔
179	return year	1✔
180
181
182	def get_subject_area(metadata):	1✔
183	subject_area = deepcopy(parsed_content_defaults['subject_area'])	1✔
184	if 'arxiv_eprints' in metadata.keys():	1✔
185	subject_area += metadata['arxiv_eprints'][-1]['categories']	1✔
186	if ('inspire_categories' in metadata.keys() and len(metadata['inspire_categories']) > 0):	1✔
187	subject_area += [entry['term'].replace('Experiment-HEP', 'hep-ex').replace('Experiment-Nucl', 'nucl-ex').replace('Theory-Nucl', 'nucl-th') for	1✔
188	entry in metadata['inspire_categories'] if 'term' in entry.keys() and entry['term'] != 'Other']
189	subject_area = list(set(subject_area))	1✔
190	return subject_area	1✔
191
192
193	def updated_parsed_content_for_thesis(content, parsed_content):	1✔
194	parsed_content['dissertation'] = content['metadata']['thesis_info']	1✔
195	# fix dissertation/institutions -> dissertation/institution if there is only one
196	if ('institutions' in parsed_content['dissertation'].keys() and	1✔
197	len(parsed_content['dissertation']['institutions']) == 1 and
198	'name' in parsed_content['dissertation']['institutions'][0]):
199	parsed_content['dissertation']['institution'] = parsed_content['dissertation']['institutions'][0]['name']	1✔
200	parsed_content['dissertation'].pop('institutions')	1✔
201	# update year with thesis info
202	if 'date' in content['metadata']['thesis_info'].keys():	1✔
203	parsed_content['year'] = content['metadata']['thesis_info']['date']	1✔
204	if parsed_content['year'] is not None:	1✔
205	if ('legacy_creation_date' in content['metadata'].keys() and	1✔
206	content['metadata']['legacy_creation_date'][:4] == parsed_content['year']):
207	parsed_content['creation_date'] = content['metadata']['legacy_creation_date']	×
208	else:
209	parsed_content['creation_date'] = expand_date(parsed_content['year'])	1✔
210	# fix capitals in dissertation/type
211	if 'degree_type' in parsed_content['dissertation'].keys():	1✔
212	parsed_content['dissertation']['type'] = parsed_content['dissertation'].pop('degree_type').title()	1✔
213	if parsed_content['dissertation']['type'] == "Phd":	1✔
214	parsed_content['dissertation']['type'] = "PhD"	1✔
215	# fix dissertation/defence_date string
216	if 'date' in parsed_content['dissertation'].keys():	1✔
217	parsed_content['dissertation']['defense_date'] = parsed_content['dissertation'].pop('date')	1✔
218	return parsed_content	1✔
219
220
221	def expand_date(value):	1✔
222	"""
223	In the case where the date is not completely
224	formed, we need to expand it out.
225	so 2012-08 will be 2012-08-01
226	and 2012 will be 2012-01-01.
227	If nothing, we do nothing.
228	"""
229	if value == '':	1✔
230	return value	1✔
231
232	date_parts = value.split('-')	1✔
233
234	if len(date_parts) == 1:	1✔
235	date_parts.append('01')	1✔
236	if len(date_parts) == 2:	1✔
237	date_parts.append('01')	1✔
238	return "-".join(date_parts)	1✔

HEPData / hepdata / 12163548050

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous