• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

HEPData / hepdata / 12163548050

04 Dec 2024 03:58PM UTC coverage: 83.629% (+0.2%) from 83.45%
12163548050

push

github

ItIsJordan
Fix bug in search_test.py

Fixes a bug where testing for MadAnalysis was checking against the database entry and not the document dictionary (as it should have been)

4572 of 5467 relevant lines covered (83.63%)

0.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.78
/hepdata/modules/inspire_api/parser.py
1
# This file is part of HEPData.
2
# Copyright (C) 2016 CERN.
3
#
4
# HEPData is free software; you can redistribute it
5
# and/or modify it under the terms of the GNU General Public License as
6
# published by the Free Software Foundation; either version 2 of the
7
# License, or (at your option) any later version.
8
#
9
# HEPData is distributed in the hope that it will be
10
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
# General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with HEPData; if not, write to the
16
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
17
# MA 02111-1307, USA.
18
#
19
# In applying this license, CERN does not
20
# waive the privileges and immunities granted to it by virtue of its status
21
# as an Intergovernmental Organization or submit itself to any jurisdiction.
22

23
"""Functions for parsing the new INSPIRE JSON metadata."""
24

25
from copy import deepcopy
1✔
26

27
# Fallback values used by the get_* parsers below when the corresponding
# field cannot be found in the INSPIRE metadata.  The parsers deepcopy the
# value they need, so the mutable (list) defaults here are never shared.
parsed_content_defaults = dict(
    title=None,
    doi=None,
    authors=None,
    type=[],
    abstract='None',  # NB: the string 'None', not the None singleton
    creation_date=None,
    arxiv_id=None,
    collaborations=[],
    keywords=[],
    journal_info='No Journal Information',
    year=None,
    subject_area=[],
)
41

42

43
def get_title(metadata):
    """Get the title of the publication.

    Order of preference:
      1. the first English entry in 'title_translations';
      2. the first entry in 'titles' whose 'source' is 'arXiv';
      3. the first entry in 'titles'.

    :param metadata: dictionary of INSPIRE record metadata.
    :return: the title, or the default from parsed_content_defaults if none is found.
    """
    title = deepcopy(parsed_content_defaults['title'])
    for title_translation in metadata.get('title_translations') or []:
        # .get() so that a translation without a 'language' key is skipped rather than raising.
        if title_translation.get('language') == 'en' and 'title' in title_translation:
            title = title_translation['title']
            break  # keep the *first* English translation, as documented
    titles = metadata.get('titles') or []
    if title is parsed_content_defaults['title'] and len(titles) > 0:
        title = titles[0].get('title', title)
        for _title in titles:
            if 'title' in _title and _title.get('source') == 'arXiv':
                title = _title['title']
                break
    return title
1✔
58

59

60
def get_doi(metadata):
    """Return the 'value' of the first entry in metadata['dois'], or the default DOI if there is none."""
    fallback = deepcopy(parsed_content_defaults['doi'])
    if 'dois' not in metadata:
        return fallback
    doi_entries = metadata['dois']
    if len(doi_entries) == 0:
        return fallback
    return doi_entries[0]['value']
1✔
66

67

68
def get_authors(metadata):
    """Get the authors of the publication as a list of dictionaries with keys 'affiliation' and 'full_name'.

    The affiliation is the 'value' of the author's first affiliation, or an empty
    string if the author has no (or an empty list of) affiliations.

    :param metadata: dictionary of INSPIRE record metadata.
    :return: list of author dictionaries, or the default from parsed_content_defaults
        if the metadata has no 'authors' key.
    """
    authors = deepcopy(parsed_content_defaults['authors'])
    if 'authors' in metadata:
        authors = []
        for author in metadata['authors']:
            # An empty 'affiliations' list must not raise IndexError.
            affiliations = author.get('affiliations')
            affiliation = affiliations[0].get('value', '') if affiliations else ''
            authors.append({'affiliation': affiliation, 'full_name': author['full_name']})
    return authors
1✔
76

77

78
def get_type(metadata):
    """Return metadata['document_type'] when present, otherwise a copy of the default type."""
    fallback = deepcopy(parsed_content_defaults['type'])
    if 'document_type' not in metadata.keys():
        return fallback
    return metadata['document_type']
1✔
84

85

86
def get_abstract(metadata):
    """Get the abstract of the publication, ideally the one from the arXiv version, otherwise the first one.

    :param metadata: dictionary of INSPIRE record metadata.
    :return: the abstract text, or the default from parsed_content_defaults when the
        metadata has no (or an empty list of) abstracts.
    """
    abstract = deepcopy(parsed_content_defaults['abstract'])
    abstracts = metadata.get('abstracts')
    if abstracts:  # guards against both a missing key and an empty list
        # Fall back to the default if the first entry carries no 'value'.
        abstract = abstracts[0].get('value', abstract)
        for _abstract in abstracts:
            if 'value' in _abstract and _abstract.get('source') == 'arXiv':
                abstract = _abstract['value']
                break
    return abstract
1✔
96

97

98
def get_creation_date(metadata):
    """Return the creation date: the expanded 'preprint_date' if present, else 'legacy_creation_date', else the default."""
    fallback = deepcopy(parsed_content_defaults['creation_date'])
    if 'preprint_date' in metadata.keys():
        # The preprint date may be partial (e.g. year only), so pad it out.
        return expand_date(metadata['preprint_date'])
    if 'legacy_creation_date' in metadata:
        return metadata['legacy_creation_date']
    return fallback
1✔
106

107

108
def get_arxiv_id(metadata):
    """Get the arxiv id of the publication from the last value in the list of arxiv eprints.

    :param metadata: dictionary of INSPIRE record metadata.
    :return: string of the form 'arXiv:<value>', or the default from parsed_content_defaults
        when the metadata has no (or an empty list of) arxiv eprints.
    """
    arxiv_id = deepcopy(parsed_content_defaults['arxiv_id'])
    arxiv_eprints = metadata.get('arxiv_eprints')
    if arxiv_eprints:  # an empty list must not raise IndexError
        arxiv_id = 'arXiv:' + arxiv_eprints[-1]['value']
    return arxiv_id
1✔
114

115

116
def get_collaborations(metadata):
    """Return the 'value' of every entry in metadata['collaborations'] as a list, or a copy of the default."""
    fallback = deepcopy(parsed_content_defaults['collaborations'])
    if 'collaborations' not in metadata:
        return fallback
    names = []
    for entry in metadata['collaborations']:
        names.append(entry['value'])
    return names
1✔
122

123

124
def get_keywords(metadata):
    """Return metadata['keywords'] unchanged when present, otherwise a copy of the default keywords."""
    fallback = deepcopy(parsed_content_defaults['keywords'])
    if 'keywords' not in metadata.keys():
        return fallback
    return metadata['keywords']
1✔
130

131

132
def get_journal_info(metadata):
    """
    Get the journal information of the publication.

    Format is 'title volume (year) article page_start-page_end', built from whichever of
    these fields are present in the first 'publication_info' entry.  If none of them
    yields any text, fall back in order to 'pubinfo_freetext' of that entry, the first
    'report_numbers' value, or the first 'public_notes' value containing "Submitted to "
    (with that prefix removed).  Defaults to 'No Journal Information'.

    :param metadata: dictionary of INSPIRE record metadata.
    :return: string of journal information.
    """
    default_journal_info, journal_info = deepcopy(parsed_content_defaults['journal_info']), ''

    # Only the first publication_info entry is used; treat a missing key, an empty
    # list or a non-dict entry as "no information" instead of raising.
    publication_info = metadata.get('publication_info')
    first_publication = {}
    if publication_info and isinstance(publication_info[0], dict):
        first_publication = publication_info[0]

    if 'journal_title' in first_publication:
        journal_info += first_publication['journal_title'] + ' '
    if 'journal_volume' in first_publication:
        journal_info += first_publication['journal_volume'] + ' '
    if 'year' in first_publication:
        journal_info += '(' + str(first_publication['year']) + ') '
    if 'artid' in first_publication:
        journal_info += first_publication['artid'] + ' '
    if 'page_start' in first_publication and 'page_end' in first_publication:
        journal_info += first_publication['page_start'] + "-" + first_publication['page_end']

    journal_info = journal_info.strip()  # trim to remove whitespace
    if journal_info != '':
        return journal_info
    # Nothing usable was assembled (this includes whitespace-only text), so try the fallbacks.

    if 'pubinfo_freetext' in first_publication:
        journal_info = first_publication['pubinfo_freetext']
    elif metadata.get('report_numbers'):
        journal_info = metadata['report_numbers'][0]['value']
    else:
        # First public note that mentions a submission target, if any.
        journal_info = next(
            (public_note['value'].replace("Submitted to ", "")
             for public_note in metadata.get('public_notes') or []
             if 'value' in public_note and "Submitted to " in public_note['value']),
            '')

    if '. All figures' in journal_info:
        journal_info = journal_info.replace('. All figures', '')
    if journal_info != '':
        return journal_info
    return default_journal_info
1✔
166

167

168
def get_year(metadata):
    """Get the year of the publication.

    Try first 'imprints/date' (only values that are exactly four characters long), then
    'publication_info/year' of the first entry, then 'preprint_date', and finally
    'legacy_creation_date'.

    :param metadata: dictionary of INSPIRE record metadata.
    :return: the year as a string, or the default from parsed_content_defaults.
    """
    year = deepcopy(parsed_content_defaults['year'])
    # Filter the imprints once instead of once for the test and again for the value.
    imprint_years = [imprint['date'] for imprint in metadata.get('imprints') or []
                     if 'date' in imprint and len(imprint['date']) == 4]
    publication_info = metadata.get('publication_info')
    if imprint_years:
        year = imprint_years[0]
    elif publication_info and 'year' in publication_info[0]:  # empty list must not raise IndexError
        year = str(publication_info[0]['year'])
    elif 'preprint_date' in metadata:
        year = metadata['preprint_date'].split("-")[0]
    elif 'legacy_creation_date' in metadata:
        year = metadata['legacy_creation_date'].split("-")[0]
    return year
1✔
180

181

182
def get_subject_area(metadata):
    """Get the subject areas of the publication as a list without duplicates.

    Combines the 'categories' of the last entry in 'arxiv_eprints' with the 'term' of each
    entry in 'inspire_categories' (except 'Other'), where some INSPIRE terms are rewritten
    to the corresponding arXiv-style category name.

    :param metadata: dictionary of INSPIRE record metadata.
    :return: list of subject areas, in order of first appearance.
    """
    subject_area = deepcopy(parsed_content_defaults['subject_area'])
    arxiv_eprints = metadata.get('arxiv_eprints')
    if arxiv_eprints:  # tolerate an empty list or an eprint without 'categories'
        subject_area += arxiv_eprints[-1].get('categories', [])
    for entry in metadata.get('inspire_categories') or []:
        if 'term' in entry and entry['term'] != 'Other':
            subject_area.append(entry['term'].replace('Experiment-HEP', 'hep-ex')
                                .replace('Experiment-Nucl', 'nucl-ex')
                                .replace('Theory-Nucl', 'nucl-th'))
    # dict.fromkeys removes duplicates while keeping a deterministic order
    # (list(set(...)) gives an arbitrary order that can differ between runs).
    subject_area = list(dict.fromkeys(subject_area))
    return subject_area
1✔
191

192

193
def updated_parsed_content_for_thesis(content, parsed_content):
    """Add thesis-specific information to the parsed content.

    Sets parsed_content['dissertation'] from content['metadata']['thesis_info'] and, when
    the thesis has a date, updates parsed_content['year'] and parsed_content['creation_date'].

    :param content: dictionary with a 'metadata' key whose value contains 'thesis_info'.
        It is not modified.
    :param parsed_content: dictionary of already-parsed values; updated in place.
    :return: the same parsed_content dictionary.
    """
    metadata = content['metadata']
    thesis_info = metadata['thesis_info']
    # Work on a copy: the renaming below pops keys and must not alter the caller's record.
    dissertation = deepcopy(thesis_info)
    parsed_content['dissertation'] = dissertation
    # fix dissertation/institutions -> dissertation/institution if there is only one
    if ('institutions' in dissertation and
            len(dissertation['institutions']) == 1 and
            'name' in dissertation['institutions'][0]):
        dissertation['institution'] = dissertation['institutions'][0]['name']
        dissertation.pop('institutions')
    # update year with thesis info
    if 'date' in thesis_info:
        parsed_content['year'] = thesis_info['date']
        if parsed_content['year'] is not None:
            if ('legacy_creation_date' in metadata and
                    metadata['legacy_creation_date'][:4] == parsed_content['year']):
                # The legacy date starts with the thesis date, so prefer the more precise value.
                parsed_content['creation_date'] = metadata['legacy_creation_date']
            else:
                parsed_content['creation_date'] = expand_date(parsed_content['year'])
    # fix capitals in dissertation/type
    if 'degree_type' in dissertation:
        dissertation['type'] = dissertation.pop('degree_type').title()
        if dissertation['type'] == "Phd":
            dissertation['type'] = "PhD"
    # rename dissertation/date -> dissertation/defense_date
    if 'date' in dissertation:
        dissertation['defense_date'] = dissertation.pop('date')
    return parsed_content
1✔
219

220

221
def expand_date(value):
    """
    In the case where the date is not completely
    formed, we need to expand it out.
    so 2012-08 will be 2012-08-01
    and 2012 will be 2012-01-01.
    If nothing (empty string or None), we do nothing.

    :param value: date string with '-'-separated parts, or an empty value.
    :return: the date padded with '01' parts up to three parts, or the
        value unchanged if it is empty.
    """
    if not value:
        return value

    date_parts = value.split('-')
    # Pad a missing month and/or day with '01'; a complete date gets nothing added.
    date_parts += ['01'] * (3 - len(date_parts))
    return "-".join(date_parts)
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc