• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

inspirehep / inspire-next / 12564

pending completion
12564

Pull #3460

travis-ci

web-flow
disambiguation: add signature pairs sampling

Since INSPIRE has ~3M curated signatures, it would take too much time
to train on all possible pairs, so we sample 1M pairs in such a way
that they are representative of the known cluster structure.

Signed-off-by: Jacopo Notarstefano <jacopo.notarstefano@gmail.com>
Pull Request #3460: WIP disambiguation: add signature pairs sampling

60 of 60 new or added lines in 3 files covered. (100.0%)

7639 of 9664 relevant lines covered (79.05%)

2.4 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

65.45
/inspirehep/modules/hal/core/tei.py
1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of INSPIRE.
4
# Copyright (C) 2014-2017 CERN.
5
#
6
# INSPIRE is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# INSPIRE is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
18
#
19
# In applying this license, CERN does not waive the privileges and immunities
20
# granted to it by virtue of its status as an Intergovernmental Organization
21
# or submit itself to any jurisdiction.
22

23
"""HAL TEI core."""
5✔
24

25
from __future__ import absolute_import, division, print_function
5✔
26

27
from flask import render_template
5✔
28
from langdetect import detect
5✔
29
from langdetect.lang_detect_exception import LangDetectException
5✔
30

31
from inspirehep.utils.record import (
5✔
32
    get_abstract,
33
    get_arxiv_id,
34
    get_collaborations,
35
    get_keywords,
36
    get_subtitle,
37
    get_title,
38
)
39

40
from ..utils import (
5✔
41
    get_authors,
42
    get_conference_city,
43
    get_conference_country,
44
    get_conference_end_date,
45
    get_conference_record,
46
    get_conference_start_date,
47
    get_conference_title,
48
    get_divulgation,
49
    get_document_types,
50
    get_doi,
51
    get_domains,
52
    get_inspire_id,
53
    get_journal_issue,
54
    get_journal_title,
55
    get_journal_volume,
56
    get_language,
57
    get_page_artid,
58
    get_peer_reviewed,
59
    get_publication_date,
60
    is_published,
61
)
62

63

64
def convert_to_tei(record):
    """Return the record formatted in XML+TEI per HAL's specification.

    The record is checked against the supported typologies in a fixed
    order: conference communication first, then published article, then
    preprint. The first check that succeeds selects both the template and
    the context that is rendered into it.

    Args:
        record(InspireRecord): a record.

    Returns:
        string: the record formatted in XML+TEI.

    Raises:
        NotImplementedError: if none of the checks succeeds, that is, if
            neither ``'conference paper'`` nor ``'article'`` is among the
            document types of the record.

    Examples:
        >>> record = get_db_record('lit', 1407506)
        >>> convert_to_tei(record)
        <?xml version="1.0" encoding="UTF-8"?>
        ...

    """
    # Order matters: a conference paper wins over an article, and a
    # published article wins over a preprint.
    if _is_comm(record):
        ctx = _get_comm_context(record)
        return render_template('hal/comm.xml', **ctx)
    elif _is_art(record):
        ctx = _get_art_context(record)
        return render_template('hal/art.xml', **ctx)
    elif _is_preprint(record):
        ctx = _get_preprint_context(record)
        return render_template('hal/preprint.xml', **ctx)

    # Say why the conversion failed instead of raising a bare exception.
    raise NotImplementedError(
        'no TEI template for a record with document types {!r}'.format(
            get_document_types(record)
        )
    )
×
91

92

93
def _is_comm(record):
    """Tell whether the record is a conference communication.

    The check passes when ``'conference paper'`` is among the document
    types returned by ``get_document_types`` for the record.
    """
    return 'conference paper' in get_document_types(record)
2✔
97

98

99
def _get_comm_context(record):
    """Build the context rendered into the ``hal/comm.xml`` template.

    Besides the bibliographic fields read from the record itself, the
    context carries the city, country, dates and title extracted from the
    conference record returned by ``get_conference_record``.

    Args:
        record(InspireRecord): a record.

    Returns:
        dict: the keyword arguments for the template.
    """
    text = get_abstract(record)
    try:
        detected_language = detect(text)
    except LangDetectException:
        # No language could be detected: leave it blank.
        detected_language = ''

    conference = get_conference_record(record)
    city = get_conference_city(conference)
    country = get_conference_country(conference)
    end_date = get_conference_end_date(conference)
    start_date = get_conference_start_date(conference)
    conference_name = get_conference_title(conference)

    return dict(
        abstract=text,
        abstract_language=detected_language,
        arxiv_id=get_arxiv_id(record),
        authors=get_authors(record),
        collaborations=get_collaborations(record),
        conference_city=city,
        conference_country=country,
        conference_end_date=end_date,
        conference_start_date=start_date,
        conference_title=conference_name,
        divulgation=get_divulgation(record),
        doi=get_doi(record),
        domains=get_domains(record),
        inspire_id=get_inspire_id(record),
        journal_issue=get_journal_issue(record),
        journal_title=get_journal_title(record),
        journal_volume=get_journal_volume(record),
        keywords=get_keywords(record),
        language=get_language(record),
        page_artid=get_page_artid(record),
        peer_reviewed=get_peer_reviewed(record),
        publication_date=get_publication_date(record),
        subtitle=get_subtitle(record),
        title=get_title(record),
    )
139

140

141
def _is_art(record):
    """Tell whether the record is a published article.

    Both helpers are always called. When ``'article'`` is among the
    document types the result is whatever ``is_published`` returned,
    otherwise it is ``False``.
    """
    kinds = get_document_types(record)
    publication_status = is_published(record)

    if 'article' not in kinds:
        return False

    return publication_status
2✔
146

147

148
def _get_art_context(record):
    """Build the context rendered into the ``hal/art.xml`` template.

    Args:
        record(InspireRecord): a record.

    Returns:
        dict: the keyword arguments for the template, including the
        journal and DOI information of the record.
    """
    text = get_abstract(record)
    try:
        detected_language = detect(text)
    except LangDetectException:
        # No language could be detected: leave it blank.
        detected_language = ''

    return dict(
        abstract=text,
        abstract_language=detected_language,
        arxiv_id=get_arxiv_id(record),
        authors=get_authors(record),
        collaborations=get_collaborations(record),
        divulgation=get_divulgation(record),
        doi=get_doi(record),
        domains=get_domains(record),
        inspire_id=get_inspire_id(record),
        journal_issue=get_journal_issue(record),
        journal_title=get_journal_title(record),
        journal_volume=get_journal_volume(record),
        keywords=get_keywords(record),
        language=get_language(record),
        page_artid=get_page_artid(record),
        peer_reviewed=get_peer_reviewed(record),
        publication_date=get_publication_date(record),
        subtitle=get_subtitle(record),
        title=get_title(record),
    )
176

177

178
def _is_preprint(record):
    """Tell whether the record can be exported as a preprint.

    The check passes when ``'article'`` is among the document types
    returned by ``get_document_types`` for the record.
    """
    return 'article' in get_document_types(record)
1✔
182

183

184
def _get_preprint_context(record):
    """Build the context rendered into the ``hal/preprint.xml`` template.

    Args:
        record(InspireRecord): a record.

    Returns:
        dict: the keyword arguments for the template. Unlike the article
        context it carries no DOI, journal, page or publication date
        information.
    """
    text = get_abstract(record)
    try:
        detected_language = detect(text)
    except LangDetectException:
        # No language could be detected: leave it blank.
        detected_language = ''

    return dict(
        abstract=text,
        abstract_language=detected_language,
        arxiv_id=get_arxiv_id(record),
        authors=get_authors(record),
        collaborations=get_collaborations(record),
        divulgation=get_divulgation(record),
        domains=get_domains(record),
        inspire_id=get_inspire_id(record),
        keywords=get_keywords(record),
        language=get_language(record),
        subtitle=get_subtitle(record),
        title=get_title(record),
    )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc