• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

inspirehep / inspire-next / 12538

pending completion
12538

Pull #3458

travis-ci

web-flow
migrator view: filter out 'DELETED' invalid records

Do not return invalid records with collection 'DELETED' to the view.

Also, add a composite index on `valid` and `collection` columns of the
LegacyRecordsMirror table model and create an alembic recipe for it.

Signed-off-by: Iuliana Voinea <iuliana.voinea@student.manchester.ac.uk>
Pull Request #3458: migrator view: filter out 'DELETED' invalid records

2 of 2 new or added lines in 2 files covered. (100.0%)

6083 of 9390 relevant lines covered (64.78%)

1.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.66
/inspirehep/modules/workflows/tasks/arxiv.py
1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of INSPIRE.
4
# Copyright (C) 2014-2017 CERN.
5
#
6
# INSPIRE is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# INSPIRE is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
18
#
19
# In applying this license, CERN does not waive the privileges and immunities
20
# granted to it by virtue of its status as an Intergovernmental Organization
21
# or submit itself to any jurisdiction.
22

23
"""Tasks used in OAI harvesting for arXiv record manipulation."""
4✔
24

25
from __future__ import absolute_import, division, print_function
4✔
26

27
import os
4✔
28
import re
4✔
29
from functools import wraps
4✔
30

31
import backoff
4✔
32
import requests
4✔
33
from backports.tempfile import TemporaryDirectory
4✔
34
from flask import current_app
4✔
35
from lxml.etree import XMLSyntaxError
4✔
36
from timeout_decorator import timeout
4✔
37
from wand.exceptions import DelegateError
4✔
38
from werkzeug import secure_filename
4✔
39

40
from inspire_dojson import marcxml2record
4✔
41
from inspire_schemas.builders import LiteratureBuilder
4✔
42
from inspire_schemas.utils import classify_field
4✔
43
from plotextractor.api import process_tarball
4✔
44
from plotextractor.converter import untar
4✔
45
from plotextractor.errors import InvalidTarball, NoTexFilesFound
4✔
46

47
from inspirehep.utils.latex import decode_latex
4✔
48
from inspirehep.utils.record import get_arxiv_categories, get_arxiv_id
4✔
49
from inspirehep.utils.url import is_pdf_link, retrieve_uri
4✔
50
from inspirehep.modules.workflows.errors import DownloadError
4✔
51
from inspirehep.modules.workflows.utils import (
4✔
52
    convert,
53
    download_file_to_workflow,
54
    ignore_timeout_error,
55
    with_debug_logging,
56
)
57

58
# Matches a complete ``<collaborationauthorlist>`` element. ``re.DOTALL`` lets
# ``.`` cross the line breaks inside the element.
REGEXP_AUTHLIST = re.compile(
    r"<collaborationauthorlist.*?>.*?</collaborationauthorlist>",
    re.DOTALL,
)

# Captures, for a ``<record>`` element, everything between the end of its
# first ``<controlfield>`` element and the closing ``</record>`` tag.
REGEXP_REFS = re.compile(
    r"<record.*?>.*?<controlfield .*?>.*?</controlfield>(.*?)</record>",
    re.DOTALL,
)

# Marker text searched for in the body served by a non-PDF arXiv link.
# NOTE(review): presumably the notice arXiv shows when a paper has no PDF at
# all -- confirm the wording still matches what arXiv serves.
NO_PDF_ON_ARXIV = 'The author has provided no source to generate PDF, and no PDF.'
4✔
64

65

66
@with_debug_logging
@backoff.on_exception(backoff.expo, DownloadError, base=4, max_tries=5)
def populate_arxiv_document(obj, eng):
    """Register the arXiv PDF of the record as a hidden fulltext document.

    The configured PDF URLs are probed in order and the first one accepted by
    ``is_pdf_link`` is used. When a rejected URL serves a body containing
    ``NO_PDF_ON_ARXIV``, this is logged and the record is left untouched.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    :raises DownloadError: when none of the configured URLs serves a PDF; the
        ``backoff`` decorator retries the task on this error.
    """
    arxiv_id = get_arxiv_id(obj.data)
    config = current_app.config

    for setting in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        pdf_url = config[setting].format(arxiv_id=arxiv_id)
        if is_pdf_link(pdf_url):
            break

        response = requests.get(pdf_url)
        if NO_PDF_ON_ARXIV in response.content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
    else:
        # Every candidate was tried and none of them is a PDF link; report
        # the last URL that was probed.
        raise DownloadError(
            "{url} is not serving a PDF file.".format(url=pdf_url)
        )

    pdf_key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Drop any document already stored under the same key so that the PDF is
    # not listed twice when the task is run again.
    kept_documents = []
    for existing in obj.data.get('documents', ()):
        if existing.get('key') == pdf_key:
            continue
        kept_documents.append(existing)
    obj.data['documents'] = kept_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        pdf_key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=pdf_url,
    )
    obj.data = builder.record
1✔
100

101

102
@with_debug_logging
def arxiv_package_download(obj, eng):
    """Download the arXiv source tarball of the record into the workflow.

    The outcome is only reported through ``obj.log``; nothing is returned.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball_url = current_app.config['ARXIV_TARBALL_URL'].format(
        arxiv_id=arxiv_id,
    )

    downloaded = download_file_to_workflow(
        workflow=obj,
        name=tarball_name,
        url=tarball_url,
    )

    if not downloaded:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
        return

    obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)
×
121

122

123
@ignore_timeout_error
@timeout(5 * 60)
@with_debug_logging
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball stored in ``obj.files`` under ``<arxiv_id>.tar.gz`` is
    processed with ``process_tarball`` in a temporary directory. Existing
    ``figures`` of the record (and their files) are removed, then every
    extracted plot is stored in ``obj.files`` and added to the record as a
    ``preprint`` figure. Nothing happens when the tarball entry is falsy.

    An invalid tarball, a tarball without TeX files, or a ``DelegateError``
    while processing is logged and the task returns without modifying the
    record.

    The task is wrapped in ``timeout(5 * 60)``; ``ignore_timeout_error``
    presumably swallows the resulting timeout -- confirm against its
    definition.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if not tarball:
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        # Start from a clean slate: forget the figures from a previous run
        # together with the files backing them.
        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_path = plot.get('url')
            plot_name = os.path.basename(plot_path)
            key = plot_name
            # NOTE(review): ``keys`` is read as an attribute, not called --
            # presumably a property of the files container; confirm.
            if plot_name in obj.files.keys:
                # Avoid clobbering a file that already uses this name.
                key = 'w{number}_{name}'.format(
                    number=index,
                    name=plot_name,
                )

            # Plots are image files: read them as bytes so that no newline
            # translation or text decoding is applied to their content.
            with open(plot_path, 'rb') as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[key].bucket_id,
                    key=key,
                )
            )

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))
1✔
190

191

192
@with_debug_logging
def arxiv_derive_inspire_categories(obj, eng):
    """Fill ``inspire_categories`` from the arXiv categories of the record.

    Works by side effect: each arXiv category of ``obj.data`` is run through
    ``classify_field`` and every resulting term not yet present is appended
    to ``obj.data['inspire_categories']`` with ``arxiv`` as its source. The
    key is created as an empty list when missing.

    Args:
        obj (WorkflowObject): a workflow object.
        eng (WorkflowEngine): a workflow engine.

    Returns:
        None

    """
    obj.data.setdefault('inspire_categories', [])

    for category in get_arxiv_categories(obj.data):
        inspire_term = classify_field(category)
        if not inspire_term:
            # No INSPIRE term for this arXiv category.
            continue

        candidate = dict(source='arxiv', term=inspire_term)
        if candidate in obj.data['inspire_categories']:
            continue

        obj.data['inspire_categories'].append(candidate)
1✔
219

220

221
def arxiv_author_list(stylesheet="authorlist2marcxml.xsl"):
    """Build a task extracting authors from author XML in the arXiv archive.

    The returned task untars the record's ``<arxiv_id>.tar.gz`` file, looks
    for ``.xml`` files containing a ``<collaborationauthorlist>`` element,
    converts each of them with ``stylesheet`` and, when at least one author
    was extracted, replaces ``obj.data['authors']`` with the result. A missing
    or invalid tarball is logged and leaves the record untouched.

    :param stylesheet: stylesheet handed to ``convert`` for every author XML
    :return: the workflow task, a callable taking ``(obj, eng)``
    """
    @with_debug_logging
    # Give the inner task the name and docstring of the factory.
    @wraps(arxiv_author_list)
    def _author_list(obj, eng):
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        tarball = obj.files[filename]

        if not tarball:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found',
                filename,
            )
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_files_list = [path for path in file_list if path.endswith('.xml')]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            extracted_authors = []
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                # Only files holding an author list element are converted.
                if not REGEXP_AUTHLIST.search(xml_content):
                    continue

                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    _, newline, remainder = xml_content.partition('\n')
                    if not newline:
                        # Single-line content: there is no first line to
                        # skip, so surface the original syntax error.
                        raise
                    authors_xml = convert(remainder, stylesheet)

                extracted_authors.extend(marcxml2record(authors_xml).get('authors', []))

            if extracted_authors:
                for author in extracted_authors:
                    author['full_name'] = decode_latex(author['full_name'])

                obj.data['authors'] = extracted_authors

    return _author_list
4✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2024 Coveralls, Inc