12538

pending completion

Build # 12538

Build Type

Pull #3458

travis-ci

Committed by

web-flow

Commit Message

migrator view: filter out 'DELETED' invalid records

Do not return invalid records with collection 'DELETED' to the view.

Also, add a composite index on `valid` and `collection` columns of the
LegacyRecordsMirror table model and create an alembic recipe for it.

Signed-off-by: Iuliana Voinea <iuliana.voinea@student.manchester.ac.uk>

Pull Request #3458: migrator view: filter out 'DELETED' invalid records

Run Details

2 of 2 new or added lines in 2 files covered. (100.0%)

6083 of 9390 relevant lines covered (64.78%)

1.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.66

/inspirehep/modules/workflows/tasks/arxiv.py

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Tasks used in OAI harvesting for arXiv record manipulation."""

from __future__ import absolute_import, division, print_function

import os
import re
from functools import wraps

import backoff
import requests
from backports.tempfile import TemporaryDirectory
from flask import current_app
from lxml.etree import XMLSyntaxError
from timeout_decorator import timeout
from wand.exceptions import DelegateError
from werkzeug import secure_filename

from inspire_dojson import marcxml2record
from inspire_schemas.builders import LiteratureBuilder
from inspire_schemas.utils import classify_field
from plotextractor.api import process_tarball
from plotextractor.converter import untar
from plotextractor.errors import InvalidTarball, NoTexFilesFound

from inspirehep.utils.latex import decode_latex
from inspirehep.utils.record import get_arxiv_categories, get_arxiv_id
from inspirehep.utils.url import is_pdf_link, retrieve_uri
from inspirehep.modules.workflows.errors import DownloadError
from inspirehep.modules.workflows.utils import (
    convert,
    download_file_to_workflow,
    ignore_timeout_error,
    with_debug_logging,
)

# Matches a complete ``<collaborationauthorlist>`` element. DOTALL lets the
# lazy wildcards span line breaks, since the element is usually multi-line.
REGEXP_AUTHLIST = re.compile(
    "<collaborationauthorlist.*?>.*?</collaborationauthorlist>",
    flags=re.DOTALL,
)

# Captures everything between the first ``</controlfield>`` of a ``<record>``
# and the next closing ``</record>`` tag.
REGEXP_REFS = re.compile(
    "<record.*?>.*?<controlfield .*?>.*?</controlfield>(.*?)</record>",
    flags=re.DOTALL,
)

# Text searched for in the body served at a non-PDF arXiv URL to detect
# submissions for which no PDF exists at all.
NO_PDF_ON_ARXIV = (
    'The author has provided no source to generate PDF, and no PDF.'
)


@with_debug_logging
@backoff.on_exception(backoff.expo, DownloadError, base=4, max_tries=5)
def populate_arxiv_document(obj, eng):
    """Register the arXiv PDF of the record as a hidden fulltext document.

    The URLs configured under ``ARXIV_PDF_URL`` and
    ``ARXIV_PDF_URL_ALTERNATIVE`` are tried in that order. If a URL is not a
    PDF link and its content contains the ``NO_PDF_ON_ARXIV`` text, the task
    logs it and returns without touching the record.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    :raises DownloadError: when none of the configured URLs is a PDF link;
        the ``backoff`` decorator retries the task on this error.
    """
    arxiv_id = get_arxiv_id(obj.data)

    for config_key in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        pdf_url = current_app.config[config_key].format(arxiv_id=arxiv_id)
        if is_pdf_link(pdf_url):
            break

        # Not a PDF: check whether the served content says there is none.
        page_content = requests.get(pdf_url).content
        if NO_PDF_ON_ARXIV in page_content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
    else:
        # Every candidate was tried and none of them is a PDF link.
        raise DownloadError(
            "{url} is not serving a PDF file.".format(url=pdf_url)
        )

    filename = secure_filename('{0}.pdf'.format(arxiv_id))

    # Drop any document already stored under the same key so that the
    # builder below does not add a duplicate entry.
    kept_documents = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != filename:
            kept_documents.append(document)
    obj.data['documents'] = kept_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=pdf_url,
    )
    obj.data = builder.record


@with_debug_logging
def arxiv_package_download(obj, eng):
    """Download the arXiv source tarball and attach it to the workflow.

    The tarball URL is built from the ``ARXIV_TARBALL_URL`` configuration
    entry and the file is stored under the name ``<arxiv_id>.tar.gz``. The
    outcome is only logged; nothing is raised here when no tarball comes
    back from the download helper.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball_url = current_app.config['ARXIV_TARBALL_URL'].format(
        arxiv_id=arxiv_id,
    )

    tarball = download_file_to_workflow(
        workflow=obj,
        name=tarball_name,
        url=tarball_url,
    )

    if not tarball:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
        return

    obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)


@ignore_timeout_error
@timeout(5 * 60)
@with_debug_logging
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball stored in the workflow under ``<arxiv_id>.tar.gz`` is
    unpacked in a temporary directory and handed to ``process_tarball``.
    Figures already present on the record are removed (together with their
    files) before the freshly extracted plots are attached, so running the
    task again does not duplicate figures.

    Failures of the extraction itself (invalid tarball, no TeX files,
    ``DelegateError``) are logged and the task returns without modifying
    the record. The task is limited to five minutes by the ``timeout``
    decorator.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if not tarball:
        # Same reporting as the author list task: make the skip visible.
        obj.log.info(
            'Skipping plot extraction, no tarball with name "%s" found',
            filename,
        )
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        # Discard figures from a previous run, both the stored files and
        # the metadata, before adding the new ones.
        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_path = plot.get('url')
            plot_name = os.path.basename(plot_path)
            key = plot_name
            if plot_name in obj.files.keys:
                # Name already taken in the workflow files: derive a
                # distinct key from the plot's position.
                key = 'w{number}_{name}'.format(
                    number=index,
                    name=plot_name,
                )

            # Plots are image files: read them as bytes so that no newline
            # translation or text decoding is applied to their content.
            with open(plot_path, 'rb') as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[key].bucket_id,
                    key=key,
                )
            )

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))


@with_debug_logging
def arxiv_derive_inspire_categories(obj, eng):
    """Fill ``inspire_categories`` from the record's arXiv categories.

    Works by side effect on ``obj.data``: each arXiv category is passed to
    ``classify_field`` and, when that yields a term, a
    ``{'source': 'arxiv', 'term': term}`` entry is appended to
    ``inspire_categories`` unless an equal entry is already there. The key
    is created as an empty list when missing.

    Args:
        obj (WorkflowObject): a workflow object.
        eng (WorkflowEngine): a workflow engine.

    Returns:
        None

    """
    obj.data.setdefault('inspire_categories', [])
    known_categories = obj.data['inspire_categories']

    for arxiv_category in get_arxiv_categories(obj.data):
        term = classify_field(arxiv_category)
        if not term:
            # arXiv category without a corresponding INSPIRE term.
            continue

        candidate = {'source': 'arxiv', 'term': term}
        if candidate in known_categories:
            continue

        known_categories.append(candidate)


def arxiv_author_list(stylesheet="authorlist2marcxml.xsl"):
    """Extract authors from any author XML found in the arXiv archive.

    Calling this function returns the actual workflow task. The task
    inherits this docstring through ``functools.wraps``, so both levels are
    described here.

    :param stylesheet: stylesheet handed to ``convert`` for each author XML
        file (factory argument).
    :param obj: Workflow Object to process (task argument)
    :param eng: Workflow Engine processing the object (task argument)
    """
    @with_debug_logging
    @wraps(arxiv_author_list)
    def _author_list(obj, eng):
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        tarball = obj.files[filename]

        if not tarball:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found' % filename
            )
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                extracted_paths = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_paths = [
                path for path in extracted_paths if path.endswith('.xml')
            ]
            obj.log.info('Found xmlfiles: {0}'.format(xml_paths))

            authors = []
            for xml_path in xml_paths:
                with open(xml_path, 'r') as xml_fd:
                    content = xml_fd.read()

                # Only files containing a <collaborationauthorlist> element
                # are author lists; ignore every other XML file.
                if not REGEXP_AUTHLIST.findall(content):
                    continue

                obj.log.info('Found a match for author extraction')
                try:
                    marcxml = convert(content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip
                    # the first line and retry.
                    # See: inspirehep/inspire-next/issues/2195
                    without_first_line = content.split('\n', 1)[1]
                    marcxml = convert(without_first_line, stylesheet)

                authors.extend(marcxml2record(marcxml).get('authors', []))

            if not authors:
                # Nothing extracted: keep the authors already on the record.
                return

            for author in authors:
                author['full_name'] = decode_latex(author['full_name'])

            obj.data['authors'] = authors

    return _author_list

1	# -*- coding: utf-8 -*-
2	#
3	# This file is part of INSPIRE.
4	# Copyright (C) 2014-2017 CERN.
5	#
6	# INSPIRE is free software: you can redistribute it and/or modify
7	# it under the terms of the GNU General Public License as published by
8	# the Free Software Foundation, either version 3 of the License, or
9	# (at your option) any later version.
10	#
11	# INSPIRE is distributed in the hope that it will be useful,
12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	# GNU General Public License for more details.
15	#
16	# You should have received a copy of the GNU General Public License
17	# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
18	#
19	# In applying this license, CERN does not waive the privileges and immunities
20	# granted to it by virtue of its status as an Intergovernmental Organization
21	# or submit itself to any jurisdiction.
22
23	"""Tasks used in OAI harvesting for arXiv record manipulation."""	4✔
24
25	from __future__ import absolute_import, division, print_function	4✔
26
27	import os	4✔
28	import re	4✔
29	from functools import wraps	4✔
30
31	import backoff	4✔
32	import requests	4✔
33	from backports.tempfile import TemporaryDirectory	4✔
34	from flask import current_app	4✔
35	from lxml.etree import XMLSyntaxError	4✔
36	from timeout_decorator import timeout	4✔
37	from wand.exceptions import DelegateError	4✔
38	from werkzeug import secure_filename	4✔
39
40	from inspire_dojson import marcxml2record	4✔
41	from inspire_schemas.builders import LiteratureBuilder	4✔
42	from inspire_schemas.utils import classify_field	4✔
43	from plotextractor.api import process_tarball	4✔
44	from plotextractor.converter import untar	4✔
45	from plotextractor.errors import InvalidTarball, NoTexFilesFound	4✔
46
47	from inspirehep.utils.latex import decode_latex	4✔
48	from inspirehep.utils.record import get_arxiv_categories, get_arxiv_id	4✔
49	from inspirehep.utils.url import is_pdf_link, retrieve_uri	4✔
50	from inspirehep.modules.workflows.errors import DownloadError	4✔
51	from inspirehep.modules.workflows.utils import (	4✔
52	convert,
53	download_file_to_workflow,
54	ignore_timeout_error,
55	with_debug_logging,
56	)
57
58	REGEXP_AUTHLIST = re.compile(	4✔
59	"<collaborationauthorlist.*?>.*?</collaborationauthorlist>", re.DOTALL)
60	REGEXP_REFS = re.compile(	4✔
61	"<record.*?>.*?<controlfield .*?>.*?</controlfield>(.*?)</record>",
62	re.DOTALL)
63	NO_PDF_ON_ARXIV = 'The author has provided no source to generate PDF, and no PDF.'	4✔
64
65
66	@with_debug_logging	4✔
67	@backoff.on_exception(backoff.expo, DownloadError, base=4, max_tries=5)	4✔
68	def populate_arxiv_document(obj, eng):
69	arxiv_id = get_arxiv_id(obj.data)	1✔
70
71	for conf_name in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):	1✔
72	url = current_app.config[conf_name].format(arxiv_id=arxiv_id)	1✔
73	is_valid_pdf_link = is_pdf_link(url)	1✔
74	if is_valid_pdf_link:	1✔
75	break	1✔
76
77	if NO_PDF_ON_ARXIV in requests.get(url).content:	×
78	obj.log.info('No PDF is available for %s', arxiv_id)	×
79	return	×
80
81	if not is_valid_pdf_link:	1✔
82	raise DownloadError("{url} is not serving a PDF file.".format(url=url))	×
83
84	filename = secure_filename('{0}.pdf'.format(arxiv_id))	1✔
85	obj.data['documents'] = [	1✔
86	document for document in obj.data.get('documents', ())
87	if document.get('key') != filename
88	]
89
90	lb = LiteratureBuilder(source='arxiv', record=obj.data)	1✔
91	lb.add_document(	1✔
92	filename,
93	fulltext=True,
94	hidden=True,
95	material='preprint',
96	original_url=url,
97	url=url,
98	)
99	obj.data = lb.record	1✔
100
101
102	@with_debug_logging	4✔
103	def arxiv_package_download(obj, eng):
104	"""Perform the package download step for arXiv records.
105
106	:param obj: Workflow Object to process
107	:param eng: Workflow Engine processing the object
108	"""
109	arxiv_id = get_arxiv_id(obj.data)	1✔
110	filename = secure_filename('{0}.tar.gz'.format(arxiv_id))	1✔
111	tarball = download_file_to_workflow(	1✔
112	workflow=obj,
113	name=filename,
114	url=current_app.config['ARXIV_TARBALL_URL'].format(arxiv_id=arxiv_id),
115	)
116
117	if tarball:	1✔
118	obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)	1✔
119	else:
120	obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)	×
121
122
123	@ignore_timeout_error	4✔
124	@timeout(5 * 60)	4✔
125	@with_debug_logging	4✔
126	def arxiv_plot_extract(obj, eng):
127	"""Extract plots from an arXiv archive.
128
129	:param obj: Workflow Object to process
130	:param eng: Workflow Engine processing the object
131	"""
132	arxiv_id = get_arxiv_id(obj.data)	1✔
133	filename = secure_filename('{0}.tar.gz'.format(arxiv_id))	1✔
134	tarball = obj.files[filename]	1✔
135
136	if tarball:	1✔
137	with TemporaryDirectory(prefix='plot_extract') as scratch_space, \	1✔
138	retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
139	try:	1✔
140	plots = process_tarball(	1✔
141	tarball_file,
142	output_directory=scratch_space,
143	)
144	except (InvalidTarball, NoTexFilesFound):	×
145	obj.log.info(	×
146	'Invalid tarball %s for arxiv_id %s',
147	tarball.file.uri,
148	arxiv_id,
149	)
150	return	×
151	except DelegateError as err:	×
152	obj.log.error(	×
153	'Error extracting plots for %s. Report and skip.',
154	arxiv_id,
155	)
156	current_app.logger.exception(err)	×
157	return	×
158
159	if 'figures' in obj.data:	1✔
160	for figure in obj.data['figures']:	×
161	if figure['key'] in obj.files:	×
162	del obj.files[figure['key']]	×
163	del obj.data['figures']	×
164
165	lb = LiteratureBuilder(source='arxiv', record=obj.data)	1✔
166	for index, plot in enumerate(plots):	1✔
167	plot_name = os.path.basename(plot.get('url'))	1✔
168	key = plot_name	1✔
169	if plot_name in obj.files.keys:	1✔
170	key = 'w{number}_{name}'.format(	×
171	number=index,
172	name=plot_name,
173	)
174	with open(plot.get('url')) as plot_file:	1✔
175	obj.files[key] = plot_file	1✔
176
177	lb.add_figure(	1✔
178	key=key,
179	caption=''.join(plot.get('captions', [])),
180	label=plot.get('label'),
181	material='preprint',
182	url='/api/files/{bucket}/{key}'.format(
183	bucket=obj.files[key].bucket_id,
184	key=key,
185	)
186	)
187
188	obj.data = lb.record	1✔
189	obj.log.info('Added {0} plots.'.format(len(plots)))	1✔
190
191
192	@with_debug_logging	4✔
193	def arxiv_derive_inspire_categories(obj, eng):
194	"""Derive ``inspire_categories`` from the arXiv categories.
195
196	Uses side effects to populate the ``inspire_categories`` key
197	in ``obj.data`` by converting its arXiv categories.
198
199	Args:
200	obj (WorkflowObject): a workflow object.
201	eng (WorkflowEngine): a workflow engine.
202
203	Returns:
204	None
205
206	"""
207	obj.data.setdefault('inspire_categories', [])	1✔
208
209	for arxiv_category in get_arxiv_categories(obj.data):	1✔
210	term = classify_field(arxiv_category)	1✔
211	if term:	1✔
212	inspire_category = {	1✔
213	'source': 'arxiv',
214	'term': term,
215	}
216
217	if inspire_category not in obj.data['inspire_categories']:	1✔
218	obj.data['inspire_categories'].append(inspire_category)	1✔
219
220
221	def arxiv_author_list(stylesheet="authorlist2marcxml.xsl"):	4✔
222	"""Extract authors from any author XML found in the arXiv archive.
223
224	:param obj: Workflow Object to process
225	:param eng: Workflow Engine processing the object
226	"""
227	@with_debug_logging	4✔
228	@wraps(arxiv_author_list)	4✔
229	def _author_list(obj, eng):
230	arxiv_id = get_arxiv_id(obj.data)	1✔
231	filename = secure_filename('{0}.tar.gz'.format(arxiv_id))	1✔
232	tarball = obj.files[filename]	1✔
233
234	if not tarball:	1✔
235	obj.log.info(	×
236	'Skipping author list extraction, no tarball with name "%s" found' % filename
237	)
238	return	×
239
240	with TemporaryDirectory(prefix='author_list') as scratch_space, \	1✔
241	retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
242	try:	1✔
243	file_list = untar(tarball_file, scratch_space)	1✔
244	except InvalidTarball:	×
245	obj.log.info(	×
246	'Invalid tarball %s for arxiv_id %s',
247	tarball.file.uri,
248	arxiv_id,
249	)
250	return	×
251
252	obj.log.info('Extracted tarball to: {0}'.format(scratch_space))	1✔
253	xml_files_list = [path for path in file_list if path.endswith('.xml')]	1✔
254	obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))	1✔
255
256	extracted_authors = []	1✔
257	for xml_file in xml_files_list:	1✔
258	with open(xml_file, 'r') as xml_file_fd:	×
259	xml_content = xml_file_fd.read()	×
260
261	match = REGEXP_AUTHLIST.findall(xml_content)	×
262	if match:	×
263	obj.log.info('Found a match for author extraction')	×
264	try:	×
265	authors_xml = convert(xml_content, stylesheet)	×
266	except XMLSyntaxError:	×
267	# Probably the %auto-ignore comment exists, so we skip the
268	# first line. See: inspirehep/inspire-next/issues/2195
269	authors_xml = convert(	×
270	xml_content.split('\n', 1)[1],
271	stylesheet,
272	)
273
274	extracted_authors.extend(marcxml2record(authors_xml).get('authors', []))	×
275
276	if extracted_authors:	1✔
277	for author in extracted_authors:	×
278	author['full_name'] = decode_latex(author['full_name'])	×
279
280	obj.data['authors'] = extracted_authors	×
281
282	return _author_list	4✔

inspirehep / inspire-next / 12538

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous