• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

HEPData / hepdata / 16028315195

02 Jul 2025 02:44PM UTC coverage: 83.647%. Remained the same
16028315195

Pull #888

github

GraemeWatt
tests: switch from lxml to xmlschema

* Modify test_xml_validates to avoid problems with lxml v6.0.0.
* Closes #887.
Pull Request #888: Fix `test_xml_validates` by switching from `lxml` to `xmlschema`

0 of 1 new or added line in 1 file covered. (0.0%)

8 existing lines in 1 file now uncovered.

4578 of 5473 relevant lines covered (83.65%)

0.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.77
/hepdata/modules/records/api.py
1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of HEPData.
4
# Copyright (C) 2016 CERN.
5
#
6
# HEPData is free software; you can redistribute it
7
# and/or modify it under the terms of the GNU General Public License as
8
# published by the Free Software Foundation; either version 2 of the
9
# License, or (at your option) any later version.
10
#
11
# HEPData is distributed in the hope that it will be
12
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
# General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with HEPData; if not, write to the
18
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
19
# MA 02111-1307, USA.
20
#
21
# In applying this license, CERN does not
22
# waive the privileges and immunities granted to it by virtue of its status
23
# as an Intergovernmental Organization or submit itself to any jurisdiction.
24

25
"""API for HEPData-Records."""
26
import os
1✔
27
from collections import OrderedDict
1✔
28
from functools import wraps
1✔
29
import mimetypes
1✔
30
import time
1✔
31

32
from celery import shared_task
1✔
33
from flask import redirect, request, render_template, jsonify, current_app, Response, abort, flash
1✔
34
from flask_login import current_user
1✔
35
from invenio_accounts.models import User
1✔
36
from invenio_db import db
1✔
37
from sqlalchemy import and_, func
1✔
38
from sqlalchemy.orm.exc import NoResultFound
1✔
39
from werkzeug.utils import secure_filename
1✔
40

41
from hepdata.modules.converter import convert_oldhepdata_to_yaml
1✔
42
from hepdata.modules.email.api import send_cookie_email
1✔
43
from hepdata.modules.email.utils import create_send_email_task
1✔
44
from hepdata.modules.permissions.api import user_allowed_to_perform_action
1✔
45
from hepdata.modules.permissions.models import SubmissionParticipant
1✔
46
from hepdata.modules.records.subscribers.api import is_current_user_subscribed_to_record
1✔
47
from hepdata.modules.records.utils.common import decode_string, find_file_in_directory, allowed_file, \
1✔
48
    remove_file_extension, truncate_string, get_record_contents, get_record_by_id, IMAGE_TYPES
49
from hepdata.modules.records.utils.data_processing_utils import process_ctx
1✔
50
from hepdata.modules.records.utils.data_files import get_data_path_for_record, cleanup_old_files
1✔
51
from hepdata.modules.records.utils.json_ld import get_json_ld
1✔
52
from hepdata.modules.records.utils.submission import process_submission_directory, \
1✔
53
    create_data_review, cleanup_submission, clean_error_message_for_display
54
from hepdata.modules.submission.api import get_latest_hepsubmission, get_submission_participants_for_record
1✔
55
from hepdata.modules.records.utils.users import get_coordinators_in_system, has_role
1✔
56
from hepdata.modules.records.utils.workflow import update_action_for_submission_participant
1✔
57
from hepdata.modules.records.utils.yaml_utils import split_files
1✔
58
from hepdata.modules.stats.views import increment, get_count
1✔
59
from hepdata.modules.submission.models import (
1✔
60
    DataReview,
61
    DataSubmission,
62
    HEPSubmission,
63
    RecordVersionCommitMessage,
64
    RelatedRecid,
65
    RelatedTable
66
)
67
from hepdata.utils.file_extractor import extract
1✔
68
from hepdata.utils.miscellaneous import sanitize_html, get_resource_data
1✔
69
from hepdata.utils.users import get_user_from_id
1✔
70
from bs4 import BeautifulSoup
1✔
71
from hepdata_converter_ws_client import Error
1✔
72

73
import tempfile
1✔
74
import shutil
1✔
75

76
import logging
1✔
77
logging.basicConfig()
1✔
78
log = logging.getLogger(__name__)
1✔
79

80
RECORD_PLAIN_TEXT = {
1✔
81
    "passed": "passed review",
82
    "attention": "attention required",
83
    "todo": "to be reviewed"
84
}
85

86
JSON_LD_MIMETYPES = [
1✔
87
    'application/ld+json',
88
    'application/vnd.hepdata.ld+json'
89
]
90

91
def returns_json(f):
    """Decorator wrapping a view's return value in a JSON HTTP Response."""
    @wraps(f)
    def wrapper(*args, **kwargs):
        payload = f(*args, **kwargs)
        return Response(payload, content_type='application/json; charset=utf-8')
    return wrapper
98

99

100
def format_submission(recid, record, version, version_count, hepdata_submission,
                      data_table=None):
    """
    Performs all the processing of the record to be displayed.

    Builds the context dictionary used by the record templates / JSON output.

    :param recid: publication record id.
    :param record: publication record contents dict (may be None).
    :param version: requested submission version, or -1 for the latest.
    :param version_count: number of versions visible to the requesting user.
    :param hepdata_submission: HEPSubmission object; if None an empty
        context is returned.
    :param data_table: optional name of a data table to focus on
        (forwarded to format_tables).
    :return: context dictionary (empty dict when hepdata_submission is None).
    """
    ctx = {}
    if hepdata_submission is not None:

        ctx['site_url'] = current_app.config.get('SITE_URL', 'https://www.hepdata.net')
        ctx['record'] = record
        ctx["version_count"] = version_count

        if version != -1:
            ctx["version"] = version
        else:
            # we get the latest version by default
            ctx["version"] = version_count

        if record is not None:
            # Normalise a comma-separated collaborations string into a list.
            if "collaborations" in record and type(record['collaborations']) is not list:
                collaborations = [x.strip() for x in record["collaborations"].split(",")]
                ctx['record']['collaborations'] = collaborations

            authors = record.get('authors', None)

            create_breadcrumb_text(authors, ctx, record)
            get_commit_message(ctx, recid)

            if authors:
                truncate_author_list(record)

            determine_user_privileges(recid, ctx)

        else:
            # No publication record content: present an empty record and
            # only offer the upload widget.
            ctx['record'] = {}
            determine_user_privileges(recid, ctx)
            ctx['show_upload_widget'] = True
            ctx['show_review_widget'] = False

        # Count of primary reviewers/uploaders attached to this record.
        ctx['participant_count'] = SubmissionParticipant.query \
            .filter_by(publication_recid=recid, status="primary") \
            .filter(SubmissionParticipant.role.in_(["reviewer", "uploader"])) \
            .count()
        ctx['reviewers_notified'] = hepdata_submission.reviewers_notified

        ctx['record']['last_updated'] = hepdata_submission.last_updated
        ctx['record']['hepdata_doi'] = "{0}".format(hepdata_submission.doi)
        # Only append the version suffix when a DOI actually exists.
        if hepdata_submission.doi:
            ctx['record']['hepdata_doi'] += ".v{0}".format(ctx['version'])

        ctx['recid'] = recid
        ctx["status"] = hepdata_submission.overall_status
        ctx['record']['data_abstract'] = sanitize_html(decode_string(hepdata_submission.data_abstract))

        extract_journal_info(record)

        # For unfinished submissions, plain visitors (no review/upload/
        # coordinator privileges) are shown the previous version.
        if hepdata_submission.overall_status != 'finished' and ctx["version_count"] > 0:
            if not (ctx['show_review_widget']
                    or ctx['show_upload_widget']
                    or ctx['is_submission_coordinator_or_admin']):
                # we show the latest approved version.
                ctx["version"] -= 1
                ctx["version_count"] -= 1

        ctx['additional_resources'] = submission_has_resources(hepdata_submission)
        ctx['resources_with_doi'] = []
        for resource in hepdata_submission.resources:
            if resource.doi:
                ctx['resources_with_doi'].append({
                    'filename': os.path.basename(resource.file_location),
                    'description': resource.file_description,
                    'doi': resource.doi
                })

        # query for a related data submission
        data_record_query = DataSubmission.query.filter_by(
            publication_recid=recid,
            version=ctx["version"]).order_by(DataSubmission.id.asc())

        format_tables(ctx, data_record_query, data_table, recid)

        ctx['access_count'] = get_count(recid)
        ctx['mode'] = 'record'
        ctx['coordinator'] = hepdata_submission.coordinator
        ctx['coordinators'] = get_coordinators_in_system()
        # Authors were only needed for breadcrumb/truncation above;
        # drop them from the outgoing context.
        ctx['record'].pop('authors', None)

    return ctx
196

197

198
def format_tables(ctx, data_record_query, data_table, recid):
    """
    Finds all the tables related to a submission and formats
    them for display in the UI or as JSON.

    :param ctx: context dictionary, mutated in place (sets 'watched',
        'data_tables', 'table_id_to_show', 'table_name_to_show').
    :param data_record_query: SQLAlchemy query yielding the submission's
        data table records.
    :param data_table: optional table name to focus on (forwarded to
        process_data_tables).
    :param recid: publication record id.
    :return: None; results are written into ``ctx``.
    """
    first_data_id = -1
    data_table_metadata, first_data_id = process_data_tables(
        ctx, data_record_query, first_data_id, data_table)
    assign_or_create_review_status(data_table_metadata, recid, ctx["version"])
    ctx['watched'] = is_current_user_subscribed_to_record(recid)
    ctx['data_tables'] = list(data_table_metadata.values())
    ctx['table_id_to_show'] = first_data_id
    ctx['table_name_to_show'] = ''
    # NOTE: loop variables are named 'table' (not 'data_table') so they do
    # not shadow this function's 'data_table' parameter, as the original
    # lambdas did.
    matching_tables = [table for table in ctx['data_tables']
                       if table['id'] == first_data_id]
    if matching_tables:
        ctx['table_name_to_show'] = matching_tables[0]['name']
    if 'table' in request.args:
        if request.args['table']:
            table_from_args = request.args['table']
            # Check for table name in list of data tables.
            matching_tables = [table for table in ctx['data_tables']
                               if table['name'] == table_from_args]
            if not matching_tables:
                # Check for processed table name in list of data tables.
                matching_tables = [table for table in ctx['data_tables']
                                   if table['processed_name'] == table_from_args]
            if matching_tables:
                # Set table ID and name to the first matching table.
                ctx['table_id_to_show'] = matching_tables[0]['id']
                ctx['table_name_to_show'] = matching_tables[0]['name']
234

235

236
def format_resource(resource, contents, content_url):
    """
    Gets info about a resource ready to be displayed on the resource's
    landing page.

    :param resource: DataResource object to be displayed
    :param contents: resource file contents (or the marker string 'Binary')
    :param content_url: URL serving the raw resource contents
    :return: context dictionary ready for the template
    :raises ValueError: if no parent publication can be found for the resource
    """
    hepsubmission = HEPSubmission.query.filter(HEPSubmission.resources.any(id=resource.id)).first()
    if not hepsubmission:
        # Look for a DataSubmission mapping to this resource instead.
        datasubmission = DataSubmission.query.filter(DataSubmission.resources.any(id=resource.id)).first()
        if datasubmission:
            hepsubmission = HEPSubmission.query.filter_by(
                publication_recid=datasubmission.publication_recid,
                version=datasubmission.version
            ).first()
        if not hepsubmission:
            # Bug fix: the id was previously passed logging-style as a second
            # argument to ValueError, so it was never interpolated into the
            # message. Interpolate it explicitly.
            raise ValueError(
                "Unable to find publication for resource %d. (Is it a data file?)"
                % resource.id)

    record = get_record_contents(hepsubmission.publication_recid)
    ctx = format_submission(hepsubmission.publication_recid, record,
                            hepsubmission.version, 1, hepsubmission)
    ctx['record_type'] = 'resource'
    ctx['resource'] = resource
    ctx['contents'] = contents
    ctx['content_url'] = content_url
    ctx['resource_url'] = request.url
    ctx['related_publication_id'] = hepsubmission.publication_recid
    ctx['file_mimetype'] = get_resource_mimetype(resource, contents)
    ctx['resource_filename'] = os.path.basename(resource.file_location)
    ctx['resource_filetype'] = f'{resource.file_type} File'
    ctx['related_recids'] = get_record_data_list(hepsubmission, "related")
    ctx['related_to_this_recids'] = get_record_data_list(hepsubmission, "related_to_this")

    # Decide how the front end should present the resource.
    if resource.file_type in IMAGE_TYPES:
        ctx['display_type'] = 'image'
    elif resource.file_location.lower().startswith('http'):
        ctx['display_type'] = 'link'
        ctx['resource_filename'] = 'External Link'
        ctx['resource_filetype'] = 'External Link'
    elif contents == 'Binary':
        ctx['display_type'] = 'binary'
    else:
        ctx['display_type'] = 'code'

    ctx['json_ld'] = get_json_ld(
        ctx,
        hepsubmission.overall_status
    )

    return ctx
290

291

292
def get_resource_mimetype(resource, contents):
    """Guess the MIME type of a resource from its file location.

    When the type cannot be guessed, fall back to
    'application/octet-stream' for binary contents and 'text/plain'
    otherwise.

    :param resource: object with a ``file_location`` attribute.
    :param contents: resource contents (the string 'Binary' marks binary data).
    :return: MIME type string.
    """
    guessed, _ = mimetypes.guess_type(resource.file_location)
    if guessed is not None:
        return guessed
    return 'application/octet-stream' if contents == 'Binary' else 'text/plain'
300

301

302
def should_send_json_ld(request):
    """Determine whether to send JSON-LD instead of HTML for this request.

    :param type request: flask.Request object
    :return: True if request accepts JSON-LD; False otherwise
    :rtype: bool
    """
    # Generator expression instead of a throwaway list: any() can
    # short-circuit on the first accepted JSON-LD mimetype.
    return any(request.accept_mimetypes.quality(m) >= 1 for m in JSON_LD_MIMETYPES)
312

313

314
def get_commit_message(ctx, recid):
    """
    Stores a commit message for the current version in ``ctx`` if present.
    Uses the highest ID of a version-recid pairing (the most recent message).

    :param ctx: context dictionary, mutated in place (may set
        'revision_message').
    :param recid: publication record id.
    """
    try:
        # Select the most recent commit (greatest ID). A single first()
        # replaces the previous count()-then-first() pair, which issued
        # two queries for the same result.
        commit_message = RecordVersionCommitMessage.query \
            .filter_by(version=ctx["version"], recid=recid) \
            .order_by(RecordVersionCommitMessage.id.desc()) \
            .first()

        if commit_message is not None:
            ctx["revision_message"] = {
                'version': commit_message.version,
                'message': commit_message.message}

    except NoResultFound:
        # Kept for safety/compatibility with the original behavior.
        pass
336

337

338
def create_breadcrumb_text(authors, ctx, record):
    """Creates the breadcrumb text for a submission.

    Prefers the record's explicit ``first_author`` full name, falling back
    to the first entry of ``authors``; appends " et al." when there is more
    than one author.

    :param authors: list of author dicts, or None.
    :param ctx: context dict, mutated in place (may set 'breadcrumb_text').
    :param record: publication record dict.
    """
    if "first_author" in record and 'full_name' in record["first_author"] \
            and record["first_author"]["full_name"] is not None:
        ctx['breadcrumb_text'] = record["first_author"]["full_name"]
    elif authors and authors[0] and 'full_name' in authors[0] \
            and authors[0]["full_name"] is not None:
        ctx['breadcrumb_text'] = authors[0]["full_name"]

    # Only append when a name was actually found above: the original
    # unconditionally did `ctx['breadcrumb_text'] += ...`, raising KeyError
    # when neither branch had set the key.
    if authors is not None and len(authors) > 1 and 'breadcrumb_text' in ctx:
        ctx['breadcrumb_text'] += " et al."
349

350

351
def submission_has_resources(hepsubmission):
    """
    Returns whether the submission has any resources attached.

    :param hepsubmission: HEPSubmission object
    :return: bool
    """
    # An empty resource collection is falsy, so bool() gives the same
    # answer as comparing len() against zero.
    return bool(hepsubmission.resources)
359

360

361
def extract_journal_info(record):
    """Set ``record['journal_info']`` based on the record type (in place).

    Thesis records get "<dissertation type>, <institution>", or "PhD Thesis"
    when no dissertation type is present; conference papers get
    "Conference Paper". Other record types are left untouched.

    :param record: publication record dict (may be None or lack 'type').
    """
    if record and 'type' in record:
        if 'thesis' in record['type']:
            # Robustness: tolerate a missing 'dissertation' key instead of
            # raising KeyError (the original indexed record['dissertation']
            # unconditionally).
            dissertation = record.get('dissertation', {})
            if 'type' in dissertation:
                record['journal_info'] = dissertation['type'] + ", " + dissertation[
                    'institution']
            else:
                record['journal_info'] = "PhD Thesis"
        elif 'conference paper' in record['type']:
            record['journal_info'] = "Conference Paper"
371

372

373
def render_record(recid, record, version, output_format, light_mode=False):
    """Render a publication or data record in the requested output format.

    :param recid: record id being requested (a publication record, or a
        data record whose parent publication is then resolved).
    :param record: record contents dict (may be None).
    :param version: requested version, or -1 for the latest allowed version.
    :param output_format: 'html', 'json', 'json_ld', or a converter format
        (e.g. one starting with 'yoda') that is handled via redirects to
        the download endpoints.
    :param light_mode: forwarded to process_ctx for JSON output.
    :return: a Flask response (rendered template, JSON, or redirect);
        aborts with 403 on permission failure or 404 when nothing matches.
    """

    # Count number of all versions and number of finished versions of a publication record.
    version_count_all = HEPSubmission.query.filter(HEPSubmission.publication_recid == recid,
                                                   and_(HEPSubmission.overall_status != 'sandbox',
                                                        HEPSubmission.overall_status != 'sandbox_processing')).count()
    version_count_finished = HEPSubmission.query.filter_by(publication_recid=recid, overall_status='finished').count()

    # Number of versions that a user is allowed to access based on their permissions.
    version_count = version_count_all if user_allowed_to_perform_action(recid) else version_count_finished

    # If version not given explicitly, take to be latest allowed version (or 1 if there are no allowed versions).
    if version == -1:
        version = version_count if version_count else 1

    # Check for a user trying to access a version of a publication record where they don't have permissions.
    if version_count < version_count_all and version == version_count_all:
        # Prompt the user to login if they are not authenticated then redirect, otherwise return a 403 error.
        if not current_user.is_authenticated:
            # Build a percent-encoded 'next' URL preserving version/format
            # (and optional table/rivet args) for after login.
            redirect_url_after_login = '%2Frecord%2F{0}%3Fversion%3D{1}%26format%3D{2}'.format(recid, version, output_format)
            if 'table' in request.args:
                redirect_url_after_login += '%26table%3D{0}'.format(request.args['table'])
            if output_format.startswith('yoda') and 'rivet' in request.args:
                redirect_url_after_login += '%26rivet%3D{0}'.format(request.args['rivet'])
            return redirect('/login/?next={0}'.format(redirect_url_after_login))
        else:
            abort(403)

    hepdata_submission = get_latest_hepsubmission(publication_recid=recid, version=version)

    if hepdata_submission is not None:
        if hepdata_submission.overall_status == 'processing':
            # Submission upload still being processed: show a holding page.
            ctx = {'recid': recid}
            determine_user_privileges(recid, ctx)
            return render_template('hepdata_records/publication_processing.html', ctx=ctx)

        elif not hepdata_submission.overall_status.startswith('sandbox'):
            ctx = format_submission(recid, record, version, version_count, hepdata_submission)
            ctx['record_type'] = 'publication'
            ctx['related_recids'] = get_record_data_list(hepdata_submission, "related")
            ctx['related_to_this_recids'] = get_record_data_list(hepdata_submission, "related_to_this")

            # Bump the record's access counter.
            increment(recid)

            if output_format == 'html' or output_format == 'json_ld':
                ctx['json_ld'] = get_json_ld(
                    ctx,
                    hepdata_submission.overall_status
                )

                if output_format == 'json_ld':
                    status_code = 404 if 'error' in ctx['json_ld'] else 200
                    return jsonify(ctx['json_ld']), status_code

                if output_format == 'html':
                    return render_template('hepdata_records/publication_record.html', ctx=ctx)

            elif 'table' not in request.args:
                # Whole-submission output in a converter format (or raw JSON).
                if output_format == 'json':
                    ctx = process_ctx(ctx, light_mode)
                    return jsonify(ctx)
                elif output_format.startswith('yoda') and 'rivet' in request.args:
                    return redirect('/download/submission/{0}/{1}/{2}/{3}'.format(recid, version, output_format,
                                                                              request.args['rivet']))
                else:
                    return redirect('/download/submission/{0}/{1}/{2}'.format(recid, version, output_format))
            else:
                # Single-table download; prefer the INSPIRE id when available.
                file_identifier = 'ins{}'.format(hepdata_submission.inspire_id) if hepdata_submission.inspire_id else recid
                if output_format.startswith('yoda') and 'rivet' in request.args:
                    return redirect('/download/table/{0}/{1}/{2}/{3}/{4}'.format(
                        file_identifier, request.args['table'].replace('%', '%25').replace('\\', '%5C'), version, output_format,
                        request.args['rivet']))
                else:
                    return redirect('/download/table/{0}/{1}/{2}/{3}'.format(
                        file_identifier, request.args['table'].replace('%', '%25').replace('\\', '%5C'), version, output_format))
        else:
            # Sandbox submissions are not served from this endpoint.
            abort(404)

    elif record is not None:  # this happens when we access an id of a data record
        # in which case, we find the related publication, and
        # make the front end focus on the relevant data table.
        try:
            publication_recid = int(record['related_publication'])
            publication_record = get_record_contents(publication_recid)

            datasubmission = DataSubmission.query.filter_by(associated_recid=recid).one()
            hepdata_submission = get_latest_hepsubmission(publication_recid=publication_recid,
                                                          version=datasubmission.version)

            ctx = format_submission(publication_recid, publication_record,
                                    datasubmission.version, 1, hepdata_submission,
                                    data_table=record['title'])
            ctx['record_type'] = 'table'
            ctx['related_publication_id'] = publication_recid
            ctx['table_name'] = record['title']
            ctx['related_recids'] = get_record_data_list(hepdata_submission, "related")
            ctx['related_to_this_recids'] = get_record_data_list(hepdata_submission, "related_to_this")

            if output_format == 'html' or output_format == 'json_ld':
                ctx['json_ld'] = get_json_ld(
                    ctx,
                    hepdata_submission.overall_status,
                    datasubmission
                )

                if output_format == 'json_ld':
                    status_code = 404 if 'error' in ctx['json_ld'] else 200
                    return jsonify(ctx['json_ld']), status_code

                return render_template('hepdata_records/related_record.html', ctx=ctx)

            elif output_format.startswith('yoda') and 'rivet' in request.args:
                return redirect('/download/table/{0}/{1}/{2}/{3}/{4}'.format(
                    publication_recid, ctx['table_name'].replace('%', '%25').replace('\\', '%5C'), datasubmission.version, output_format,
                    request.args['rivet']))
            else:
                return redirect('/download/table/{0}/{1}/{2}/{3}'.format(
                    publication_recid, ctx['table_name'].replace('%', '%25').replace('\\', '%5C'), datasubmission.version, output_format))

        except Exception as e:
            # NOTE(review): any failure resolving the parent publication is
            # collapsed into a 404 — the original exception is discarded.
            abort(404)
    else:
        abort(404)
496

497

498
def has_upload_permissions(recid, user, is_sandbox=False):
    """
    Returns whether ``user`` may upload to the given record.

    Admins may always upload; for sandbox records the submission
    coordinator may upload; otherwise the user must be a primary uploader
    participant of the record.

    :param recid: publication record id.
    :param user: user object to check.
    :param is_sandbox: whether the record is a sandbox record.
    :return: bool
    """
    if has_role(user, 'admin'):
        return True

    if is_sandbox:
        hepsubmission_record = get_latest_hepsubmission(publication_recid=recid, overall_status='sandbox')
        return hepsubmission_record is not None and hepsubmission_record.coordinator == user.id

    participant = SubmissionParticipant.query.filter_by(user_account=user.id,
        role='uploader', publication_recid=recid, status='primary').first()
    # Explicit bool: the original fell off the end and returned None
    # (rather than False) when no matching participant was found.
    return participant is not None
510

511
def has_coordinator_permissions(recid, user, is_sandbox=False):
    """
    Returns whether ``user`` is an admin or coordinates a submission
    of the given record.

    :param recid: publication record id.
    :param user: user object to check.
    :param is_sandbox: unused; kept for interface compatibility.
    :return: bool
    """
    if has_role(user, 'admin'):
        return True

    matching_submission = HEPSubmission.query.filter_by(
        publication_recid=recid,
        coordinator=user.get_id()).first()
    return matching_submission is not None
519

520

521
def create_new_version(recid, user, notify_uploader=True, uploader_message=None):
    """Reopen a finished submission by creating a new HEPSubmission version.

    :param recid: publication record id.
    :param user: requesting user (not used directly here).
    :param notify_uploader: whether to email primary uploaders a new cookie.
    :param uploader_message: optional message included in the email.
    :return: JSON response with the new version on success, or a 400
        response when the submission is not in the 'finished' state.
    """
    hepsubmission = get_latest_hepsubmission(publication_recid=recid)

    # Only finished submissions can be reopened for a new version.
    if hepsubmission.overall_status != 'finished':
        return jsonify({"message": f"Rec id {recid} is not finished so cannot create a new version"}), 400

    # Reopen the submission to allow for revisions,
    # by creating a new HEPSubmission object.
    reopened_submission = HEPSubmission(publication_recid=recid,
                                        overall_status='todo',
                                        inspire_id=hepsubmission.inspire_id,
                                        coordinator=hepsubmission.coordinator,
                                        version=hepsubmission.version + 1)
    db.session.add(reopened_submission)
    db.session.commit()

    if notify_uploader:
        primary_uploaders = SubmissionParticipant.query.filter_by(
            role='uploader', publication_recid=recid, status='primary'
            )
        record_information = get_record_by_id(recid)
        for participant in primary_uploaders:
            send_cookie_email(participant,
                              record_information,
                              message=uploader_message,
                              version=reopened_submission.version)

    return jsonify({'success': True, 'version': reopened_submission.version})
549

550

551
def process_payload(recid, file, redirect_url, synchronous=False):
    """Process an uploaded file

    :param recid: int
        The id of the record to update
    :param file: file
        The file to process
    :param redirect_url: string
        Redirect URL to record, for use if the upload fails or in synchronous mode
    :param synchronous: bool
        Whether to process asynchronously via celery (default) or immediately (only recommended for tests)
    :return: JSONResponse either containing 'url' (for success cases) or
             'message' (for error cases, which will give a 400 error).
    """

    if file and (allowed_file(file.filename)):
        file_path = save_zip_file(file, recid)
        file_size = os.path.getsize(file_path)
        # Reject oversized uploads with 413 (Payload Too Large).
        UPLOAD_MAX_SIZE = current_app.config.get('UPLOAD_MAX_SIZE', 52000000)
        if file_size > UPLOAD_MAX_SIZE:
            return jsonify({"message":
                "{} too large ({} bytes > {} bytes)".format(
                    file.filename, file_size, UPLOAD_MAX_SIZE)}), 413

        hepsubmission = get_latest_hepsubmission(publication_recid=recid)

        if hepsubmission.overall_status == 'finished':
            # If it is finished and we receive an update,
            # then we need to reopen the submission to allow for revisions,
            # by creating a new HEPSubmission object.
            _rev_hepsubmission = HEPSubmission(publication_recid=recid,
                                               overall_status='todo',
                                               inspire_id=hepsubmission.inspire_id,
                                               coordinator=hepsubmission.coordinator,
                                               version=hepsubmission.version + 1)
            db.session.add(_rev_hepsubmission)
            hepsubmission = _rev_hepsubmission

        # Mark the submission as processing (sandbox records get their own
        # processing state) and persist before dispatching the work.
        previous_status = hepsubmission.overall_status
        hepsubmission.overall_status = 'sandbox_processing' if previous_status == 'sandbox' else 'processing'
        db.session.add(hepsubmission)
        db.session.commit()

        if synchronous:
            process_saved_file(file_path, recid, current_user.get_id(), redirect_url, previous_status)
        else:
            # Dispatch to celery; the uploader is emailed when done.
            process_saved_file.delay(file_path, recid, current_user.get_id(), redirect_url, previous_status)
            flash('File saved. You will receive an email when the file has been processed.', 'info')

        return jsonify({'url': redirect_url.format(recid)})
    else:
        return jsonify({"message": "You must upload a .zip, .tar, .tar.gz or .tgz file" +
                        " (or a .oldhepdata or single .yaml or .yaml.gz file)."}), 400
604

605

606
@shared_task
def process_saved_file(file_path, recid, userid, redirect_url, previous_status):
    """
    Validate and import a previously saved upload for a record, then email
    the uploader about the outcome. Callers either invoke this directly
    (synchronously) or via ``.delay()`` as a Celery task.

    :param file_path: path of the saved archive/YAML file to process.
    :param recid: publication record id of the submission.
    :param userid: id of the user who uploaded the file.
    :param redirect_url: URL template, formatted with *recid* for email links.
    :param previous_status: overall status to restore once processing ends.
    """
    try:
        hepsubmission = get_latest_hepsubmission(publication_recid=recid)
        # The caller sets the status to '(sandbox_)processing' before
        # dispatching this task; anything else means we were invoked out of
        # order (or twice), so bail out rather than touch the submission.
        if hepsubmission.overall_status != 'processing' and hepsubmission.overall_status != 'sandbox_processing':
            log.error('Record {} is not in a processing state.'.format(recid))
            return

        errors = process_zip_archive(file_path, recid)

        uploader = User.query.get(userid)
        site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net')

        # Prefer the participant's registered full name for the email
        # greeting; fall back to the uploader's email address.
        submission_participant = SubmissionParticipant.query.filter_by(
            publication_recid=recid, user_account=userid, role='uploader').first()
        if submission_participant:
            full_name = submission_participant.full_name
        else:
            full_name = uploader.email

        if errors:
            cleanup_submission(recid, hepsubmission.version, [])  # delete all tables if errors
            message_body = render_template('hepdata_theme/email/upload_errors.html',
                                           name=full_name,
                                           article=recid,
                                           redirect_url=redirect_url.format(recid),
                                           errors=errors,
                                           site_url=site_url)

            create_send_email_task(uploader.email,
                                   '[HEPData] Submission {0} upload failed'.format(recid),
                                   message_body)
        else:
            update_action_for_submission_participant(recid, userid, 'uploader')
            message_body = render_template('hepdata_theme/email/upload_complete.html',
                                           name=full_name,
                                           article=recid,
                                           link=redirect_url.format(recid),
                                           site_url=site_url,
                                           overall_status=hepsubmission.overall_status)

            create_send_email_task(uploader.email,
                                   '[HEPData] Submission {0} upload succeeded'.format(recid),
                                   message_body)

        # Reset the status of the submission back to the previous value.
        hepsubmission.overall_status = previous_status
        db.session.add(hepsubmission)
        db.session.commit()

        # Delete any previous upload folders relating to non-final versions
        # of this hepsubmission
        cleanup_old_files(hepsubmission)

    except Exception as e:
        # Reset the status and send error emails, unless we're working
        # asynchronously and celery is about to retry
        # NOTE(review): request.id is presumably None for a direct
        # (synchronous) call, so that path always reports immediately —
        # confirm against the Celery task-request docs.
        if not process_saved_file.request.id \
                or process_saved_file.request.retries >= process_saved_file.max_retries:
            try:
                cleanup_submission(recid, hepsubmission.version, [])
                errors = {
                    "Unexpected error": [{
                        "level": "error",
                        "message": "An unexpected error occurred: {}".format(e)
                    }]
                }
                uploader = User.query.get(userid)
                site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net')
                message_body = render_template('hepdata_theme/email/upload_errors.html',
                                               name=uploader.email,
                                               article=recid,
                                               redirect_url=redirect_url.format(recid),
                                               errors=errors,
                                               site_url=site_url)

                create_send_email_task(uploader.email,
                                       '[HEPData] Submission {0} upload failed'.format(recid),
                                       message_body)
                log.error("Final attempt of process_saved_file for recid %s failed. Resetting to previous status." % recid)

                # Reset the status of the submission back to the previous value.
                hepsubmission.overall_status = previous_status
                db.session.add(hepsubmission)
                db.session.commit()

            except Exception as ex:
                # Cleanup itself failed; log and swallow so we don't mask
                # the original error with a retry storm.
                log.error("Exception while cleaning up: %s" % ex)

        else:
            log.debug("Celery will retry task, attempt %s" % process_saved_file.request.retries)
            raise e
def save_zip_file(file, id):
    """
    Save an uploaded submission file into the record's data directory.

    :param file: uploaded file object (must provide ``filename`` and
        ``save``, e.g. a werkzeug ``FileStorage``).
    :param id: record id used to build the destination path.
    :return: full path of the saved file.
    """
    filename = secure_filename(file.filename)
    # Timestamped directory so repeated uploads for the same record
    # don't overwrite each other.
    time_stamp = str(int(round(time.time())))
    file_save_directory = get_data_path_for_record(str(id), time_stamp)

    # .oldhepdata files go into their own subdirectory so the converter
    # can locate them later.
    if filename.endswith('.oldhepdata'):
        file_save_directory = os.path.join(file_save_directory, 'oldhepdata')

    # exist_ok avoids the race between the previous exists() check and
    # directory creation.
    os.makedirs(file_save_directory, exist_ok=True)
    file_path = os.path.join(file_save_directory, filename)

    # Use the module logger instead of print for consistency with the
    # rest of this module.
    log.info('Saving file to {}'.format(file_path))
    file.save(file_path)
    return file_path
def process_zip_archive(file_path, id, old_schema=False):
    """
    Unpack and process an uploaded submission file.

    Handles four upload forms: a ``.yaml.gz`` file (decompressed then
    reprocessed), a single ``.yaml`` file (split into a submission
    directory), a zip/tar archive (extracted), or a ``.oldhepdata`` file
    (converted to YAML).

    :param file_path: path of the saved upload.
    :param id: publication record id.
    :param old_schema: passed through to ``process_submission_directory``.
    :return: a dict of errors keyed by category on failure, otherwise the
        result of ``process_submission_directory``.
    :raises ValueError: if an archive cannot be extracted (so Celery can retry).
    """
    (file_save_directory, filename) = os.path.split(file_path)

    if not filename.endswith('.oldhepdata'):
        file_save_directory = os.path.dirname(file_path)
        submission_path = os.path.join(file_save_directory, remove_file_extension(filename))
        submission_temp_path = tempfile.mkdtemp(dir=current_app.config["CFG_TMPDIR"])

        if filename.endswith('.yaml.gz'):
            print('Extracting: {} to {}'.format(file_path, file_path[:-3]))
            if not extract(file_path, file_path[:-3]):
                message = clean_error_message_for_display(
                    "{} is not a valid .gz file.".format(file_path),
                    file_save_directory
                )
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message": message
                    }]
                }
            # Recurse to process the decompressed single .yaml file.
            # NOTE(review): old_schema is deliberately not propagated here;
            # presumably single-YAML uploads always use the current schema —
            # confirm before changing.
            return process_zip_archive(file_path[:-3], id,
                                       old_schema=False)
        elif filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, last_updated = split_files(file_path, submission_temp_path)
            if error:
                message = clean_error_message_for_display(
                    str(error),
                    file_save_directory
                )
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": message
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            try:
                unzipped_path = extract(file_path, submission_temp_path)
            except Exception as e:
                # Log the exception and raise it so that celery can retry
                log.exception(f"Unable to extract file {file_path}")
                message = clean_error_message_for_display(
                    "Unable to extract file {}. Please check the file is a valid zip or tar archive file and try again later. Contact info@hepdata.net if problems persist.".format(file_path),
                    file_save_directory
                )
                raise ValueError(message) from e

            if not unzipped_path:
                message = clean_error_message_for_display(
                    "{} is not a valid zip or tar archive file.".format(file_path),
                    file_save_directory
                )
                return {
                    "Archive file extractor": [{
                        "level": "error", "message": message
                    }]
                }

        copy_errors = move_files(submission_temp_path, submission_path)
        if copy_errors:
            return copy_errors

        submission_found = find_file_in_directory(submission_path, lambda x: x == "submission.yaml")

        if not submission_found:
            return {
                "Archive file extractor": [{
                    "level": "error", "message": "No submission.yaml file has been found in the archive."
                }]
            }

        basepath, submission_file_path = submission_found

    else:
        # .oldhepdata upload: convert to YAML first.
        file_dir = os.path.dirname(file_save_directory)
        time_stamp = os.path.split(file_dir)[1]
        result = check_and_convert_from_oldhepdata(os.path.dirname(file_save_directory), id, time_stamp)

        # Check for errors: the converter returns an error dict on failure,
        # otherwise a (basepath, submission_file_path) tuple.
        # isinstance is the idiomatic type check (was `type(result) == dict`).
        if isinstance(result, dict):
            return result
        else:
            basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id,
                                        old_schema=old_schema)
def check_and_convert_from_oldhepdata(input_directory, id, timestamp):
    """
    Check if the input directory contains a .oldhepdata file
    and convert it to YAML if it happens.

    :param input_directory: directory searched for a ``.oldhepdata`` file.
    :param id: record id used to build the converted output path.
    :param timestamp: timestamp component of the output path.
    :return: a dict of errors on failure, otherwise the
        (directory, path) tuple from ``find_file_in_directory`` for the
        converted ``submission.yaml``.
    """
    converted_path = get_data_path_for_record(str(id), timestamp, 'yaml')

    oldhepdata_found = find_file_in_directory(
        input_directory,
        lambda x: x.endswith('.oldhepdata'),
    )
    if not oldhepdata_found:
        return {
            "Converter": [{
                "level": "error",
                "message": "No file with .oldhepdata extension has been found."
            }]
        }

    converted_temp_dir = tempfile.mkdtemp(dir=current_app.config["CFG_TMPDIR"])
    converted_temp_path = os.path.join(converted_temp_dir, 'yaml')

    try:
        successful = convert_oldhepdata_to_yaml(oldhepdata_found[1], converted_temp_path)
        if not successful:
            # Parse error message from title of HTML file, removing part of string after final "//".
            # Use a context manager so the file handle is closed promptly
            # (the previous bare open() leaked the handle).
            with open(converted_temp_path) as error_file:
                soup = BeautifulSoup(error_file, "html.parser")
            errormsg = soup.title.string.rsplit("//", 1)[0]

    except Error as error:  # hepdata_converter_ws_client.Error
        successful = False
        errormsg = str(error)

    if not successful:
        shutil.rmtree(converted_temp_dir, ignore_errors=True)  # can uncomment when this is definitely working

        return {
            "Converter": [{
                "level": "error",
                "message": "The conversion from oldhepdata "
                           "to the YAML format has not succeeded. "
                           "Error message from converter follows:<br/><br/>" + errormsg
            }]
        }
    else:
        copy_errors = move_files(converted_temp_path, converted_path)
        if copy_errors:
            return copy_errors

    return find_file_in_directory(converted_path, lambda x: x == "submission.yaml")
def move_files(submission_temp_path, submission_path):
    """
    Replace the contents of ``submission_path`` with those of
    ``submission_temp_path``. The temporary directory is always removed
    afterwards, whether the copy succeeded or not.

    :param submission_temp_path: source directory (consumed by this call).
    :param submission_path: destination directory (recreated from scratch).
    :return: ``None`` on success, otherwise a dict describing copy errors.
    """
    print('Copying files from {} to {}'.format(submission_temp_path + '/.', submission_path))
    temp_prefix = submission_temp_path + '/'
    dest_prefix = submission_path + '/'
    try:
        # Wipe any previous destination before copying the new tree across.
        shutil.rmtree(submission_path, ignore_errors=True)
        shutil.copytree(submission_temp_path, submission_path, symlinks=False)
    except shutil.Error as copy_error:
        # copytree aggregates per-file failures as (src, dst, reason)
        # triples; report each one with full filesystem paths stripped so
        # they are never leaked to the user.
        failures = [
            {
                "level": "error",
                "message": 'Invalid file {}: {}'.format(
                    src.replace(temp_prefix, ''),
                    str(reason).replace(temp_prefix, '').replace(dest_prefix, ''))
            }
            for src, _dst, reason in copy_error.args[0]
        ]
        return {
            "Exceptions when copying files": failures
        }
    except Exception as unexpected:
        # Any other failure: strip paths from the message and report it.
        sanitised = str(unexpected).replace(temp_prefix, '').replace(dest_prefix, '')
        return {
            "Exceptions when copying files": [{
                "level": "error",
                "message": sanitised
            }]
        }
    finally:
        shutil.rmtree(submission_temp_path, ignore_errors=True)
def query_messages_for_data_review(data_review_record, messages):
    """
    Append the review messages of *data_review_record* to *messages*,
    newest first, and return the same list.

    :param data_review_record: DataReview object whose messages are read.
    :param messages: list mutated in place with message dicts.
    :return: the *messages* list, for convenience.
    """
    review_messages = data_review_record.messages
    if review_messages:
        # Highest id first; note this sorts the stored list in place.
        review_messages.sort(key=lambda msg: msg.id, reverse=True)
        for msg in review_messages:
            author = get_user_from_id(msg.user)
            messages.append({
                "message": msg.message,
                "user": author.email,
                "post_time": msg.creation_date,
            })
    return messages
def assign_or_create_review_status(data_table_metadata, publication_recid,
                                   version):
    """
    If a review already exists, it will be attached to the current data record.
    If a review does not exist for a data table, it will be created.

    :param data_table_metadata: the metadata describing the main table.
    :param publication_recid: publication record id
    :param version: submission version whose reviews are assigned/created.
    """
    data_review_query = DataReview.query.filter_by(
        publication_recid=publication_recid, version=version)
    # this method should also create all the DataReviews for data_tables that
    # are not currently present to avoid
    # only creating data reviews when the review is clicked explicitly.
    assigned_tables = []
    if data_review_query.count() > 0:
        data_review_records = data_review_query.all()

        for data_review in data_review_records:
            # Only annotate tables still present in the metadata dict;
            # reviews for tables no longer listed are skipped.
            if data_review.data_recid in data_table_metadata:
                data_table_metadata[data_review.data_recid][
                    "review_flag"] = data_review.status
                data_table_metadata[data_review.data_recid]["review_status"] = \
                    RECORD_PLAIN_TEXT[data_review.status]
                # Boolean flag: does this review have any messages?
                data_table_metadata[data_review.data_recid]["messages"] = len(
                    data_review.messages) > 0
                assigned_tables.append(data_review.data_recid)

    # now create the missing data reviews
    for data_table_id in data_table_metadata:
        if data_table_id not in assigned_tables:
            data_record = create_data_review(
                data_table_id, publication_recid, version=version)
            data_table_metadata[data_table_id][
                "review_flag"] = data_record.status
            data_table_metadata[data_table_id]["review_status"] = \
                RECORD_PLAIN_TEXT[data_record.status]
def determine_user_privileges(recid, ctx):
    """
    Populate *ctx* with flags describing what the current user may do with
    record *recid*: review widget, upload widget, coordinator and admin
    rights. All flags default to False for anonymous users.

    :param recid: publication record id.
    :param ctx: dict mutated in place with the privilege flags.
    """
    # show_review_area = not show_upload_area
    ctx['show_review_widget'] = False
    ctx['show_upload_widget'] = False
    ctx['is_submission_coordinator_or_admin'] = False
    ctx['is_admin'] = False

    if current_user.is_authenticated:
        user_id = current_user.get_id()
        participant_records = get_submission_participants_for_record(recid, user_account=user_id)

        for participant_record in participant_records:
            if participant_record is not None:
                # Only 'primary' participants get the respective widgets.
                if participant_record.role == 'reviewer' and participant_record.status == 'primary':
                    ctx['show_review_widget'] = True

                if participant_record.role == 'uploader' and participant_record.status == 'primary':
                    ctx['show_upload_widget'] = True

        user = User.query.get(current_user.get_id())
        if has_role(user, 'admin'):
            ctx['is_submission_coordinator_or_admin'] = True
            ctx['is_admin'] = True
        else:
            # Not an admin: check whether this user coordinates any
            # submission of this record.
            matching_records = HEPSubmission.query.filter_by(
                publication_recid=recid,
                coordinator=current_user.get_id()).count()

            if matching_records > 0:
                ctx['is_submission_coordinator_or_admin'] = True

        # Coordinators and admins may always upload.
        ctx['show_upload_widget'] = (
            ctx['show_upload_widget'] or ctx[
                'is_submission_coordinator_or_admin'])
def process_data_tables(ctx, data_record_query, first_data_id,
                        data_table=None):
    """
    Build the metadata dict for a record's data tables and decide whether
    the upload area should be shown instead.

    :param ctx: template context dict; ``show_upload_area`` is set here and
        ``show_upload_widget`` is read from it.
    :param data_record_query: query yielding the record's table submissions.
    :param first_data_id: id of the table to select initially; -1 means
        "pick the first table found".
    :param data_table: optional table name; if it matches a table, that
        table becomes the selected one.
    :return: (data_table_metadata, first_data_id) tuple.
    """
    data_table_metadata = OrderedDict()
    ctx['show_upload_area'] = False

    # Evaluate the count once: the original called .count() in both
    # branches of the condition, issuing two COUNT queries.
    record_count = data_record_query.count()

    if ctx['show_upload_widget'] and record_count == 0:
        ctx['show_upload_area'] = True
    elif record_count > 0:
        record_submissions = data_record_query.all()
        for submission_record in record_submissions:
            # Whitespace-free name used for anchors/identifiers.
            processed_name = "".join(submission_record.name.split())
            data_table_metadata[submission_record.id] = {
                "id": submission_record.id, "processed_name": processed_name,
                "name": submission_record.name,
                "location": submission_record.location_in_publication,
                # Generate resource metadata
                "resources": get_resource_data(submission_record),
                "doi": submission_record.doi,
                "description": sanitize_html(
                    truncate_string(submission_record.description, 20),
                    tags={},
                    strip=True
                )
            }

            if first_data_id == -1:
                first_data_id = submission_record.id

            # An explicitly requested table name overrides the default.
            if data_table:
                if submission_record.name == data_table:
                    first_data_id = submission_record.id

    return data_table_metadata, first_data_id
def truncate_author_list(record, length=10):
    """Trim the record's author list in place to at most *length* entries.

    :param record: dict with an ``authors`` list.
    :param length: maximum number of authors to keep (default 10).
    """
    full_author_list = record['authors']
    record['authors'] = full_author_list[:length]
def get_all_ids(index=None, id_field='recid', last_updated=None, latest_first=False):
    """Get all record or inspire ids of publications in the search index

    :param index: name of index to use.
    :param id_field: id type to return. Should be 'recid' or 'inspire_id'
    :param last_updated: if given, only ids updated at/after this time.
    :param latest_first: sort by last_updated descending instead of recid.
    :return: list of integer ids
    """
    if id_field not in ('recid', 'inspire_id'):
        raise ValueError('Invalid ID field %s' % id_field)

    db_col = (HEPSubmission.publication_recid
              if id_field == 'recid'
              else HEPSubmission.inspire_id)

    # Only finished (published) submissions are considered.
    query = db.session.query(db_col) \
        .filter(HEPSubmission.overall_status == 'finished')

    if last_updated:
        query = query.filter(HEPSubmission.last_updated >= last_updated)

    if not latest_first:
        query = query.order_by(HEPSubmission.publication_recid).distinct()
        return [int(row[0]) for row in query.all()]

    # Ordering by last_updated means SQL DISTINCT would apply across both
    # columns, so deduplicate in Python while preserving the sort order.
    query = query.order_by(HEPSubmission.last_updated.desc())
    ordered_unique = dict.fromkeys(row[0] for row in query.all())
    return [int(value) for value in ordered_unique]
def get_related_hepsubmissions(submission):
    """
    Queries the database for all HEPSubmission objects contained in
    this object's related record ID list.
    (All submissions this one is relating to)

    :param submission: HEPSubmission whose related record ids are resolved.
    :return: [list] A list of HEPSubmission objects
    """
    # Resolve each related record id, dropping ids with no submission.
    candidates = (
        get_latest_hepsubmission(publication_recid=link.related_recid)
        for link in submission.related_recids
    )
    return [candidate for candidate in candidates if candidate]
def get_related_to_this_hepsubmissions(submission):
    """
    Queries the database for all records in the RelatedRecId table
    that have THIS record's id as a related record.
    Then returns the HEPSubmission object marked in the RelatedRecid table.
    Returns only submissions marked as 'finished'

    :param submission: HEPSubmission whose inbound relations are looked up.
    :return: [list] List containing related records.
    """

    # We use a subquery to get the max version/recid pairing
    subquery = (
        HEPSubmission.query
        .with_entities(
            HEPSubmission.publication_recid,
            func.max(HEPSubmission.version).label('max_version')
        )
        .group_by(HEPSubmission.publication_recid)
        .subquery()
    )

    # Use result of subquery to join and select the max submission where related
    related_submissions = (
        HEPSubmission.query
        .join(subquery, (HEPSubmission.publication_recid == subquery.c.publication_recid) & (
                HEPSubmission.version == subquery.c.max_version))
        .join(RelatedRecid, RelatedRecid.this_recid == HEPSubmission.publication_recid)
        .filter(RelatedRecid.related_recid == submission.publication_recid)
        .all()
    )

    # Set comprehension to determine unique IDs where the max version object is 'finished'
    unique_recids = {sub.publication_recid for sub in related_submissions if sub.overall_status == 'finished'}

    # Re-fetch via get_latest_hepsubmission so callers receive the latest
    # finished submission object for each unique record id.
    return [get_latest_hepsubmission(publication_recid=recid, overall_status='finished') for recid in unique_recids]
def get_related_datasubmissions(data_submission):
    """
    Queries the database for all DataSubmission objects contained in
    this object's related DOI list.
    (All submissions this one is relating to)

    :param data_submission: The datasubmission object to find related data for.
    :return: [list] A list of DataSubmission objects
    """
    matches = []
    for link in data_submission.related_tables:
        # Look up the first data submission carrying the related DOI.
        match = (
            DataSubmission.query
            .filter(DataSubmission.doi == link.related_doi)
            .join(HEPSubmission, HEPSubmission.publication_recid == DataSubmission.publication_recid)
            .first()
        )
        # DOIs that don't resolve to a submission are skipped.
        if match:
            matches.append(match)
    return matches
def get_related_to_this_datasubmissions(data_submission):
    """
        Get the DataSubmission Objects with a RelatedTable entry
        where this doi is referred to in related_doi.
        Only returns where associated HEPSubmission object is `finished`,
        OR where it is within the same HEPSubmission

        :param data_submission: The datasubmission to find the related entries for.
        :return: [List] List of DataSubmission objects.
    """
    related_submissions = (
        DataSubmission.query
        .join(RelatedTable, RelatedTable.table_doi == DataSubmission.doi)
        .join(HEPSubmission, (HEPSubmission.publication_recid == DataSubmission.publication_recid))
        # Keep only data submissions belonging to the latest version of
        # their publication record.
        .group_by(DataSubmission.id)
        .having(func.max(HEPSubmission.version) == DataSubmission.version)
        .filter(RelatedTable.related_doi == data_submission.doi)
        # If finished, OR is part of the same submission
        .filter(
            (HEPSubmission.overall_status == 'finished') | (
                HEPSubmission.publication_recid == data_submission.publication_recid))
        .all()
    )
    return related_submissions
def get_record_data_list(record, data_type):
    """
    Generates a dictionary (title/recid) from a list of record IDs.
    This must be done as the record contents are not stored within the
    hepsubmission object.

    :param record: The record used for the query.
    :param data_type: Either the related, or related to this data.
    :return: [list] A list of dictionary objects containing record ID and title pairs
    """
    # Selects the related data based on the data_type flag; anything else
    # yields an empty list.
    if data_type == "related":
        submissions = get_related_hepsubmissions(record)
    elif data_type == "related_to_this":
        submissions = get_related_to_this_hepsubmissions(record)
    else:
        submissions = []

    return [
        {
            "recid": sub.publication_recid,
            "title": get_record_contents(sub.publication_recid)["title"],
            "version": sub.version,
        }
        for sub in submissions
    ]
def get_table_data_list(table, data_type):
    """
    Generates a list of general information (name, doi, desc) dictionaries of related DataSubmission objects.
    Will either use the related data list (get_related_data_submissions)
    OR the `related to this` list (generated by get_related_to_this_datasubmissions)

    :param table: The DataSubmission object used for querying.
    :param data_type: The flag to decide which relation data to use.
    :return: [list] A list of dictionaries with the name, doi and description of the object.
    """
    # Selects the related data based on the data_type flag.
    # Initialise to [] so an unrecognised data_type returns an empty list
    # instead of raising NameError (matches get_record_data_list).
    data = []
    if data_type == "related":
        data = get_related_datasubmissions(table)
    elif data_type == "related_to_this":
        data = get_related_to_this_datasubmissions(table)

    record_data = []
    if data:
        for datum in data:
            record_data.append({
                "name": datum.name,
                "doi": datum.doi,
                "description": datum.description
            })
    return record_data
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc