• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

rero / sonar / 17425918180

03 Sep 2025 07:11AM UTC coverage: 95.796% (-0.6%) from 96.378%
17425918180

push

github

PascalRepond
translations: extract messages

Co-Authored-by: Pascal Repond <pascal.repond@rero.ch>

7816 of 8159 relevant lines covered (95.8%)

0.96 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.55
/sonar/modules/pdf_extractor/pdf_extractor.py
1
# Swiss Open Access Repository
2
# Copyright (C) 2021 RERO
3
#
4
# This program is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU Affero General Public License as published by
6
# the Free Software Foundation, version 3 of the License.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU Affero General Public License for more details.
12
#
13
# You should have received a copy of the GNU Affero General Public License
14
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
15

16
"""PDF extractor class."""
17

18
import os
1✔
19
import tempfile
1✔
20
import xml.etree.ElementTree as ET
1✔
21
from io import StringIO
1✔
22

23
import requests
1✔
24
import xmltodict
1✔
25
from flask import current_app
1✔
26

27

28
class PDFExtractor:
    """Extract metadata from PDF files through a Grobid HTTP API."""

    # Base URL of the Grobid API; set by `_load_config` at construction time.
    api_url = ""

    def __init__(self):
        """Init PDF extractor.

        :raises ConnectionRefusedError: if the Grobid service is not alive.
        """
        self._load_config()

    def _load_config(self):
        """Load configuration from the Flask application and check the API.

        Reads `PDF_EXTRACTOR_GROBID_SERVER` (default ``localhost``) and
        `PDF_EXTRACTOR_GROBID_PORT` (default ``8070``) to build `api_url`.

        :raises ConnectionRefusedError: if the Grobid service is not alive.
        """
        server = current_app.config.get("PDF_EXTRACTOR_GROBID_SERVER", "localhost")
        port = current_app.config.get("PDF_EXTRACTOR_GROBID_PORT", "8070")
        self.api_url = f"http://{server}:{port}/api"
        if not self.api_is_alive():
            raise ConnectionRefusedError(
                f"Grobid service is not reachable at {self.api_url}"
            )

    def api_is_alive(self):
        """Test if api is up.

        :returns: (bool) Return whether grobid service is up or not
        """
        try:
            response, status = self.do_request("isalive", "get")
        except Exception:
            # Deliberate best effort: any failure while probing the service
            # is reported as "not alive" rather than propagated.
            return False

        if status != 200:
            return False

        return bool(response)

    def do_request(self, endpoint, request_type="get", files=None):
        """Do request on Grobid api.

        :param endpoint: (str) Endpoint of API to query
        :param request_type: (str) Request type (get or post),
            case-insensitive
        :param files: (dict) files to post (Multipart-encoded files), only
            used for post requests
        :returns: (tuple) Tuple containing the response body and the HTTP
            status code. The body is raw bytes for a get request and decoded
            text for a post request.
        :raises ValueError: if the request type is neither get nor post.
        """
        method = request_type.lower()
        if method not in ("get", "post"):
            raise ValueError(f"Unsupported request type: {request_type!r}")

        url = f"{self.api_url}/{endpoint}"

        if method == "get":
            response = requests.get(url)
            return response.content, response.status_code

        # Only "post" can reach this point.
        headers = {"Accept": "application/xml"}
        response = requests.post(url, headers=headers, files=files)
        return response.text, response.status_code

    def process(self, input_file, output_file=None, dict_output=True):
        """Process metadata extraction from file.

        :param input_file: (str) Path to PDF file.
        :param output_file: (str) Output file where to dump extraction.
        :param dict_output: (bool) If true, the extraction is returned as a
            dictionary, otherwise as the raw TEI XML string.
        :returns: (str|dict|None) Metadata extraction. If output file is not
            None, data is written into that file and None is returned.
        """
        output = self.extract_metadata(input_file)

        # Dump xml output into given file
        if output_file:
            # Explicit encoding so the dump does not depend on the locale.
            with open(output_file, "w", encoding="utf-8") as file:
                file.write(output)
            return None

        # Return output as xml
        if not dict_output:
            return output

        # Transform xml to dictionary
        return self.parse_tei_xml(output)

    def process_raw(self, pdf_content, output_file=None, dict_output=True):
        """Metadata extraction from raw content.

        :param pdf_content: (bytes) PDF content.
        :param output_file: (str) Output file where to dump extraction.
        :param dict_output: (bool) If true, the extraction is returned as a
            dictionary, otherwise as the raw TEI XML string.
        :returns: (str|dict|None) Metadata extraction, see `process`.
        """
        with tempfile.NamedTemporaryFile(mode="w+b", suffix=".pdf") as temp:
            temp.write(pdf_content)
            # The file is re-opened by name further down, so the buffered
            # bytes must be pushed to disk before anybody reads it.
            temp.flush()

            return self.process(
                temp.name, output_file=output_file, dict_output=dict_output
            )

    def extract_metadata(self, file):
        """Process metadata extraction.

        :param file: (str) Path to PDF file.
        :returns: (str) Extraction metadata as TEI XML
        :raises ValueError: if the file does not exist or its name does not
            end with ``.pdf``.
        :raises Exception: if the API does not answer with a 200 status.
        """
        if not os.path.isfile(file):
            raise ValueError("Input file does not exist")

        if not file.lower().endswith(".pdf"):
            raise ValueError("Input file is not a valid PDF file")

        with open(file, "rb") as f:
            # NOTE(review): "consolidateHeader" is sent through `files`, i.e.
            # as a multipart file part rather than a plain form field.
            # Confirm this is what the Grobid endpoint expects.
            response, status = self.do_request(
                "processHeaderDocument",
                "post",
                files={
                    "input": (file, f, "application/pdf"),
                    "consolidateHeader": 1,
                },
            )

        if status != 200:
            raise Exception("Metadata extraction failed")

        return response

    @staticmethod
    def parse_tei_xml(xml):
        """Parse TEI XML content into a dictionary.

        XML namespaces are stripped from every tag so the resulting keys are
        plain element names.

        :param xml: (str) TEI XML document.
        :returns: Content of the ``TEI`` root element as produced by
            xmltodict.
        """
        iterator = ET.iterparse(StringIO(xml))
        for _, element in iterator:
            # Namespaced tags look like "{namespace}name"; keep only "name".
            if "}" in element.tag:
                element.tag = element.tag.split("}", 1)[1]
        # `root` is only available once the iterator has been exhausted.
        root = iterator.root

        # parse xml
        result = xmltodict.parse(ET.tostring(root, encoding="unicode"))
        return result["TEI"]
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc