17425918180

Committed 03 Sep 2025 07:11AM UTC coverage: 95.796% (-0.6%) from 96.378%

Build # 17425918180

Build Type

push

github

Committed by

PascalRepond

Commit Message

translations: extract messages

Co-Authored-by: Pascal Repond <pascal.repond@rero.ch>

Run Details

7816 of 8159 relevant lines covered (95.8%)

0.96 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.55

/sonar/modules/pdf_extractor/pdf_extractor.py

# Swiss Open Access Repository
# Copyright (C) 2021 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""PDF extractor class."""

import os
import tempfile
import xml.etree.ElementTree as ET
from io import StringIO

import requests
import xmltodict
from flask import current_app


class PDFExtractor:
    """Client for a Grobid server that extracts header metadata from PDFs.

    The server location is read from the Flask application configuration
    (``PDF_EXTRACTOR_GROBID_SERVER`` and ``PDF_EXTRACTOR_GROBID_PORT``) when
    the object is created, and the service is probed immediately.
    """

    # Base URL of the Grobid REST API; set per instance by ``_load_config``.
    api_url = ""

    def __init__(self):
        """Init PDF extractor.

        :raises ConnectionRefusedError: if the Grobid service does not answer.
        """
        self._load_config()

    def _load_config(self):
        """Load configuration from the Flask app and check the service.

        :raises ConnectionRefusedError: if the Grobid service does not answer.
        """
        server = current_app.config.get("PDF_EXTRACTOR_GROBID_SERVER", "localhost")
        port = current_app.config.get("PDF_EXTRACTOR_GROBID_PORT", "8070")
        self.api_url = f"http://{server}:{port}/api"
        if not self.api_is_alive():
            raise ConnectionRefusedError

    def api_is_alive(self):
        """Test if api is up.

        :returns: (bool) Whether the Grobid service is up or not.
        """
        try:
            response, status = self.do_request("isalive", "get")
        except Exception:
            # Deliberate best effort: any failure to reach the service
            # (connection error, bad URL, ...) simply means "not alive".
            return False

        if status != 200:
            return False

        # An empty body is treated as "not alive" as well.
        return bool(response)

    def do_request(self, endpoint, request_type="get", files=None):
        """Do request on Grobid api.

        :param endpoint: (str) Endpoint of API to query
        :param request_type: (str) Request type (``get`` or ``post``,
            case-insensitive)
        :param files: (dict) files to post (Multipart-encoded files), only
            used for ``post`` requests
        :returns: (tuple) ``(body, status_code)``; the body is the raw bytes
            for ``get`` requests and the decoded text for ``post`` requests
        :raises ValueError: if ``request_type`` is neither ``get`` nor ``post``
        """
        url = f"{self.api_url}/{endpoint}"
        method = request_type.lower()

        # NOTE(review): no timeout is passed to requests, so an unresponsive
        # server blocks the caller indefinitely.
        if method == "get":
            response = requests.get(url)
            return response.content, response.status_code

        if method == "post":
            headers = {"Accept": "application/xml"}
            response = requests.post(url, headers=headers, files=files)
            return response.text, response.status_code

        raise ValueError(f"Unsupported request type: {request_type!r}")

    def process(self, input_file, output_file=None, dict_output=True):
        """Process metadata extraction from file.

        :param input_file: (str) Path to PDF file.
        :param output_file: (str) Output file where to dump extraction.
        :param dict_output: (bool) Extraction will be returned as a dict
            instead of the raw TEI XML string.
        :returns: (str|dict|None) Metadata extraction; ``None`` when
            ``output_file`` is given, in which case the XML is written there.
        """
        output = self.extract_metadata(input_file)

        # Dump xml output into given file. The encoding is explicit so the
        # result does not depend on the locale of the running process.
        if output_file:
            with open(output_file, "w", encoding="utf-8") as file:
                file.write(output)
            return None

        # Return output as xml
        if not dict_output:
            return output

        # Transform xml to dictionary
        return self.parse_tei_xml(output)

    def process_raw(self, pdf_content, output_file=None, dict_output=True):
        """Metadata extraction from raw content.

        :param pdf_content: (bytes) PDF content.
        :param output_file: (str) Output file where to dump extraction.
        :param dict_output: (bool) Extraction will be returned as a dict
            instead of the raw TEI XML string.
        :returns: (str|dict|None) Metadata extraction, see ``process``.
        """
        with tempfile.NamedTemporaryFile(mode="w+b", suffix=".pdf") as temp:
            temp.write(pdf_content)
            # The file is re-opened by name further down; without a flush the
            # buffered bytes may not be on disk yet and the reader would see
            # an empty or truncated PDF.
            temp.flush()

            return self.process(temp.name, output_file=output_file, dict_output=dict_output)

    def extract_metadata(self, file):
        """Process metadata extraction.

        :param file: (str) Path to PDF file.
        :returns: (str) Extraction metadata as TEI XML
        :raises ValueError: if the path is not an existing file or does not
            end with ``.pdf``.
        :raises Exception: if the service answers with a non-200 status.
        """
        if not os.path.isfile(file):
            raise ValueError("Input file does not exist")

        if not file.lower().endswith(".pdf"):
            raise ValueError("Input file is not a valid PDF file")

        with open(file, "rb") as f:
            response, status = self.do_request(
                "processHeaderDocument",
                "post",
                files={
                    "input": (file, f, "application/pdf"),
                    "consolidateHeader": 1,
                },
            )

        if status != 200:
            raise Exception("Metadata extraction failed")

        return response

    @staticmethod
    def parse_tei_xml(xml):
        """Parse TEI XML content into a dictionary.

        :param xml: (str) TEI XML document.
        :returns: (dict) Content of the root ``TEI`` element.
        """
        # Strip the "{namespace}" prefix from every tag so the resulting
        # dictionary keys are plain element names.
        iterator = ET.iterparse(StringIO(xml))
        for _, element in iterator:
            if "}" in element.tag:
                element.tag = element.tag.split("}", 1)[1]
        root = iterator.root

        # parse xml
        result = xmltodict.parse(ET.tostring(root, encoding="unicode"))
        return result["TEI"]

1	# Swiss Open Access Repository
2	# Copyright (C) 2021 RERO
3	#
4	# This program is free software: you can redistribute it and/or modify
5	# it under the terms of the GNU Affero General Public License as published by
6	# the Free Software Foundation, version 3 of the License.
7	#
8	# This program is distributed in the hope that it will be useful,
9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11	# GNU Affero General Public License for more details.
12	#
13	# You should have received a copy of the GNU Affero General Public License
14	# along with this program. If not, see <http://www.gnu.org/licenses/>.
15
16	"""PDF extractor class."""
17
18	import os	1✔
19	import tempfile	1✔
20	import xml.etree.ElementTree as ET	1✔
21	from io import StringIO	1✔
22
23	import requests	1✔
24	import xmltodict	1✔
25	from flask import current_app	1✔
26
27
28	class PDFExtractor:	1✔
29	"""PDF extractor class."""
30
31	api_url = ""	1✔
32
33	def __init__(self):	1✔
34	"""Init PDF extractor."""
35	self._load_config()	1✔
36
37	def _load_config(self):	1✔
38	"""Load configuration from extension."""
39	server = current_app.config.get("PDF_EXTRACTOR_GROBID_SERVER", "localhost")	1✔
40	port = current_app.config.get("PDF_EXTRACTOR_GROBID_PORT", "8070")	1✔
41	self.api_url = f"http://{server}:{port}/api"	1✔
42	if not self.api_is_alive():	1✔
43	raise ConnectionRefusedError	1✔
44
45	def api_is_alive(self):	1✔
46	"""Test if api is up.
47
48	:returns: (bool) Return whether grobid service is up or not
49	"""
50	try:	1✔
51	response, status = self.do_request("isalive", "get")	1✔
52	except Exception:	1✔
53	return False	1✔
54
55	if status != 200:	1✔
56	return False	1✔
57
58	return bool(response)	1✔
59
60	def do_request(self, endpoint, request_type="get", files=None):	1✔
61	"""Do request on Grobid api.
62
63	:param endpoint: (str) Endpoint of API to query
64	:param request: (str) Request type (get or post)
65	:param files: (dict) files to post (Multipart-encoded files)
66	:returns: (tuple) Tuple containing response text and status
67	"""
68	url = f"{self.api_url}/{endpoint}"	1✔
69
70	if request_type.lower() not in ["get", "post"]:	1✔
71	raise ValueError	1✔
72
73	if request_type.lower() == "get":	1✔
74	response = requests.get(url)	1✔
75	return response.content, response.status_code	1✔
76
77	if request_type.lower() == "post":	1✔
78	headers = {"Accept": "application/xml"}	1✔
79	response = requests.post(url, headers=headers, files=files)	1✔
80	return response.text, response.status_code	1✔
81	return None	×
82
83	def process(self, input_file, output_file=None, dict_output=True):	1✔
84	"""Process metadata extraction from file.
85
86	:param input_file: (str) Path to PDF file.
87	:param output_file: (str) Output file where to dump extraction.
88	:param dict_output: (bool) Extraction will be formatted in JSON.
89	:returns: (str\|dict\|None) Metadata extraction, if output file is not
90	None, data will be put into file
91	"""
92	output = self.extract_metadata(input_file)	1✔
93
94	# Dump xml output into given file
95	if output_file:	1✔
96	with open(output_file, "w") as file:	1✔
97	file.write(output)	1✔
98	return None	1✔
99
100	# Return output as xml
101	if not dict_output:	1✔
102	return output	1✔
103
104	# Transform xml to dictionary
105	return self.parse_tei_xml(output)	1✔
106
107	def process_raw(self, pdf_content, output_file=None, dict_output=True):	1✔
108	"""Metadata extraction from raw content.
109
110	:param pdf_content: (str) PDF content.
111	:param output_file: (str) Output file where to dump extraction.
112	:param dict_output: (bool) Extraction will be formatted in JSON.
113	:returns: (str\|json) Metadata extraction
114	"""
115	with tempfile.NamedTemporaryFile(mode="w+b", suffix=".pdf") as temp:	1✔
116	temp.write(pdf_content)	1✔
117
118	return self.process(temp.name, output_file=output_file, dict_output=dict_output)	1✔
119
120	def extract_metadata(self, file):	1✔
121	"""Process metadata extraction.
122
123	:param file: (str) Path to PDF file.
124	:returns: (str) Extraction metadata as TEI XML
125	"""
126	if not os.path.isfile(file):	1✔
127	raise ValueError("Input file does not exist")	1✔
128
129	if not file.lower().endswith(".pdf"):	1✔
130	raise ValueError("Input file is not a valid PDF file")	1✔
131
132	with open(file, "rb") as f:	1✔
133	response, status = self.do_request(	1✔
134	"processHeaderDocument",
135	"post",
136	files={
137	"input": (file, f, "application/pdf"),
138	"consolidateHeader": 1,
139	},
140	)
141
142	if status != 200:	1✔
143	raise Exception("Metadata extraction failed")	1✔
144
145	return response	1✔
146
147	@staticmethod	1✔
148	def parse_tei_xml(xml):	1✔
149	"""Parse xml content."""
150	iterator = ET.iterparse(StringIO(xml))	1✔
151	for _, element in iterator:	1✔
152	if "}" in element.tag:	1✔
153	element.tag = element.tag.split("}", 1)[1]	1✔
154	root = iterator.root	1✔
155
156	# parse xml
157	result = xmltodict.parse(ET.tostring(root, encoding="unicode"))	1✔
158	return result["TEI"]	1✔

rero / sonar / 17425918180

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous