24964326097

Committed 26 Apr 2026 06:49PM UTC coverage: 0.0% (-67.4%) from 67.38%

Build # 24964326097

Build Type

push

github

Committed by

web-flow

Commit Message

Merge pull request #479 from afuetterer/docs-requirements

build(deps): add dependency-groups

Coverage Stats

0 of 561 relevant lines covered (0.0%)

0.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/tika/pdf.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from io import StringIO

from bs4 import BeautifulSoup

from tika import parser


def text_from_pdf_pages(filename):
    """Extract the text of a PDF, returning one string per page.

    The file is first parsed with ``xmlContent=True`` so that the result
    is XHTML markup. Each ``<div class="page">`` element found in that
    markup is then passed back through ``parser.from_buffer`` and the
    resulting ``'content'`` value is collected.

    Args:
        filename: Forwarded unchanged to ``parser.from_file``.

    Returns:
        A list of whitespace-stripped strings, one per
        ``<div class="page">`` element, in document order. The list is
        empty when the initial parse yields no content; a page whose
        re-parse yields no content contributes an empty string so that
        list positions still line up with page order.
    """
    # Read PDF file
    data = parser.from_file(filename, xmlContent=True)
    xhtml = data.get('content')
    if not xhtml:
        # Nothing to split into pages (missing, None or empty content).
        return []

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns that results may differ
    # between environments. 'html.parser' ships with the standard library.
    xhtml_data = BeautifulSoup(xhtml, 'html.parser')

    pages_txt = []
    for page in xhtml_data.find_all('div', attrs={'class': 'page'}):
        # Parse the page markup using TIKA (xml/html) to get its text
        parsed_content = parser.from_buffer(str(page))

        # Add pages; fall back to '' so a content-less page does not raise
        text = parsed_content.get('content') or ''
        pages_txt.append(text.strip())

    return pages_txt

1	# Licensed to the Apache Software Foundation (ASF) under one or more
2	# contributor license agreements. See the NOTICE file distributed with
3	# this work for additional information regarding copyright ownership.
4	# The ASF licenses this file to You under the Apache License, Version 2.0
5	# (the "License"); you may not use this file except in compliance with
6	# the License. You may obtain a copy of the License at
7	#
8	# http://www.apache.org/licenses/LICENSE-2.0
9	#
10	# Unless required by applicable law or agreed to in writing, software
11	# distributed under the License is distributed on an "AS IS" BASIS,
12	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	# See the License for the specific language governing permissions and
14	# limitations under the License.
15	#
16
17	from io import StringIO	×
18
19	from bs4 import BeautifulSoup	×
20
21	from tika import parser	×
22
23
24	def text_from_pdf_pages(filename):	×
25	pages_txt = []	×
26
27	# Read PDF file
28	data = parser.from_file(filename, xmlContent=True)	×
29	xhtml_data = BeautifulSoup(data['content'])	×
30	for i, content in enumerate(xhtml_data.find_all('div', attrs={'class': 'page'})):	×
31	# Parse PDF data using TIKA (xml/html)
32	# It's faster and safer to create a new buffer than truncating it
33	# https://stackoverflow.com/questions/4330812/how-do-i-clear-a-stringio-object
34	_buffer = StringIO()	×
35	_buffer.write(str(content))	×
36	parsed_content = parser.from_buffer(_buffer.getvalue())	×
37
38	# Add pages
39	text = parsed_content['content'].strip()	×
40	pages_txt.append(text)	×
41
42	return pages_txt	×

chrismattmann / tika-python / 24964326097

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous