• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In
Recached

chrismattmann / tika-python / 24964326097

26 Apr 2026 06:49PM UTC coverage: 0.0% (-67.4%) from 67.38%
24964326097

push

github

web-flow
Merge pull request #479 from afuetterer/docs-requirements

build(deps): add dependency-groups

0 of 561 relevant lines covered (0.0%)

0.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/tika/parser.py
1
# Licensed to the Apache Software Foundation (ASF) under one or more
2
# contributor license agreements.  See the NOTICE file distributed with
3
# this work for additional information regarding copyright ownership.
4
# The ASF licenses this file to You under the Apache License, Version 2.0
5
# (the "License"); you may not use this file except in compliance with
6
# the License.  You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
#
16

17
import json
×
18

19
from .tika import ServerEndpoint, callServer, parse1
×
20

21

22
def from_file(filename, serverEndpoint=ServerEndpoint, service='all', xmlContent=False, headers=None, config_path=None, requestOptions={}, raw_response=False):
    '''
    Sends a file to the Tika server and returns its metadata and/or content
    :param filename: path to file which needs to be parsed or binary file using open(path,'rb')
    :param serverEndpoint: Server endpoint url
    :param service: service requested from the tika server
                    Default is 'all', which results in recursive text content+metadata.
                    'meta' returns only metadata
                    'text' returns only content
    :param xmlContent: Whether or not XML content be requested.
                    Default is 'False', which results in text content.
    :param headers: Request headers to be sent to the tika REST server, should
                    be a dictionary. This is optional
    :param config_path: forwarded unchanged to parse1. This is optional
    :param requestOptions: forwarded unchanged to parse1, should be a
                    dictionary. This is optional
    :param raw_response: when True the output of parse1 is returned as is,
                    without being post-processed by _parse
    :return: the result of _parse (dictionary having 'metadata' and 'content'
            keys), or the untouched parse1 output when raw_response is True.
    '''
    # Keyword arguments shared by both the plain-text and the XML request.
    call_kwargs = {
        'headers': headers,
        'config_path': config_path,
        'requestOptions': requestOptions,
    }
    if xmlContent:
        # Only the XML variant overrides the service -> endpoint mapping;
        # otherwise parse1 falls back to its own default mapping.
        call_kwargs['services'] = {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'}

    output = parse1(service, filename, serverEndpoint, **call_kwargs)
    return output if raw_response else _parse(output, service)
47

48

49
def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None, requestOptions={}, raw_response=False):
    '''
    Parses the content from buffer
    :param string: Buffer value
    :param serverEndpoint: Server endpoint. This is optional
    :param xmlContent: Whether or not XML content be requested.
                    Default is 'False', which results in text content.
    :param headers: Request headers to be sent to the tika REST server, should
                    be a dictionary. This is optional. The dictionary passed
                    in is not modified.
    :param config_path: forwarded unchanged to callServer. This is optional
    :param requestOptions: forwarded unchanged to callServer, should be a
                    dictionary. This is optional
    :param raw_response: when True the (status, response) tuple obtained from
                    callServer is returned without being post-processed
    :return: the result of _parse (dictionary having 'metadata' and 'content'
            keys), or the (status, response) tuple when raw_response is True.
    '''
    # Work on a copy: updating the caller's dict in place would leak the
    # 'Accept' header into whatever else the caller uses that dict for.
    headers = dict(headers or {})
    headers['Accept'] = 'application/json'

    # Both variants hit the recursive-metadata endpoint; only the content
    # flavour (plain text vs. XML) differs.
    endpoint_path = '/rmeta/xml' if xmlContent else '/rmeta/text'
    status, response = callServer('put', serverEndpoint, endpoint_path, string, headers, False, config_path=config_path, requestOptions=requestOptions)

    if raw_response:
        return (status, response)
    return _parse((status, response))
72

73
def _parse(output, service='all'):
    '''
    Parses response from Tika REST API server
    :param output: output from Tika Server, indexed as output[0] (status) and
                    output[1] (response body). A falsy value yields the empty
                    result without a 'status' key.
    :param service: service requested from the tika server
                    Default is 'all', which results in recursive text content+metadata.
                    'meta' returns only metadata
                    'text' returns only content
    :return: a dictionary having 'metadata' and 'content' values, plus a
            'status' value whenever output is not empty. For 'all', a metadata
            key occurring in several documents is collected into a list.
    '''
    parsed = {'metadata': None, 'content': None}
    if not output:
        return parsed

    parsed["status"] = output[0]
    body = output[1]
    if body is None or body == "":
        return parsed

    if service == "text":
        # Text responses are the content itself, not JSON.
        parsed["content"] = body
        return parsed

    realJson = json.loads(body)

    if service == "meta":
        # The meta response is a single flat JSON object.
        parsed["metadata"] = {key: realJson[key] for key in realJson}
        return parsed

    # 'all': the response is a list of per-document objects; concatenate the
    # content of every document that carries one.
    content = "".join(
        js["X-TIKA:content"] for js in realJson if "X-TIKA:content" in js
    )
    # No content at all is reported as None rather than an empty string.
    parsed["content"] = content or None

    metadata = {}
    for js in realJson:
        for name in js:
            if name == "X-TIKA:content":
                continue
            if name not in metadata:
                metadata[name] = js[name]
                continue
            # Repeated key: promote the stored value to a list on the first
            # collision, then accumulate the further values.
            if not isinstance(metadata[name], list):
                metadata[name] = [metadata[name]]
            metadata[name].append(js[name])
    parsed["metadata"] = metadata

    return parsed
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc