• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

chrismattmann / tika-python / 24964326097

26 Apr 2026 06:49PM UTC coverage: 0.0% (-67.4%) from 67.38%
24964326097

push

github

web-flow
Merge pull request #479 from afuetterer/docs-requirements

build(deps): add dependency-groups

0 of 561 relevant lines covered (0.0%)

0.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/tika/unpack.py
1
# Licensed to the Apache Software Foundation (ASF) under one or more
2
# contributor license agreements.  See the NOTICE file distributed with
3
# this work for additional information regarding copyright ownership.
4
# The ASF licenses this file to You under the Apache License, Version 2.0
5
# (the "License"); you may not use this file except in compliance with
6
# the License.  You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
#
16

17
import csv
×
18
import tarfile
×
19
from contextlib import closing
×
20
from io import BytesIO, TextIOWrapper
×
21

22
from .tika import ServerEndpoint, callServer, parse1
×
23

24
_text_wrapper = TextIOWrapper
×
25

26

27
def from_file(filename, serverEndpoint=ServerEndpoint, requestOptions=None):
    '''
    Run a file through the 'unpack' option of parse1 and parse the tar response
    :param filename: file to parse; passed through to parse1 unchanged
    :param serverEndpoint: Tika server end point (optional)
    :param requestOptions: extra request options forwarded to parse1 (optional);
        a fresh empty dict is used when omitted
    :return: the result of _parse on the raw response - an empty dict when
        there is no response body, otherwise a dict with the keys "content",
        "metadata" and "attachments"
    '''
    # A None sentinel replaces the former ``{}`` default so that one dict
    # object is never shared between calls.
    if requestOptions is None:
        requestOptions = {}

    # Request the response as a tar archive and keep it raw so that _parse
    # can read the archive members itself.
    tarOutput = parse1('unpack', filename, serverEndpoint,
                       responseMimeType='application/x-tar',
                       services={'meta': '/meta', 'text': '/tika',
                                 'all': '/rmeta/xml', 'unpack': '/unpack/all'},
                       rawResponse=True, requestOptions=requestOptions)
    return _parse(tarOutput)
×
40

41

42
def from_buffer(string, serverEndpoint=ServerEndpoint, headers=None, requestOptions=None):
    '''
    Parse from buffered content
    :param string:  buffered content, sent as the body of the PUT request
    :param serverEndpoint: Tika server URL (Optional)
    :param headers: extra request headers (optional); the dict passed in is
        copied, never modified, and 'Accept' is always set to 'application/x-tar'
    :param requestOptions: extra request options forwarded to callServer
        (optional); a fresh empty dict is used when omitted
    :return: parsed content - the result of _parse on the raw response
    '''
    # Work on a copy: updating the caller's dict in place would leak the
    # forced 'Accept' header into whatever else the caller uses it for.
    headers = dict(headers) if headers else {}
    headers['Accept'] = 'application/x-tar'

    # A None sentinel replaces the former ``{}`` default so that one dict
    # object is never shared between calls.
    if requestOptions is None:
        requestOptions = {}

    status, response = callServer('put', serverEndpoint, '/unpack/all', string,
                                  headers, False,
                                  rawResponse=True, requestOptions=requestOptions)

    return _parse((status, response))
×
58

59

60
def _parse(tarOutput):
    '''
    Convert a raw tar response into a dict of content, metadata and attachments
    :param tarOutput: indexable response whose item at index 1 holds the tar
        archive as bytes (from_buffer passes a (status, response) pair)
    :return: an empty dict when tarOutput is falsy or its item at index 1 is
        None or empty bytes; otherwise a dict with the keys "content" (text of
        the "__TEXT__" member, "" if absent), "metadata" (dict read from the
        "__METADATA__" member, empty if absent) and "attachments" (dict
        mapping every other regular-file member name to its bytes)
    '''
    parsed = {}
    if not tarOutput or tarOutput[1] is None or tarOutput[1] == b"":
        return parsed

    with tarfile.open(fileobj=BytesIO(tarOutput[1])) as tarFile:
        # get the member names
        memberNames = list(tarFile.getnames())

        # extract the metadata
        metadata = {}
        if "__METADATA__" in memberNames:
            memberNames.remove("__METADATA__")

            # Only look the member up once we know it exists: getmember()
            # raises KeyError for a name that is not in the archive.
            metadataMember = tarFile.getmember("__METADATA__")
            if not metadataMember.issym() and metadataMember.isfile():
                with closing(_text_wrapper(tarFile.extractfile(metadataMember), encoding=tarFile.encoding)) as metadataFile:
                    metadataReader = csv.reader(_truncate_nulls(metadataFile))
                    for metadataLine in metadataReader:
                        # each metadata line comes as a key-value pair, with list values
                        # returned as extra values in the line - convert single values
                        # to non-list values to be consistent with parser metadata
                        assert len(metadataLine) >= 2

                        if len(metadataLine) > 2:
                            metadata[metadataLine[0]] = metadataLine[1:]
                        else:
                            metadata[metadataLine[0]] = metadataLine[1]

        # get the content
        content = ""
        if "__TEXT__" in memberNames:
            memberNames.remove("__TEXT__")

            contentMember = tarFile.getmember("__TEXT__")
            if not contentMember.issym() and contentMember.isfile():
                with closing(_text_wrapper(tarFile.extractfile(contentMember), encoding='utf8')) as content_file:
                    content = content_file.read()

        # get the remaining files as attachments; symlinks and anything that
        # is not a regular file are skipped
        attachments = {}
        for attachment in memberNames:
            attachmentMember = tarFile.getmember(attachment)
            if not attachmentMember.issym() and attachmentMember.isfile():
                with closing(tarFile.extractfile(attachmentMember)) as attachment_file:
                    attachments[attachment] = attachment_file.read()

        parsed["content"] = content
        parsed["metadata"] = metadata
        parsed["attachments"] = attachments

        return parsed
×
115

116

117
# TODO: Remove if/when fixed. https://issues.apache.org/jira/browse/TIKA-3070
118
def _truncate_nulls(s):
    '''
    Lazily yield every item of s with all NUL characters removed
    :param s: iterable of strings (e.g. the lines of an open text file)
    :return: generator producing each string stripped of NUL characters
    '''
    yield from (text.replace('\0', '') for text in s)
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc