• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tcalmant / python-javaobj / 8591112367

07 Apr 2024 07:18PM UTC coverage: 78.701%. First build
8591112367

push

github

tcalmant
Added 3.11 & 3.12 to GitHub actions

1611 of 2047 relevant lines covered (78.7%)

4.71 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

63.33
/javaobj/modifiedutf8.py
1
#!/usr/bin/python
2
# -- Content-Encoding: utf-8 --
3
"""
Implements the support of the Java-specific kind of UTF-8 encoding.

This module is a modified version of ``py2jdbc.mutf8`` provided by
`@guywithface <https://github.com/guywithface>`_.

The project the original file comes from is available at:
https://github.com/swstephe/py2jdbc/

:authors: Scott Stephens (@swstephe), @guywithface
:license: Apache License 2.0
:version: 0.4.4
:status: Alpha
"""
17

18
from __future__ import unicode_literals
6✔
19

20
import sys
6✔
21

22

23
# Module version, as a tuple and as the usual dotted string
__version_info__ = (0, 4, 4)
__version__ = ".".join(str(x) for x in __version_info__)

# Documentation strings format
__docformat__ = "restructuredtext en"

# Encoding name: not cesu-8, which uses a different zero-byte
NAME = "mutf8"
32

33
# ------------------------------------------------------------------------------
34

35
if sys.version_info[0] >= 3:
    # Python 3: chr() already covers the whole Unicode range
    unicode_char = chr  # pylint:disable=C0103

    def byte_to_int(data):
        # type: (Union[int, bytes, bytearray]) -> int
        """
        Converts the first byte of the given data to an integer

        :param data: An integer (returned as is) or a non-empty ``bytes`` or
                     ``bytearray`` object
        :return: The value of the first byte, as an integer
        :raises ValueError: Unsupported input type, or empty buffer
        """
        if isinstance(data, int):
            return data

        if isinstance(data, (bytes, bytearray)):
            if not data:
                # Report an empty buffer like any other invalid input
                # instead of leaking an IndexError from the indexing below
                raise ValueError(
                    "Expected byte or int as input, got: empty {0}".format(
                        type(data).__name__
                    )
                )

            # Indexing bytes/bytearray yields an integer in Python 3
            return data[0]

        raise ValueError(
            "Expected byte or int as input, got: {0}".format(
                type(data).__name__
            )
        )


else:
    # Python 2: unichr() is the unicode counterpart of chr()
    unicode_char = (
        unichr  # pylint:disable=C0103,undefined-variable  # noqa: F821
    )

    def byte_to_int(data):
        # type: (Union[int, bytes, bytearray]) -> int
        """
        Converts the first byte of the given data to an integer

        :param data: An integer (returned as is) or a non-empty ``str`` or
                     ``bytearray`` object
        :return: The value of the first byte, as an integer
        :raises ValueError: Unsupported input type, or empty buffer
        """
        if isinstance(data, int):
            return data

        if isinstance(data, (str, bytearray)):
            if not data:
                # Report an empty buffer like any other invalid input
                # instead of leaking an IndexError from the indexing below
                raise ValueError(
                    "Expected byte or int as input, got: empty {0}".format(
                        type(data).__name__
                    )
                )

            first = data[0]
            # A bytearray item is already an integer; a str item is a
            # 1-character string which must go through ord()
            return first if isinstance(first, int) else ord(first)

        raise ValueError(
            "Expected byte or int as input, got: {0}".format(
                type(data).__name__
            )
        )
77

78

79
# ------------------------------------------------------------------------------
80

81

82
class DecodeMap(object):  # pylint:disable=R0205
    """
    A utility class which manages masking, comparing and mapping in bits.
    If the mask and compare fails, this will raise UnicodeDecodeError so
    encode and decode will correctly handle bad characters.
    """

    def __init__(self, count, mask, value, bits):
        """
        Initialize a DecodeMap, entry from a static dictionary for the module.
        It automatically calculates the mask for the bits for the value
        (always assumed to be at the bottom of the byte).

        :param count: The number of bytes in this entire sequence.
        :param mask: The mask to apply to the byte at this position.
        :param value: The value of masked bits, (without shifting).
        :param bits: The number of bits.
        """
        self.count = count
        self.mask = mask
        self.value = value
        self.bits = bits
        # Selects the ``bits`` lowest bits of a byte (0 when bits == 0)
        self.mask2 = (1 << bits) - 1

    def apply(self, byte, value, data, i, count):
        """
        Apply mask, compare to expected value, shift and return result.
        Eventually, this could become a ``reduce`` function.

        :param byte: The byte to compare
        :param value: The currently accumulated value.
        :param data: The data buffer, (array of bytes).
        :param i: The position within the data buffer.
        :param count: The position of this comparison.
        :return: A new value with the bits merged in.
        :raises UnicodeDecodeError: if marked bits don't match.
        """
        if byte & self.mask != self.value:
            # The marker bits of this byte are not the expected ones
            raise UnicodeDecodeError(
                NAME,
                data,
                i,
                i + count,
                "invalid {}-byte sequence".format(self.count),
            )

        # Make room for the new payload bits, then merge them in
        return (value << self.bits) | (byte & self.mask2)

    def __repr__(self):
        """
        Returns a debug representation listing every field in hexadecimal
        """
        fields = ("count", "mask", "value", "bits", "mask2")
        return "DecodeMap({})".format(
            ", ".join(
                "{}=0x{:02x}".format(name, getattr(self, name))
                for name in fields
            )
        )
139

140

141
# Raw description of the bytes following the lead byte of a sequence.
# Key: total number of bytes in the sequence.
# Value: one (mask, expected masked value, payload bits) entry for each byte
# after the lead byte.
DECODER_MAP = {
    # 110xxxxx 10xxxxxx
    2: ((0xC0, 0x80, 6),),
    # 1110xxxx 10xxxxxx 10xxxxxx
    3: ((0xC0, 0x80, 6), (0xC0, 0x80, 6)),
    # 0xED 1010xxxx 10xxxxxx 0xED 1011xxxx 10xxxxxx
    6: (
        (0xF0, 0xA0, 4),
        (0xC0, 0x80, 6),
        (0xFF, 0xED, 0),  # fixed marker byte: carries no payload bits
        (0xF0, 0xB0, 4),
        (0xC0, 0x80, 6),
    ),
}

# Same table, with each raw entry wrapped in a ready-to-use DecodeMap
DECODE_MAP = {
    count: tuple(DecodeMap(count, *entry) for entry in entries)
    for count, entries in DECODER_MAP.items()
}
156

157

158
def decoder(data):
    """
    This generator processes a sequence of bytes in Modified UTF-8 encoding
    and produces a sequence of unicode string characters.

    The 1, 2 and 3-byte forms are decoded like standard UTF-8, except that
    a raw zero byte is rejected. A high surrogate (``ED A0..AF xx``)
    immediately followed by a low surrogate (``ED B0..BF xx``) is merged
    into the supplementary character the pair stands for. Any other 3-byte
    sequence starting with ``0xED`` is decoded on its own, which covers both
    U+D000..U+D7FF and unpaired surrogates.

    The input is copied into a buffer when the generator is first advanced.

    :param data: a string of bytes in Modified UTF-8 encoding, or any
                 iterable of integers in the range 0..255.
    :return: a generator producing a string of unicode characters
    :raises UnicodeDecodeError: unrecognised byte in sequence encountered.
    """
    # A concrete buffer gives integers on both Python 2 and 3, allows looking
    # ahead for the second half of a surrogate pair and can be handed to
    # UnicodeDecodeError, which rejects arbitrary iterables
    buffer = bytearray(data)
    size = len(buffer)

    def error(start, end, reason):
        # UnicodeDecodeError needs a real bytes object (str in Python 2)
        return UnicodeDecodeError(NAME, bytes(buffer), start, end, reason)

    def read_sequence(start, width):
        # Payload of the lead byte: 5 bits for 110xxxxx, 4 bits for 1110xxxx
        value = buffer[start] & (0x1F if width == 2 else 0x0F)
        for index in range(start + 1, start + width):
            if index >= size:
                raise error(start, index, "incomplete byte sequence")

            byte = buffer[index]
            if byte & 0xC0 != 0x80:
                raise error(
                    start, index, "invalid {}-byte sequence".format(width)
                )

            # Each continuation byte (10xxxxxx) brings 6 more bits
            value = (value << 6) | (byte & 0x3F)
        return value

    def low_surrogate_at(start):
        # True if a complete ``ED B0..BF 80..BF`` sequence begins at start
        return (
            start + 2 < size
            and buffer[start] == 0xED
            and buffer[start + 1] & 0xF0 == 0xB0
            and buffer[start + 2] & 0xC0 == 0x80
        )

    def merge_surrogates(high, low):
        # Standard UTF-16 surrogate pair to code point conversion
        code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
        try:
            return mutf8_unichr(code_point)
        except ValueError:
            # Narrow Python 2 builds cannot build characters above U+FFFF:
            # keep the character as its two UTF-16 code units
            return mutf8_unichr(high) + mutf8_unichr(low)

    pos = 0
    while pos < size:
        lead = buffer[pos]
        if lead == 0x00:  # 00000000
            raise error(pos, pos + 1, "embedded zero-byte not allowed")

        if not lead & 0x80:  # 0xxxxxxx
            yield mutf8_unichr(lead)
            pos += 1
        elif not lead & 0x40:  # 10xxxxxx
            raise error(pos, pos + 1, "misplaced continuation character")
        elif not lead & 0x20:  # 110xxxxx
            yield mutf8_unichr(read_sequence(pos, 2))
            pos += 2
        elif lead & 0x10:  # 1111xxxx
            raise error(pos, pos + 1, "invalid encoding character")
        else:  # 1110xxxx
            value = read_sequence(pos, 3)
            pos += 3
            if 0xD800 <= value <= 0xDBFF and low_surrogate_at(pos):
                # Supplementary character stored as two 3-byte surrogates
                low = read_sequence(pos, 3)
                pos += 3
                yield merge_surrogates(value, low)
            else:
                yield mutf8_unichr(value)
218

219

220
def decode_modified_utf8(data, errors="strict"):
    """
    Decodes a sequence of bytes to a unicode text and length using
    Modified UTF-8.
    This function is designed to be used with Python ``codecs`` module.

    NOTE(review): the returned length is the number of decoded characters
    (replacement characters included), as it has always been in this module.
    The ``codecs`` API describes this value as the consumed input length;
    confirm with callers before changing it.

    :param data: a string of bytes in Modified UTF-8
    :param errors: handle decoding errors: ``"strict"`` raises the error,
                   ``"replace"`` emits U+FFFD for each invalid sequence and
                   any other value skips invalid sequences, like ``"ignore"``
    :return: unicode text and length
    :raises UnicodeDecodeError: sequence is invalid.
    """
    # Normalise the input to a buffer of integers: it can be sliced to
    # restart decoding after an error, and UnicodeDecodeError accepts it
    # where it would reject a generator
    buffer = bytearray(byte_to_int(item) for item in data)

    chars = []
    length = 0
    offset = 0
    while True:
        try:
            for char in decoder(buffer[offset:]):
                chars.append(char)
                length += 1

            # The remaining input has been fully decoded
            break
        except UnicodeDecodeError as ex:
            if errors == "strict":
                # Only the first error can reach this point, while offset is
                # still 0, so the positions it reports are absolute
                raise

            if errors == "replace":
                chars.append("\uFFFD")
                length += 1

            # A generator which raised is finished: start a new one right
            # after the invalid sequence (always moving forward)
            offset += max(ex.end, 1)

    return "".join(chars), length
249

250

251
def mutf8_unichr(value):
    """
    Mimics Python 2 unichr() and Python 3 chr()

    :param value: An integer code point
    :return: The result of the module-level ``unicode_char`` alias for
             that code point
    """
    return unicode_char(value)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc