8591112367

Committed 07 Apr 2024 07:18PM UTC coverage: 78.701%. First build

Build # 8591112367

Build Type

push

github

Committed by

tcalmant

Commit Message

Added 3.11 & 3.12 to GitHub actions

Run Details

1611 of 2047 relevant lines covered (78.7%)

4.71 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

63.33

/javaobj/modifiedutf8.py

#!/usr/bin/python
# -- Content-Encoding: utf-8 --
"""
Implements the support of the Java-specific kind of UTF-8 encoding.

This module is a modified version of ``py2jdbc.mutf8`` provided by
`@guywithface <https://github.com/guywithface>`_.

The project the original file comes from is available at:
https://github.com/swstephe/py2jdbc/

:authors: Scott Stephens (@swstephe), @guywithface
:license: Apache License 2.0
:version: 0.4.4
:status: Alpha
"""

from __future__ import unicode_literals

import sys


# Version of this module, as a tuple and as a dotted string
__version_info__ = (0, 4, 4)
__version__ = ".".join(map(str, __version_info__))

# Markup used in the documentation strings of this module
__docformat__ = "restructuredtext en"

# Name of the encoding, as reported in the UnicodeDecodeError instances
# raised by this module (not cesu-8, which uses a different zero-byte)
NAME = "mutf8"

# ------------------------------------------------------------------------------

if sys.version_info[0] >= 3:
    # Python 3: chr() is the code point to character converter
    unicode_char = chr  # pylint:disable=C0103

    def byte_to_int(data):
        # type: (bytes) -> int
        """
        Returns the integer value of the first byte of the given data

        :param data: An integer (returned as is) or a bytes object
        :return: The value of the first byte
        :raises ValueError: The argument is neither an int nor bytes
        """
        if not isinstance(data, (int, bytes)):
            raise ValueError(
                "Expected byte or int as input, got: {0}".format(
                    type(data).__name__
                )
            )

        # Indexing a bytes object already gives an integer in Python 3
        return data if isinstance(data, int) else data[0]


else:
    # Python 2: unichr() is the code point to character converter
    unicode_char = (
        unichr  # pylint:disable=C0103,undefined-variable  # noqa: F821
    )

    def byte_to_int(data):
        # type: (bytes) -> int
        """
        Returns the integer value of the first byte of the given data

        :param data: An integer (returned as is) or a str object
        :return: The value of the first byte
        :raises ValueError: The argument is neither an int nor a str
        """
        if not isinstance(data, (int, str)):
            raise ValueError(
                "Expected byte or int as input, got: {0}".format(
                    type(data).__name__
                )
            )

        # Indexing a Python 2 str gives a 1-character str: convert it
        return data if isinstance(data, int) else ord(data[0])


# ------------------------------------------------------------------------------


class DecodeMap(object):  # pylint:disable=R0205
    """
    Describes how one byte following the lead byte of a Modified UTF-8
    sequence is validated and merged into the value being decoded.

    A byte which doesn't carry the expected marker bits is reported with a
    UnicodeDecodeError, so that callers can apply their error policy.
    """

    def __init__(self, count, mask, value, bits):
        """
        Stores the description of the expected byte and computes the mask
        selecting its payload, i.e. its ``bits`` low-order bits.

        :param count: Total number of bytes of the sequence this byte
                      belongs to (only used in error messages)
        :param mask: Mask selecting the marker bits of the byte
        :param value: Expected value of the marker bits (not shifted)
        :param bits: Number of payload bits carried by the byte
        """
        self.count, self.mask = count, mask
        self.value, self.bits = value, bits
        # Mask of the payload: the ``bits`` low-order bits of the byte
        self.mask2 = (1 << bits) - 1

    def apply(self, byte, value, data, i, count):
        """
        Checks the marker bits of the byte, then appends its payload bits
        to the value decoded so far.

        :param byte: The byte to check and merge
        :param value: The value accumulated so far
        :param data: The decoded buffer (only used to describe the error)
        :param i: Position in the buffer of the start of the sequence
        :param count: Offset of this byte from the start of the sequence
        :return: The accumulated value, shifted and completed by the payload
        :raises UnicodeDecodeError: The marker bits are not the expected ones
        """
        if byte & self.mask != self.value:
            raise UnicodeDecodeError(
                NAME,
                data,
                i,
                i + count,
                "invalid {}-byte sequence".format(self.count),
            )

        # Make room for the payload, then add it
        return (value << self.bits) | (byte & self.mask2)

    def __repr__(self):
        fields = ("count", "mask", "value", "bits", "mask2")
        details = [
            "{}=0x{:02x}".format(name, getattr(self, name)) for name in fields
        ]
        return "DecodeMap({})".format(", ".join(details))


# For each kind of sequence (keyed by its total size in bytes), the
# (mask, expected marker value, payload bits) triplet of every byte which
# follows the lead byte
DECODER_MAP = {
    # 2 bytes: one 10xxxxxx byte after the lead byte
    2: ((0xC0, 0x80, 6),),
    # 3 bytes: two 10xxxxxx bytes after the lead byte
    3: ((0xC0, 0x80, 6), (0xC0, 0x80, 6)),
    # 6 bytes: 1010xxxx 10xxxxxx, a 0xED byte (no payload), 1011xxxx 10xxxxxx
    6: (
        (0xF0, 0xA0, 4),
        (0xC0, 0x80, 6),
        (0xFF, 0xED, 0),
        (0xF0, 0xB0, 4),
        (0xC0, 0x80, 6),
    ),
}

# Same table, with each triplet converted to a ready-to-use DecodeMap
DECODE_MAP = {
    size: tuple(DecodeMap(size, *step) for step in steps)
    for size, steps in DECODER_MAP.items()
}


def decoder(data):
    """
    This generator processes a sequence of bytes in Modified UTF-8 encoding
    and produces a sequence of unicode string characters.

    The lead byte of each sequence selects its kind:

    * ``0xxxxxxx``: a single byte (the zero byte itself is rejected)
    * ``110xxxxx``: a 2-byte sequence
    * ``1110xxxx``: a 3-byte sequence
    * ``0xED`` followed by ``0xA0..0xAF``: a surrogate pair stored as two
      3-byte sequences (6 bytes), decoded as a single supplementary
      character (U+10000 and above)

    The bytes following the lead byte are checked and merged using the
    ``DecodeMap`` entries of ``DECODE_MAP``.

    :param data: an iterable of integers in the 0-255 range: a ``bytes``
                 object (Python 3), a ``bytearray``, a generator, ...
    :return: a generator producing a string of unicode characters
    :raises UnicodeDecodeError: unrecognised byte in sequence encountered.
    :raises ValueError: an item of ``data`` is not in the 0-255 range
    """
    # UnicodeDecodeError requires a bytes-like object to describe the error:
    # given a generator, it fails with a TypeError which hides the real
    # problem. The input is therefore materialised once, when the first
    # character is requested.
    # iter() ensures that only real iterables are accepted, as bytearray(5)
    # would silently build five zero bytes.
    buffer = bytearray(iter(data))
    raw = bytes(buffer)

    def next_byte(_it, start, count):
        # Reads the byte located ``count`` positions after the lead byte
        # found at ``start``
        try:
            return next(_it)[1]
        except StopIteration:
            raise UnicodeDecodeError(
                NAME, raw, start, start + count, "incomplete byte sequence"
            )

    # Iterate the bytearray, not ``raw``: it gives integers on Python 2 too
    it = iter(enumerate(buffer))
    for i, d in it:
        if d == 0x00:  # 00000000
            raise UnicodeDecodeError(
                NAME, raw, i, i + 1, "embedded zero-byte not allowed"
            )

        if not d & 0x80:  # 0xxxxxxx
            value = d
        elif not d & 0x40:  # 10xxxxxx
            raise UnicodeDecodeError(
                NAME, raw, i, i + 1, "misplaced continuation character"
            )
        elif not d & 0x20:  # 110xxxxx
            value = d & 0x1F
            for i1, dm in enumerate(DECODE_MAP[2]):
                d1 = next_byte(it, i, i1 + 1)
                value = dm.apply(d1, value, raw, i, i1 + 1)
        elif d & 0x10:  # 1111xxxx
            raise UnicodeDecodeError(
                NAME, raw, i, i + 1, "invalid encoding character"
            )
        elif d != 0xED:  # 1110xxxx
            value = d & 0x0F
            for i1, dm in enumerate(DECODE_MAP[3]):
                d1 = next_byte(it, i, i1 + 1)
                value = dm.apply(d1, value, raw, i, i1 + 1)
        else:
            # 0xED starts either a plain 3-byte sequence (U+D000..U+D7FF,
            # second byte in 0x80..0x9F) or a surrogate pair: the second
            # byte is required to tell them apart
            d1 = next_byte(it, i, 1)
            if (d1 & 0xE0) == 0x80:
                value, steps, base = d & 0x0F, DECODE_MAP[3], 0
            else:
                # The pair carries 20 bits, which are an offset from
                # U+10000, the first supplementary code point
                value, steps, base = 0, DECODE_MAP[6], 0x10000

            for i1, dm in enumerate(steps):
                if i1:
                    # The first byte of the tail has already been read
                    d1 = next_byte(it, i, i1 + 1)
                value = dm.apply(d1, value, raw, i, i1 + 1)
            value += base

        # NOTE(review): a narrow Python 2 build is expected to reject code
        # points above U+FFFF in unichr() - confirm whether such
        # interpreters still have to be supported
        # noinspection PyCompatibility
        yield mutf8_unichr(value)


def decode_modified_utf8(data, errors="strict"):
    """
    Decodes a sequence of bytes to a unicode text and length using
    Modified UTF-8.
    This function is designed to be used with Python ``codecs`` module.

    Error policies:

    * ``"strict"``: the first decoding error is raised to the caller
    * ``"replace"``: each invalid sequence is replaced by U+FFFD
    * any other value (including ``"ignore"``): invalid sequences are
      dropped

    With the non-strict policies, decoding resumes right after the bytes
    reported by the error, so that the valid data which follows an invalid
    sequence is kept.

    NOTE(review): the returned length is the number of characters produced,
    whereas ``codecs`` decoders conventionally return the number of bytes
    consumed - confirm what the callers expect before changing it.

    :param data: a string of bytes in Modified UTF-8
    :param errors: handle decoding errors
    :return: unicode text and length (number of characters produced)
    :raises UnicodeDecodeError: sequence is invalid.
    :raises ValueError: an item of ``data`` is neither a byte nor an int
    """
    # A sliceable buffer of integers is required to restart the decoder
    # after an error: a generator which raised can't be resumed.
    # It also lets the decoder describe its errors, as UnicodeDecodeError
    # rejects generators.
    raw = bytearray(byte_to_int(d) for d in data)

    chars = []
    offset = 0
    while offset < len(raw):
        try:
            for char in decoder(raw[offset:]):
                chars.append(char)

            # The end of the buffer has been reached without error
            break
        except UnicodeDecodeError as ex:
            if errors == "strict":
                raise

            if errors == "replace":
                chars.append("\uFFFD")

            # Positions are relative to the slice given to the decoder.
            # Always move forward by at least one byte to avoid looping on
            # the same error.
            offset += max(ex.end, 1)

    return "".join(chars), len(chars)


def mutf8_unichr(value):
    """
    Converts a code point to its unicode character, using the converter
    selected for the running interpreter: ``unichr()`` on Python 2,
    ``chr()`` on Python 3

    :param value: The code point to convert
    :return: The matching unicode character
    """
    character = unicode_char(value)
    return character

1	#!/usr/bin/python
2	# -- Content-Encoding: utf-8 --
3	"""	6✔
4	Implements the support of the Java-specific kind of UTF-8 encoding.
5
6	This module is a modified version of ``py2jdbc.mutf8`` provided by
7	`@guywithface <https://github.com/guywithface>`_.
8
9	The project the original file comes from is available at:
10	https://github.com/swstephe/py2jdbc/
11
12	:authors: Scott Stephens (@swstephe), @guywithface
13	:license: Apache License 2.0
14	:version: 0.4.4
15	:status: Alpha
16	"""
17
18	from __future__ import unicode_literals	6✔
19
20	import sys	6✔
21
22
23	# Module version
24	__version_info__ = (0, 4, 4)	6✔
25	__version__ = ".".join(str(x) for x in __version_info__)	6✔
26
27	# Documentation strings format
28	__docformat__ = "restructuredtext en"	6✔
29
30	# Encoding name: not cesu-8, which uses a different zero-byte
31	NAME = "mutf8"	6✔
32
33	# ------------------------------------------------------------------------------
34
35	if sys.version_info[0] >= 3:	6✔
36	unicode_char = chr # pylint:disable=C0103	6✔
37
38	def byte_to_int(data):	6✔
39	# type: (bytes) -> int
40	"""
41	Converts the first byte of the given data to an integer
42	"""
43	if isinstance(data, int):	6✔
44	return data	6✔
45
46	if isinstance(data, bytes):	×
47	return data[0]	×
48
49	raise ValueError(	×
50	"Expected byte or int as input, got: {0}".format(
51	type(data).__name__
52	)
53	)
54
55
56	else:
57	unicode_char = (	×
58	unichr # pylint:disable=C0103,undefined-variable # noqa: F821
59	)
60
61	def byte_to_int(data):	×
62	# type: (bytes) -> int
63	"""
64	Converts the first byte of the given data to an integer
65	"""
66	if isinstance(data, int):	×
67	return data	×
68
69	if isinstance(data, str):	×
70	return ord(data[0])	×
71
72	raise ValueError(	×
73	"Expected byte or int as input, got: {0}".format(
74	type(data).__name__
75	)
76	)
77
78
79	# ------------------------------------------------------------------------------
80
81
82	class DecodeMap(object): # pylint:disable=R0205	6✔
83	"""
84	A utility class which manages masking, comparing and mapping in bits.
85	If the mask and compare fails, this will raise UnicodeDecodeError so
86	encode and decode will correctly handle bad characters.
87	"""
88
89	def __init__(self, count, mask, value, bits):	6✔
90	"""
91	Initialize a DecodeMap, entry from a static dictionary for the module.
92	It automatically calculates the mask for the bits for the value
93	(always assumed to be at the bottom of the byte).
94
95	:param count: The number of bytes in this entire sequence.
96	:param mask: The mask to apply to the byte at this position.
97	:param value: The value of masked bits, (without shifting).
98	:param bits: The number of bits.
99	"""
100	self.count = count	6✔
101	self.mask = mask	6✔
102	self.value = value	6✔
103	self.bits = bits	6✔
104	self.mask2 = (1 << bits) - 1	6✔
105
106	def apply(self, byte, value, data, i, count):	6✔
107	"""
108	Apply mask, compare to expected value, shift and return result.
109	Eventually, this could become a ``reduce`` function.
110
111	:param byte: The byte to compare
112	:param value: The currently accumulated value.
113	:param data: The data buffer, (array of bytes).
114	:param i: The position within the data buffer.
115	:param count: The position of this comparison.
116	:return: A new value with the bits merged in.
117	:raises UnicodeDecodeError: if marked bits don't match.
118	"""
119	if byte & self.mask == self.value:	6✔
120	value <<= self.bits	6✔
121	value \|= byte & self.mask2	6✔
122	else:
123	raise UnicodeDecodeError(	×
124	NAME,
125	data,
126	i,
127	i + count,
128	"invalid {}-byte sequence".format(self.count),
129	)
130	return value	6✔
131
132	def __repr__(self):	6✔
133	return "DecodeMap({})".format(	×
134	", ".join(
135	"{}=0x{:02x}".format(n, getattr(self, n))
136	for n in ("count", "mask", "value", "bits", "mask2")
137	)
138	)
139
140
141	DECODER_MAP = {	6✔
142	2: ((0xC0, 0x80, 6),),
143	3: ((0xC0, 0x80, 6), (0xC0, 0x80, 6)),
144	6: (
145	(0xF0, 0xA0, 4),
146	(0xC0, 0x80, 6),
147	(0xFF, 0xED, 0),
148	(0xF0, 0xB0, 4),
149	(0xC0, 0x80, 6),
150	),
151	}
152
153	DECODE_MAP = dict(	6✔
154	(k, tuple(DecodeMap(k, *vv) for vv in v)) for k, v in DECODER_MAP.items()
155	)
156
157
158	def decoder(data):	6✔
159	"""
160	This generator processes a sequence of bytes in Modified UTF-8 encoding
161	and produces a sequence of unicode string characters.
162
163	It takes bits from the byte until it matches one of the known encoding
164	sequences.
165	It uses ``DecodeMap`` to mask, compare and generate values.
166
167	:param data: a string of bytes in Modified UTF-8 encoding.
168	:return: a generator producing a string of unicode characters
169	:raises UnicodeDecodeError: unrecognised byte in sequence encountered.
170	"""
171
172	def next_byte(_it, start, count):	6✔
173	try:	6✔
174	return next(_it)[1]	6✔
175	except StopIteration:	×
176	raise UnicodeDecodeError(	×
177	NAME, data, start, start + count, "incomplete byte sequence"
178	)
179
180	it = iter(enumerate(data))	6✔
181	for i, d in it:	6✔
182	if d == 0x00: # 00000000	6✔
183	raise UnicodeDecodeError(	×
184	NAME, data, i, i + 1, "embedded zero-byte not allowed"
185	)
186
187	if d & 0x80: # 1xxxxxxx	6✔
188	if d & 0x40: # 11xxxxxx	6✔
189	if d & 0x20: # 111xxxxx	6✔
190	if d & 0x10: # 1111xxxx	6✔
191	raise UnicodeDecodeError(	×
192	NAME, data, i, i + 1, "invalid encoding character"
193	)
194
195	if d == 0xED:	6✔
196	value = 0	×
197	for i1, dm in enumerate(DECODE_MAP[6]):	×
198	d1 = next_byte(it, i, i1 + 1)	×
199	value = dm.apply(d1, value, data, i, i1 + 1)	×
200	else: # 1110xxxx
201	value = d & 0x0F	6✔
202	for i1, dm in enumerate(DECODE_MAP[3]):	6✔
203	d1 = next_byte(it, i, i1 + 1)	6✔
204	value = dm.apply(d1, value, data, i, i1 + 1)	6✔
205	else: # 110xxxxx
206	value = d & 0x1F	×
207	for i1, dm in enumerate(DECODE_MAP[2]):	×
208	d1 = next_byte(it, i, i1 + 1)	×
209	value = dm.apply(d1, value, data, i, i1 + 1)	×
210	else: # 10xxxxxx
211	raise UnicodeDecodeError(	×
212	NAME, data, i, i + 1, "misplaced continuation character"
213	)
214	else: # 0xxxxxxx
215	value = d	6✔
216	# noinspection PyCompatibility
217	yield mutf8_unichr(value)	6✔
218
219
220	def decode_modified_utf8(data, errors="strict"):	6✔
221	"""
222	Decodes a sequence of bytes to a unicode text and length using
223	Modified UTF-8.
224	This function is designed to be used with Python ``codecs`` module.
225
226	:param data: a string of bytes in Modified UTF-8
227	:param errors: handle decoding errors
228	:return: unicode text and length
229	:raises UnicodeDecodeError: sequence is invalid.
230	"""
231	value, length = "", 0	6✔
232	it = iter(decoder(byte_to_int(d) for d in data))	6✔
233	while True:	4✔
234	try:	6✔
235	value += next(it)	6✔
236	length += 1	6✔
237	except StopIteration:	6✔
238	break	6✔
239	except UnicodeDecodeError as e:	×
240	if errors == "strict":	×
241	raise e	×
242
243	if errors == "ignore":	×
244	pass	×
245	elif errors == "replace":	×
246	value += "\uFFFD"	×
247	length += 1	×
248	return value, length	6✔
249
250
251	def mutf8_unichr(value):	6✔
252	"""
253	Mimics Python 2 unichr() and Python 3 chr()
254	"""
255	return unicode_char(value)	6✔

tcalmant / python-javaobj / 8591112367

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous