11640651655

Committed 02 Nov 2024 06:38AM UTC coverage: 33.168%. First build

Build # 11640651655

Build Type

Pull #962

github

Committed by

web-flow

Commit Message

Merge 22aa4c3d6 into 1c9a2432a

Pull Request Pull Request #962: Fix expand maiyamok

Run Details

25 of 26 new or added lines in 1 file covered. (96.15%)

2479 of 7474 relevant lines covered (33.17%)

3.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.55

/pythainlp/util/normalize.py

﻿# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Text normalization
"""

import re
from typing import List, Union

from pythainlp import thai_above_vowels as above_v
from pythainlp import thai_below_vowels as below_v
from pythainlp import thai_follow_vowels as follow_v
from pythainlp import thai_lead_vowels as lead_v
from pythainlp import thai_tonemarks as tonemarks
from pythainlp.tokenize import word_tokenize
from pythainlp.tools import warn_deprecation

# Characters that are treated as "dangling" when they start a text:
# above vowels, below vowels, tone marks, and the four signs
# Phinthu (U+0E3A), Thanthakhat (U+0E4C), Nikhahit (U+0E4D),
# and Yamakkan (U+0E4E).
_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
# Matches one or more dangling characters anchored at the start of the text.
_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")

_ZERO_WIDTH_CHARS = "\u200b\u200c"  # ZWSP, ZWNJ

# (regex pattern, replacement) pairs used by reorder_vowels().
# The pairs are applied one after another, in list order.
_REORDER_PAIRS = [
    ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
    (
        f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
        "\\2\\1",
    ),  # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat
    (
        f"\u0e4d([{tonemarks}]*)\u0e32",
        "\\1\u0e33",
    ),  # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am
    (
        f"([{follow_v}]+)([{tonemarks}]+)",
        "\\2\\1",
    ),  # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
    # Only rewritten when preceded by a character other than U+0E24/U+0E26
    ("([^\u0e24\u0e26])\u0e45", "\\1\u0e32"),  # Lakkhangyao -> Sara Aa
]

# Characters that should not appear repeatedly in a row:
# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
_NOREPEAT_CHARS = (
    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
)
# One (regex pattern, replacement) pair per character in _NOREPEAT_CHARS.
# Each pattern matches two or more occurrences of that character, optionally
# separated by spaces; the replacement is the single character itself.
_NOREPEAT_PAIRS = list(
    zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS)
)

# Matches a run of one or more consecutive tone marks.
_RE_TONEMARKS = re.compile(f"[{tonemarks}]+")

# Matches a newline together with any spaces/newlines around it.
_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")


def _last_char(matchobj):
    """
    Return the last character of the text matched by ``matchobj``.

    Used as the replacement callback for ``_RE_TONEMARKS.sub()``, so that
    a run of tone marks is reduced to its final tone mark.

    :param matchobj: regular expression match object
    :return: last character of the whole match
    :rtype: str
    """
    matched_text = matchobj.group(0)
    return matched_text[-1]


def remove_dangling(text: str) -> str:
    """
    Strip Thai non-base characters from the start of the text.

    Non-base characters (tone marks, above vowels, below vowels, and
    non-base signs) have no base character to attach to when they are
    at the very beginning of the text. They are often typed by accident,
    for example in a form input field, and can be hard for the user to see.

    Only the leading run of such characters is removed; the same
    characters elsewhere in the text are left untouched.

    :param str text: input text
    :return: text without dangling Thai characters at the beginning
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_dangling

        remove_dangling("๊ก")
        # output: 'ก'
    """
    # The pattern is anchored at the start, so a match (if any) is
    # exactly the leading run of dangling characters.
    leading = _RE_REMOVE_DANGLINGS.match(text)
    if leading is None:
        return text
    return text[leading.end():]


def remove_dup_spaces(text: str) -> str:
    """
    Collapse repeated spaces and newlines.

    Every run of two or more spaces becomes a single space. After that,
    every newline, together with the spaces and newlines around it, becomes
    a single newline (so empty lines disappear). Finally, leading and
    trailing whitespace is stripped.

    :param str text: input text
    :return: text without duplicated spaces and newlines
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_dup_spaces

        remove_dup_spaces("ก    ข    ค")
        # output: 'ก ข ค'
    """
    single_spaced = re.sub(" {2,}", " ", text)
    single_lined = _RE_REMOVE_NEWLINES.sub("\n", single_spaced)
    return single_lined.strip()


def remove_tonemark(text: str) -> str:
    """
    Strip every Thai tone mark from the text.

    Thai script has four tone marks:

        * Down tone (Thai: ไม้เอก  _่ )
        * Falling tone  (Thai: ไม้โท  _้ )
        * High tone (Thai: ไม้ตรี  _๊ )
        * Rising tone (Thai: ไม้จัตวา _๋ )

    Using the wrong tone mark is a common mistake in Thai writing.
    Text with the tone marks removed can be used for approximate
    string matching.

    :param str text: input text
    :return: text without Thai tone marks
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_tonemark

        remove_tonemark("สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด")
        # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
    """
    # str.replace() already removes every occurrence in one call
    for mark in tonemarks:
        text = text.replace(mark, "")
    return text


def remove_zw(text: str) -> str:
    """
    Strip zero-width characters from the text.

    These characters are invisible, so text containing them can behave
    unexpectedly from the user's point of view. Removing them makes
    string matching more robust.

    Characters to be removed:

        * Zero-width space (ZWSP)
        * Zero-width non-joiner (ZWNJ)

    :param str text: input text
    :return: text without zero-width characters
    :rtype: str
    """
    # str.replace() already removes every occurrence in one call
    for zero_width in _ZERO_WIDTH_CHARS:
        text = text.replace(zero_width, "")
    return text


def reorder_vowels(text: str) -> str:
    """
    Put vowels and tone marks into the standard logical order/spelling.

    The rewrite rules in ``_REORDER_PAIRS`` are applied one after another,
    in this order:

        * Sara E + Sara E -> Sara Ae
        * tone mark/Thanthakhat + above/below vowel
          -> above/below vowel + tone mark/Thanthakhat
        * Nikhahit + (tone marks) + Sara Aa -> (tone marks) + Sara Am
        * follow vowel + tone mark -> tone mark + follow vowel
        * Lakkhangyao -> Sara Aa, when it is preceded by a character
          other than U+0E24 or U+0E26

    :param str text: input text
    :return: text with vowels and tone marks in the standard logical order
    :rtype: str
    """
    for pattern, replacement in _REORDER_PAIRS:
        text = re.sub(pattern, replacement, text)
    return text


def remove_repeat_vowels(text: str) -> str:
    """
    Collapse repeated vowels, tone marks, and signs.

    reorder_vowels() is applied first, so that a double Sara E is turned
    into Sara Ae instead of being collapsed into a single Sara E.
    Then each repeated vowel/sign (possibly separated by spaces) is reduced
    to one occurrence, and each run of tone marks is reduced to
    the last tone mark in the run.

    :param str text: input text
    :return: text without repeating Thai vowels, tone marks, and signs
    :rtype: str
    """
    reordered = reorder_vowels(text)

    for pattern, single_char in _NOREPEAT_PAIRS:
        reordered = re.sub(pattern, single_char, reordered)

    # keep only the last tone mark of each run of tone marks
    return _RE_TONEMARKS.sub(_last_char, reordered)


def normalize(text: str) -> str:
    """
    Clean and normalize Thai text by applying these rules:

        * Remove zero-width spaces
        * Remove duplicate spaces
        * Reorder tone marks and vowels to standard order/spelling
        * Remove duplicate vowels and signs
        * Remove duplicate tone marks
        * Remove dangling non-base characters at the beginning of text

    This is a fixed pipeline: remove_zw(), remove_dup_spaces(),
    remove_repeat_vowels(), and remove_dangling() are called in that order.

    To apply a different selection or order of rules, call those
    functions directly.

    Note: for Unicode normalization, see unicodedata.normalize().

    :param str text: input text
    :return: normalized text according to the rules
    :rtype: str

    :Example:
    ::

        from pythainlp.util import normalize

        normalize("เเปลก")  # starts with two Sara E
        # output: แปลก

        normalize("นานาาา")
        # output: นานา
    """
    pipeline = (
        remove_zw,
        remove_dup_spaces,
        remove_repeat_vowels,
        remove_dangling,
    )
    for step in pipeline:
        text = step(text)
    return text


def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function replaces each Maiyamok with one more copy
    of the word it follows. Consecutive Maiyamok add one copy each.

    A string input is tokenized with word_tokenize() first. Maiyamok
    attached to other text inside a token (e.g. "นกๆๆ", "นกๆคน") is
    split off before expansion.

    Whitespace-only tokens between a word and its Maiyamok are dropped.
    A Maiyamok that has no word before it is dropped as well.

    :param Union[str, List[str]] sent: input sentence (list or str)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("เด็กๆกิน")
        # output: ['เด็ก', 'เด็ก', 'กิน']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)

    yamok = "ๆ"

    # Break Maiyamok that is attached to others, e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน"
    tokens: List[str] = []
    for token in sent:
        # the capture group keeps each Maiyamok as its own item;
        # empty strings produced by the split are discarded
        tokens.extend(part for part in re.split(r"(ๆ)", token) if part)

    # Walk backward, so the number of Maiyamok is known before
    # reaching the word they repeat. The output is built in reverse.
    reversed_output: List[str] = []
    yamok_count = 0
    for token in reversed(tokens):
        if token == yamok:
            yamok_count += 1
        elif yamok_count == 0:
            reversed_output.append(token)
        elif token.isspace():
            # remove space between a word and its Maiyamok
            continue
        else:
            reversed_output.extend([token] * (yamok_count + 1))
            yamok_count = 0

    return reversed_output[::-1]


def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok (deprecated alias).

    Deprecated. Use expand_maiyamok() instead. This function issues
    a deprecation warning and then returns the result of
    expand_maiyamok() for the same input.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. The text is preprocessed by replacing each Maiyamok
    with the word being repeated.

    :param Union[str, List[str]] sent: input sentence (list or str)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("เด็กๆกิน")
        # output: ['เด็ก', 'เด็ก', 'กิน']
    """
    deprecated_name = "pythainlp.util.maiyamok"
    replacement_name = "pythainlp.util.expand_maiyamok"
    warn_deprecation(deprecated_name, replacement_name)

    expanded = expand_maiyamok(sent)
    return expanded

1	# -- coding: utf-8 --
2	# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3	# SPDX-License-Identifier: Apache-2.0
4	"""	2✔
5	Text normalization
6	"""
7
8	import re	10✔
9	from typing import List, Union	10✔
10
11	from pythainlp import thai_above_vowels as above_v	10✔
12	from pythainlp import thai_below_vowels as below_v	10✔
13	from pythainlp import thai_follow_vowels as follow_v	10✔
14	from pythainlp import thai_lead_vowels as lead_v	10✔
15	from pythainlp import thai_tonemarks as tonemarks	10✔
16	from pythainlp.tokenize import word_tokenize	10✔
17	from pythainlp.tools import warn_deprecation	10✔
18
19	_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"	10✔
20	_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")	10✔
21
22	_ZERO_WIDTH_CHARS = "\u200b\u200c" # ZWSP, ZWNJ	10✔
23
24	_REORDER_PAIRS = [	10✔
25	("\u0e40\u0e40", "\u0e41"), # Sara E + Sara E -> Sara Ae
26	(
27	f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
28	"\\2\\1",
29	), # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat
30	(
31	f"\u0e4d([{tonemarks}]*)\u0e32",
32	"\\1\u0e33",
33	), # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am
34	(
35	f"([{follow_v}]+)([{tonemarks}]+)",
36	"\\2\\1",
37	), # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
38	("([^\u0e24\u0e26])\u0e45", "\\1\u0e32"), # Lakkhangyao -> Sara Aa
39	]
40
41	# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
42	_NOREPEAT_CHARS = (	10✔
43	f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
44	)
45	_NOREPEAT_PAIRS = list(	10✔
46	zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS)
47	)
48
49	_RE_TONEMARKS = re.compile(f"[{tonemarks}]+")	10✔
50
51	_RE_REMOVE_NEWLINES = re.compile("[ \n]\n[ \n]")	10✔
52
53
54	def _last_char(matchobj): # to be used with _RE_NOREPEAT_TONEMARKS	10✔
55	return matchobj.group(0)[-1]	10✔
56
57
58	def remove_dangling(text: str) -> str:	10✔
59	"""
60	Remove Thai non-base characters at the beginning of text.
61
62	This is a common "typo", especially for input field in a form,
63	as these non-base characters can be visually hidden from user
64	who may accidentally typed them in.
65
66	A character to be removed should be both:
67
68	* tone mark, above vowel, below vowel, or non-base sign AND
69	* located at the beginning of the text
70
71	:param str text: input text
72	:return: text without dangling Thai characters at the beginning
73	:rtype: str
74
75	:Example:
76	::
77
78	from pythainlp.util import remove_dangling
79
80	remove_dangling("๊ก")
81	# output: 'ก'
82	"""
83	return _RE_REMOVE_DANGLINGS.sub("", text)	10✔
84
85
86	def remove_dup_spaces(text: str) -> str:	10✔
87	"""
88	Remove duplicate spaces. Replace multiple spaces with one space.
89
90	Multiple newline characters and empty lines will be replaced
91	with one newline character.
92
93	:param str text: input text
94	:return: text without duplicated spaces and newlines
95	:rtype: str
96
97	:Example:
98	::
99
100	from pythainlp.util import remove_dup_spaces
101
102	remove_dup_spaces("ก ข ค")
103	# output: 'ก ข ค'
104	"""
105	while " " in text:	10✔
106	text = text.replace(" ", " ")	10✔
107	text = _RE_REMOVE_NEWLINES.sub("\n", text)	10✔
108	text = text.strip()	10✔
109	return text	10✔
110
111
112	def remove_tonemark(text: str) -> str:	10✔
113	"""
114	Remove all Thai tone marks from the text.
115
116	Thai script has four tone marks indicating four tones as follows:
117
118	* Down tone (Thai: ไม้เอก _่ )
119	* Falling tone (Thai: ไม้โท _้ )
120	* High tone (Thai: ไม้ตรี _๊ )
121	* Rising tone (Thai: ไม้จัตวา _๋ )
122
123	Putting wrong tone mark is a common mistake in Thai writing.
124	By removing tone marks from the string, it could be used to
125	for a approximate string matching.
126
127	:param str text: input text
128	:return: text without Thai tone marks
129	:rtype: str
130
131	:Example:
132	::
133
134	from pythainlp.util import remove_tonemark
135
136	remove_tonemark("สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด")
137	# output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
138	"""
139	for ch in tonemarks:	10✔
140	while ch in text:	10✔
141	text = text.replace(ch, "")	10✔
142	return text	10✔
143
144
145	def remove_zw(text: str) -> str:	10✔
146	"""
147	Remove zero-width characters.
148
149	These non-visible characters may cause unexpected result from the
150	user's point of view. Removing them can make string matching more robust.
151
152	Characters to be removed:
153
154	* Zero-width space (ZWSP)
155	* Zero-width non-joiner (ZWJP)
156
157	:param str text: input text
158	:return: text without zero-width characters
159	:rtype: str
160	"""
161	for ch in _ZERO_WIDTH_CHARS:	10✔
162	while ch in text:	10✔
163	text = text.replace(ch, "")	10✔
164
165	return text	10✔
166
167
168	def reorder_vowels(text: str) -> str:	10✔
169	"""
170	Reorder vowels and tone marks to the standard logical order/spelling.
171
172	Characters in input text will be reordered/transformed,
173	according to these rules:
174
175	* Sara E + Sara E -> Sara Ae
176	* Nikhahit + Sara Aa -> Sara Am
177	* tone mark + non-base vowel -> non-base vowel + tone mark
178	* follow vowel + tone mark -> tone mark + follow vowel
179
180	:param str text: input text
181	:return: text with vowels and tone marks in the standard logical order
182	:rtype: str
183	"""
184	for pair in _REORDER_PAIRS:	10✔
185	text = re.sub(pair[0], pair[1], text)	10✔
186
187	return text	10✔
188
189
190	def remove_repeat_vowels(text: str) -> str:	10✔
191	"""
192	Remove repeating vowels, tone marks, and signs.
193
194	This function will call reorder_vowels() first, to make sure that
195	double Sara E will be converted to Sara Ae and not be removed.
196
197	:param str text: input text
198	:return: text without repeating Thai vowels, tone marks, and signs
199	:rtype: str
200	"""
201	text = reorder_vowels(text)	10✔
202	for pair in _NOREPEAT_PAIRS:	10✔
203	text = re.sub(pair[0], pair[1], text)	10✔
204
205	# remove repeating tone marks, use last tone mark
206	text = _RE_TONEMARKS.sub(_last_char, text)	10✔
207
208	return text	10✔
209
210
211	def normalize(text: str) -> str:	10✔
212	"""
213	Normalize and clean Thai text with normalizing rules as follows:
214
215	* Remove zero-width spaces
216	* Remove duplicate spaces
217	* Reorder tone marks and vowels to standard order/spelling
218	* Remove duplicate vowels and signs
219	* Remove duplicate tone marks
220	* Remove dangling non-base characters at the beginning of text
221
222	normalize() simply call remove_zw(), remove_dup_spaces(),
223	remove_repeat_vowels(), and remove_dangling(), in that order.
224
225	If a user wants to customize the selection or the order of rules
226	to be applied, they can choose to call those functions by themselves.
227
228	Note: for Unicode normalization, see unicodedata.normalize().
229
230	:param str text: input text
231	:return: normalized text according to the rules
232	:rtype: str
233
234	:Example:
235	::
236
237	from pythainlp.util import normalize
238
239	normalize("เเปลก") # starts with two Sara E
240	# output: แปลก
241
242	normalize("นานาาา")
243	# output: นานา
244	"""
245	text = remove_zw(text)	10✔
246	text = remove_dup_spaces(text)	10✔
247	text = remove_repeat_vowels(text)	10✔
248	text = remove_dangling(text)	10✔
249
250	return text	10✔
251
252
253	def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:	10✔
254	if isinstance(sent, str):	10✔
255	sent = word_tokenize(sent)	10✔
256
257	# Breaks Maiyamok that attached to others, e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน"
258	temp_toks: list[str] = []	10✔
259	for _, token in enumerate(sent):	10✔
260	toks = re.split(r"(ๆ)", token)	10✔
261	toks = [tok for tok in toks if tok] # remove empty string ("")	10✔
262	temp_toks.extend(toks)	10✔
263	sent = temp_toks	10✔
264
265	output_toks: list[str] = []	10✔
266
267	yamok = "ๆ"	10✔
268	yamok_count = 0	10✔
269	len_sent = len(sent)	10✔
270	for i in range(len_sent - 1, -1, -1): # do it backward	10✔
271	print(i, sent[i])	10✔
272	print(i, output_toks)	10✔
273	if yamok_count == 0 or (i + 1 >= len_sent):	10✔
274	if sent[i] == yamok:	10✔
275	yamok_count = yamok_count + 1	10✔
276	else:
277	output_toks.append(sent[i])	10✔
278	continue	10✔
279
280	if sent[i] == yamok:	10✔
281	yamok_count = yamok_count + 1	10✔
282	else:
283	if sent[i].isspace():	10✔
284	if yamok_count > 0: # remove space before yamok	10✔
285	continue	10✔
286	else: # with preprocessing above, this should not happen
NEW 287	output_toks.append(sent[i])	×
288	else:
289	output_toks.extend([sent[i]] * (yamok_count + 1))	10✔
290	yamok_count = 0	10✔
291
292	return output_toks[::-1]	10✔
293
294
295	def maiyamok(sent: Union[str, List[str]]) -> List[str]:	10✔
296	"""
297	Expand Maiyamok.
298
299	Deprecated. Use expand_maiyamok() instead.
300
301	Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
302	repetition. This function preprocesses Thai text by replacing
303	Maiyamok with a word being repeated.
304
305	:param Union[str, List[str]] sent: input sentence (list or str)
306	:return: list of words
307	:rtype: List[str]
308
309	:Example:
310	::
311
312	from pythainlp.util import expand_maiyamok
313
314	expand_maiyamok("เด็กๆกิน")
315	# output: ['เด็ก', 'เด็ก', 'กิน']
316	"""
317	warn_deprecation(	×
318	"pythainlp.util.maiyamok", "pythainlp.util.expand_maiyamok"
319	)
320	return expand_maiyamok(sent)	×

PyThaiNLP / pythainlp / 11640651655

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous