• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11640651655

02 Nov 2024 06:38AM UTC coverage: 33.168%. First build
11640651655

Pull #962

github

web-flow
Merge 22aa4c3d6 into 1c9a2432a
Pull Request #962: Fix expand maiyamok

25 of 26 new or added lines in 1 file covered. (96.15%)

2479 of 7474 relevant lines covered (33.17%)

3.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.55
/pythainlp/util/normalize.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4
"""
2✔
5
Text normalization
6
"""
7

8
import re
10✔
9
from typing import List, Union
10✔
10

11
from pythainlp import thai_above_vowels as above_v
10✔
12
from pythainlp import thai_below_vowels as below_v
10✔
13
from pythainlp import thai_follow_vowels as follow_v
10✔
14
from pythainlp import thai_lead_vowels as lead_v
10✔
15
from pythainlp import thai_tonemarks as tonemarks
10✔
16
from pythainlp.tokenize import word_tokenize
10✔
17
from pythainlp.tools import warn_deprecation
10✔
18

19
# Characters that cannot stand at the very beginning of a text:
# above vowels, below vowels, tone marks, and the signs
# Phinthu, Thanthakhat, Nikhahit, Yamakkan.
_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")

_ZERO_WIDTH_CHARS = "\u200b\u200c"  # ZWSP, ZWNJ

# (regex pattern, replacement) pairs; reorder_vowels() applies them
# one after another, in the order listed here.
_REORDER_PAIRS = [
    # Sara E + Sara E -> Sara Ae
    ("\u0e40\u0e40", "\u0e41"),
    # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat
    (f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)", "\\2\\1"),
    # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am
    (f"\u0e4d([{tonemarks}]*)\u0e32", "\\1\u0e33"),
    # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
    (f"([{follow_v}]+)([{tonemarks}]+)", "\\2\\1"),
    # Lakkhangyao -> Sara Aa
    ("([^\u0e24\u0e26])\u0e45", "\\1\u0e32"),
]

# Characters that must not repeat:
# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
_NOREPEAT_CHARS = (
    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
)
# One (pattern, single character) pair per no-repeat character; the
# pattern matches a run of that character, optionally separated by spaces.
_NOREPEAT_PAIRS = [(f"({ch}[ ]*)+{ch}", ch) for ch in _NOREPEAT_CHARS]

# A run of one or more tone marks.
_RE_TONEMARKS = re.compile(f"[{tonemarks}]+")

# A newline together with any spaces/newlines surrounding it.
_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
10✔
52

53

54
def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
10✔
55
    return matchobj.group(0)[-1]
10✔
56

57

58
def remove_dangling(text: str) -> str:
    """
    Strip Thai non-base characters from the beginning of a text.

    Such leading characters are a common "typo", especially in form
    input fields: they can be visually hidden from the user, who may
    have typed them by accident.

    A character is removed only when both conditions hold:

        * it is a tone mark, above vowel, below vowel, or non-base sign AND
        * it is located at the beginning of the text

    :param str text: input text
    :return: text without dangling Thai characters at the beginning
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_dangling

        remove_dangling("๊ก")
        # output: 'ก'
    """
    # The pattern is anchored at the start of the text, so only a
    # leading run of non-base characters is dropped.
    cleaned = _RE_REMOVE_DANGLINGS.sub("", text)
    return cleaned
10✔
84

85

86
def remove_dup_spaces(text: str) -> str:
    """
    Collapse duplicate spaces and duplicate newlines.

    Every run of spaces becomes a single space. Runs of newline
    characters and empty lines become a single newline character.
    Leading and trailing whitespace is stripped from the result.

    :param str text: input text
    :return: text without duplicated spaces and newlines
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_dup_spaces

        remove_dup_spaces("ก    ข    ค")
        # output: 'ก ข ค'
    """
    # Any run of two or more spaces shrinks to exactly one space.
    single_spaced = re.sub(" {2,}", " ", text)
    # A newline plus its surrounding spaces/newlines shrinks to one newline.
    single_lined = _RE_REMOVE_NEWLINES.sub("\n", single_spaced)
    return single_lined.strip()
10✔
110

111

112
def remove_tonemark(text: str) -> str:
    """
    Remove all Thai tone marks from the text.

    Thai script has four tone marks indicating four tones as follows:

        * Down tone (Thai: ไม้เอก  _่ )
        * Falling tone  (Thai: ไม้โท  _้ )
        * High tone (Thai: ไม้ตรี  _๊ )
        * Rising tone (Thai: ไม้จัตวา _๋ )

    Putting the wrong tone mark is a common mistake in Thai writing.
    With tone marks removed, the text can be used for approximate
    string matching.

    :param str text: input text
    :return: text without Thai tone marks
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_tonemark

        remove_tonemark("สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด")
        # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
    """
    # str.replace() removes every occurrence in one call, so a single
    # pass per tone mark is enough; no inner "while ch in text" loop.
    for ch in tonemarks:
        text = text.replace(ch, "")
    return text
10✔
143

144

145
def remove_zw(text: str) -> str:
    """
    Remove zero-width characters.

    These non-visible characters may cause unexpected results from the
    user's point of view. Removing them can make string matching more robust.

    Characters to be removed:

        * Zero-width space (ZWSP)
        * Zero-width non-joiner (ZWNJ)

    :param str text: input text
    :return: text without zero-width characters
    :rtype: str
    """
    # str.replace() removes every occurrence in one call, so a single
    # pass per zero-width character is enough.
    for ch in _ZERO_WIDTH_CHARS:
        text = text.replace(ch, "")

    return text
10✔
166

167

168
def reorder_vowels(text: str) -> str:
    """
    Put vowels and tone marks into the standard logical order/spelling.

    The substitutions listed in _REORDER_PAIRS are applied one after
    another, in order:

        * Sara E + Sara E -> Sara Ae
        * tone mark/Thanthakhat + non-base vowel
          -> non-base vowel + tone mark/Thanthakhat
        * Nikhahit + (tone marks) + Sara Aa -> (tone marks) + Sara Am
        * follow vowel + tone mark -> tone mark + follow vowel
        * Lakkhangyao -> Sara Aa, when it follows a character other than
          U+0E24 or U+0E26

    :param str text: input text
    :return: text with vowels and tone marks in the standard logical order
    :rtype: str
    """
    reordered = text
    for pattern, replacement in _REORDER_PAIRS:
        reordered = re.sub(pattern, replacement, reordered)

    return reordered
10✔
188

189

190
def remove_repeat_vowels(text: str) -> str:
    """
    Collapse repeated vowels, tone marks, and signs.

    reorder_vowels() is called first, so that a double Sara E is turned
    into Sara Ae instead of being reduced to a single Sara E.

    :param str text: input text
    :return: text without repeating Thai vowels, tone marks, and signs
    :rtype: str
    """
    result = reorder_vowels(text)

    # Each pattern matches a run of one character (possibly separated
    # by spaces); the run is replaced by a single copy of the character.
    for pattern, single_char in _NOREPEAT_PAIRS:
        result = re.sub(pattern, single_char, result)

    # A run of tone marks is reduced to its last tone mark.
    return _RE_TONEMARKS.sub(_last_char, result)
10✔
209

210

211
def normalize(text: str) -> str:
    """
    Normalize and clean Thai text by applying these rules:

        * Remove zero-width spaces
        * Remove duplicate spaces
        * Reorder tone marks and vowels to standard order/spelling
        * Remove duplicate vowels and signs
        * Remove duplicate tone marks
        * Remove dangling non-base characters at the beginning of text

    normalize() simply calls remove_zw(), remove_dup_spaces(),
    remove_repeat_vowels(), and remove_dangling(), in that order.

    Users who want a different selection or order of rules can call
    those functions themselves.

    Note: for Unicode normalization, see unicodedata.normalize().

    :param str text: input text
    :return: normalized text according to the rules
    :rtype: str

    :Example:
    ::

        from pythainlp.util import normalize

        normalize("เเปลก")  # starts with two Sara E
        # output: แปลก

        normalize("นานาาา")
        # output: นานา
    """
    # The order of the steps matters; each step receives the output
    # of the previous one.
    steps = (
        remove_zw,
        remove_dup_spaces,
        remove_repeat_vowels,
        remove_dangling,
    )
    for step in steps:
        text = step(text)

    return text
10✔
251

252

253
def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function replaces each Maiyamok with the word
    being repeated.

    A string input is tokenized with word_tokenize() first. Maiyamok
    characters attached to other text in a token are split off before
    expansion. A word followed by N Maiyamok is output N + 1 times.
    Whitespace-only tokens between the repeated word and its Maiyamok
    are dropped. Maiyamok with no preceding word are dropped.

    :param Union[str, List[str]] sent: input sentence (list or str)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("เด็กๆกิน")
        # output: ['เด็ก', 'เด็ก', 'กิน']

        expand_maiyamok(["นกๆ", " ", "ๆ"])
        # output: ['นก', 'นก', 'นก']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)

    yamok = "ๆ"

    # Break Maiyamok that is attached to others, e.g. "นกๆๆ", "นกๆ ๆ",
    # "นกๆคน". The capturing group keeps each Maiyamok as its own token;
    # empty strings produced by the split are discarded.
    tokens: List[str] = []
    for token in sent:
        tokens.extend(part for part in re.split(r"(ๆ)", token) if part)

    # Walk backward, so that the number of Maiyamok is known by the time
    # the word they repeat is reached. Output is built in reverse order.
    output_toks: List[str] = []
    yamok_count = 0
    for token in reversed(tokens):
        if token == yamok:
            yamok_count += 1
        elif yamok_count == 0:
            # Not followed by Maiyamok: keep as is (including spaces).
            output_toks.append(token)
        elif token.isspace():
            # Remove space between the repeated word and its Maiyamok.
            continue
        else:
            output_toks.extend([token] * (yamok_count + 1))
            yamok_count = 0

    output_toks.reverse()
    return output_toks
10✔
293

294

295
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Deprecated. Use expand_maiyamok() instead.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function emits a deprecation warning and then
    returns the result of expand_maiyamok() for the same input.

    :param Union[str, List[str]] sent: input sentence (list or str)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("เด็กๆกิน")
        # output: ['เด็ก', 'เด็ก', 'กิน']
    """
    deprecated_name = "pythainlp.util.maiyamok"
    replacement_name = "pythainlp.util.expand_maiyamok"
    warn_deprecation(deprecated_name, replacement_name)

    expanded = expand_maiyamok(sent)
    return expanded
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc