• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11626163864

01 Nov 2024 07:49AM UTC coverage: 14.17% (+14.2%) from 0.0%
11626163864

Pull #952

github

web-flow
Merge 8f2551bc9 into 89ea62ebc
Pull Request #952: Specify a limited test suite

44 of 80 new or added lines in 48 files covered. (55.0%)

1048 of 7396 relevant lines covered (14.17%)

0.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

23.53
/pythainlp/util/thai.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4
"""
5
Check if it is Thai text
6
"""
7

8
import string
from typing import List, Tuple

from pythainlp import (
    thai_above_vowels,
    thai_below_vowels,
    thai_consonants,
    thai_digits,
    thai_follow_vowels,
    thai_lead_vowels,
    thai_punctuations,
    thai_signs,
    thai_tonemarks,
    thai_vowels,
)
23

24
# Characters left out of the denominator by default in countthai():
# ASCII whitespace, ASCII digits, and ASCII punctuation, in that order.
_DEFAULT_IGNORE_CHARS = "".join(
    (string.whitespace, string.digits, string.punctuation)
)

# Inclusive code-point bounds tested by isthaichar(). Despite the "_ASCII"
# suffix these are Unicode code points, not ASCII values:
# 0x0E00 == 3584 and 0x0E7F == 3711 (the Unicode "Thai" block).
_TH_FIRST_CHAR_ASCII = 0x0E00
_TH_LAST_CHAR_ASCII = 0x0E7F
27

28

29
def isthaichar(ch: str) -> bool:
    """Tell whether a single character falls inside the Thai code-point range.

    The character's code point is compared against the inclusive range
    ``_TH_FIRST_CHAR_ASCII`` .. ``_TH_LAST_CHAR_ASCII``.

    :param ch: input character (exactly one character; ``ord`` raises
               ``TypeError`` for anything else)
    :type ch: str
    :return: True if ch is inside the range, otherwise False.
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import isthaichar

        isthaichar("ก")  # THAI CHARACTER KO KAI
        # output: True

        isthaichar("๕")  # THAI DIGIT FIVE
        # output: True
    """
    return _TH_FIRST_CHAR_ASCII <= ord(ch) <= _TH_LAST_CHAR_ASCII
52

53

54
def isthai(text: str, ignore_chars: str = ".") -> bool:
    """Report whether a string consists solely of Thai characters.

    Characters listed in ``ignore_chars`` are skipped; every remaining
    character must satisfy :func:`isthaichar`. An empty ``text`` yields True
    because there is no character that could fail the check.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to "."
    :type ignore_chars: str, optional
    :return: True if every non-ignored character in the input string is
             Thai, otherwise False.
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import isthai

        isthai("กาลเวลา")
        # output: True

        isthai("กาลเวลา.")
        # output: True

        isthai("กาล-เวลา")
        # output: False

        isthai("กาล-เวลา +66", ignore_chars="01234567890+-.,")
        # output: True

    """
    # A falsy value (None, "") means "ignore nothing".
    skipped = ignore_chars if ignore_chars else ""

    # all() stops at the first character that is neither skipped nor Thai.
    return all(ch in skipped or isthaichar(ch) for ch in text)
90

91

92
def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
    """Compute the percentage of Thai characters in a text.

    Characters listed in ``ignore_chars`` are excluded from both the
    numerator and the denominator.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to whitespace,
        digits, and punctuation marks.
    :type ignore_chars: str, optional
    :return: proportion of Thai characters in the text, as a percentage
        between 0.0 and 100.0; 0.0 when text is empty, is not a str, or
        contains only ignored characters
    :rtype: float

    :Example:
    ::

        from pythainlp.util import countthai

        countthai("ไทยเอ็นแอลพี 3.0")
        # output: 100.0

        countthai("PyThaiNLP 3.0")
        # output: 0.0

        countthai("ใช้งาน PyThaiNLP 3.0")
        # output: 40.0

        countthai("ใช้งาน PyThaiNLP 3.0", ignore_chars="")
        # output: 30.0
    """
    if not text or not isinstance(text, str):
        return 0.0

    # A falsy value (None, "") means "ignore nothing".
    skipped = ignore_chars if ignore_chars else ""

    # Only the characters that are not ignored take part in the ratio.
    considered = [ch for ch in text if ch not in skipped]
    if len(considered) == 0:
        # Nothing left to measure; also avoids dividing by zero.
        return 0.0

    thai_total = sum(1 for ch in considered if isthaichar(ch))
    return (thai_total / len(considered)) * 100
141

142

143
def display_thai_char(ch: str) -> str:
    """Prefix an underscore (_) to a high-position vowel or a tone mark,
    to ease readability.

    Any other input, including the empty string, is returned unchanged.

    :param ch: input character
    :type ch: str
    :return: "_" + ch if ch is an above vowel, a tone mark, or one of
             Sara Am, Thanthakhat, Nikhahit, Yamakkan; otherwise ch itself
    :rtype: str

    :Example:
    ::

        from pythainlp.util import display_thai_char

        display_thai_char("้")
        # output: "_้"
    """
    if ch == "":
        # The empty string is a substring of every string, so the literal
        # membership test below would succeed and "" would come back as "_".
        return ch

    if (
        ch in thai_above_vowels
        or ch in thai_tonemarks
        # Sara Am, Thanthakhat, Nikhahit, Yamakkan
        or ch in "\u0e33\u0e4c\u0e4d\u0e4e"
    ):
        return "_" + ch

    return ch
170

171

172
def thai_word_tone_detector(word: str) -> List[Tuple[str, str]]:
    """
    Thai tone detector for word.

    It uses pythainlp.transliterate.pronunciate for converting word to
    pronunciation, splits the result on "-", and runs the syllable tone
    detector on each piece.

    :param str word: Thai word.
    :return: list of (syllable, tone) pairs, one per piece of the
        pronunciation; tone is l, m, h, r, f or empty if it cannot be
        detected
    :rtype: List[Tuple[str, str]]

    :Example:
    ::

        from pythainlp.util import thai_word_tone_detector

        print(thai_word_tone_detector("คนดี"))
        # output: [('คน', 'm'), ('ดี', 'm')]

        print(thai_word_tone_detector("มือถือ"))
        # output: [('มือ', 'm'), ('ถือ', 'r')]
    """
    # Imported inside the function rather than at module top level.
    # NOTE(review): presumably to avoid a circular/heavy import - confirm.
    from ..transliterate import pronunciate
    from ..util.syllable import tone_detector

    syllables = pronunciate(word).split("-")
    # The returned syllable keeps its original spelling; only the copy handed
    # to tone_detector has "หฺ" replaced by a plain "ห".
    return [
        (syllable, tone_detector(syllable.replace("หฺ", "ห")))
        for syllable in syllables
    ]
200

201

202
def count_thai_chars(text: str) -> dict:
    """
    Count Thai characters by type

    This function will give you numbers of Thai characters by type
    (consonants, vowels, lead_vowels, follow_vowels, above_vowels,
    below_vowels, tonemarks, signs, thai_digits, punctuations, non_thai).

    "vowels" is counted independently of the other categories, so a lead,
    follow, above or below vowel contributes to "vowels" as well as to its
    positional category. Apart from that, each character is counted in at
    most one category: the first one that matches, in the order
    lead_vowels, follow_vowels, above_vowels, below_vowels, consonants,
    tonemarks, signs, thai_digits, punctuations. A character that matches
    none of them and is not a vowel is counted as non_thai.

    :param str text: Text
    :return: Dict with numbers of Thai characters by type
    :rtype: dict

    :Example:
    ::

        from pythainlp.util import count_thai_chars

        count_thai_chars("ทดสอบภาษาไทย")
        # output: {
        # 'vowels': 3,
        # 'lead_vowels': 1,
        # 'follow_vowels': 2,
        # 'above_vowels': 0,
        # 'below_vowels': 0,
        # 'consonants': 9,
        # 'tonemarks': 0,
        # 'signs': 0,
        # 'thai_digits': 0,
        # 'punctuations': 0,
        # 'non_thai': 0
        # }
    """
    counts = {
        "vowels": 0,
        "lead_vowels": 0,
        "follow_vowels": 0,
        "above_vowels": 0,
        "below_vowels": 0,
        "consonants": 0,
        "tonemarks": 0,
        "signs": 0,
        "thai_digits": 0,
        "punctuations": 0,
        "non_thai": 0,
    }
    for c in text:
        is_vowel = c in thai_vowels
        if is_vowel:
            counts["vowels"] += 1

        if c in thai_lead_vowels:
            counts["lead_vowels"] += 1
        elif c in thai_follow_vowels:
            counts["follow_vowels"] += 1
        elif c in thai_above_vowels:
            counts["above_vowels"] += 1
        elif c in thai_below_vowels:
            counts["below_vowels"] += 1
        elif c in thai_consonants:
            counts["consonants"] += 1
        elif c in thai_tonemarks:
            counts["tonemarks"] += 1
        elif c in thai_signs:
            counts["signs"] += 1
        elif c in thai_digits:
            counts["thai_digits"] += 1
        elif c in thai_punctuations:
            counts["punctuations"] += 1
        elif not is_vowel:
            # A character already counted under "vowels" but belonging to no
            # positional vowel set must not also be reported as non-Thai.
            counts["non_thai"] += 1
    return counts
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc