• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11650265226

03 Nov 2024 09:36AM UTC coverage: 16.606% (-30.1%) from 46.693%
11650265226

Pull #976

github

web-flow
Merge e37f5df5b into 29586320b
Pull Request #976: Add Compact Tests (testc)

1240 of 7467 relevant lines covered (16.61%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

22.22
/pythainlp/util/remove_trailing_repeat_consonants.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4
"""
3✔
5
Removal of repeated consonants at the end of words
6
"""
7
from typing import Iterable, List, Tuple
6✔
8

9
from pythainlp import thai_consonants as consonants
6✔
10
from pythainlp.corpus import thai_words
6✔
11
from pythainlp.util.trie import Trie
6✔
12

13
# used by remove_trailing_repeat_consonants()
14
# contains all words that have repeating consonants at the end
15
# for each consonant
16
# when the dictionary is updated, this should be updated too
17
# key: consonant
18
# value: list of words that have repeating consonants at the end
19
# module-level cache: filled in place by _update_consonant_repeaters()
# and read by _remove_repeat_trailing_consonants_from_segment()
last_consonants_repeaters = {}
6✔
20

21

22
def remove_trailing_repeat_consonants(
    text: str,
    custom_dict: Iterable[str] = (),
    has_dictionary_updated: bool = True,
) -> str:
    """
    Remove repeating consonants at the end of each segment of the text.

    The text is split on newlines and on single spaces. For every
    resulting segment that ends with a repeated Thai consonant, the
    repetition is trimmed so that the end of the segment matches a word
    in the given dictionary.
    If there is no match, the repeating consonants will be
    reduced to one.
    If there are several matches, the longest word will be used.
    Since this function uses a dictionary, the result may differ
    depending on the dictionary used.
    Plus, it is recommended to use normalize() to have a better result.

    :param str text: input text
    :param Iterable[str] custom_dict: words used to decide how many
        repetitions to keep. If empty (the default),
        pythainlp.corpus.thai_words() will be used
    :param bool has_dictionary_updated: If the dictionary is updated
        or this is the first call, set this to True so that the internal
        lookup table is rebuilt.
        If not, set this to False to save time. The table is still built
        when it is found empty, so a first call with False does not fail.
    :return: text without repeating Thai consonants
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_trailing_repeat_consonants
        from pythainlp.util import dict_trie

        # use default dictionary (pythainlp.corpus.thai_words())
        remove_trailing_repeat_consonants('เริ่ดดดดดดดด')
        # output: เริ่ด

        remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม')
        # output: อืมมม
        # "อืมมม" is in the default dictionary

        # use custom dictionary
        custom_dict = dict_trie(["อืมมมมม"])
        remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dict)
        # output: อืมมมมม

        # long text
        remove_trailing_repeat_consonants(
            'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '
            'ฉันจะให้เกรดดีกับคุณณณ\\nนี่เป็นความลับบบบบ'
        )
        # output: อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ
        #         นี่เป็นความลับ
    """
    # (re)build the lookup table when the caller asks for it, and also
    # when it has never been built: otherwise the per-segment lookup
    # would raise KeyError on a first call with the flag set to False
    if has_dictionary_updated or not last_consonants_repeaters:
        # fall back to the default dictionary only when it is needed
        words = custom_dict if custom_dict else thai_words()
        _update_consonant_repeaters(words)

    # process every space-separated segment of every line, then put the
    # spaces and the newlines back exactly where they were
    return "\n".join(
        " ".join(
            _remove_repeat_trailing_consonants_from_segment(segment)
            for segment in line.split(" ")
        )
        for line in text.split("\n")
    )
×
100

101

102
def _remove_repeat_trailing_consonants_from_segment(segment: str) -> str:
    """
    Remove repeating consonants at the end of the segment.

    This function processes only the end of the given text.
    Details are the same as remove_trailing_repeat_consonants().

    :param str segment: segment of text
    :return: segment without repeating Thai consonants
    :rtype: str
    """
    # skip unless the segment is long enough, ends with a Thai consonant
    # and that consonant is repeated
    if (
        len(segment) < 2
        or segment[-1] not in consonants
        or segment[-1] != segment[-2]
    ):
        # no need to process
        return segment

    # duplicating character
    dup = segment[-1]

    # words that have 2 or more duplications of this character at the
    # end; .get() so that a lookup table that has not been built yet
    # simply yields no candidates instead of raising KeyError
    repeaters = last_consonants_repeaters.get(dup, [])

    # remove all of the last repeating character
    segment_head = _remove_all_last_consonants(segment, dup)

    # find the longest word that matches the segment
    longest_word, repetition = _find_longest_consonant_repeaters_match(
        segment_head, repeaters
    )

    if longest_word:
        # if there is a match, keep as many repetitions as the word has
        return segment_head + (dup * repetition)

    # if none found, the chance is that the correct form has one
    # character, or the word is not in the dictionary:
    # reduce the repetition to once
    return segment_head + dup
×
152

153

154
def _remove_all_last_consonants(text: str, dup: str) -> str:
6✔
155
    """
156
    Reduce repeating characters at the end of the text.
157

158
    This function will remove the repeating characters at the last.
159
    The text just before the repeating characters will be returned.
160

161
    :param str text: input text
162
    :param str dup: repeating character to be removed
163
    :return: text without repeating characters at the end
164
    :rtype: str
165
    """
166
    removed = text
×
167
    while (len(removed) > 0) and (removed[-1] == dup):
×
168
        removed = removed[:-1]
×
169

170
    return removed
×
171

172

173
def _update_consonant_repeaters(custom_dict: Iterable[str]) -> None:
    """
    Rebuild the module-level table of words ending in a repeated consonant.

    Every Thai consonant gets a fresh empty list, then each word of the
    dictionary that ends with a repeated consonant is appended to the
    list stored under its last character.

    :param Iterable[str] custom_dict: words to scan
    :rtype: None
    """
    # start over: one empty bucket per consonant
    for key in consonants:
        last_consonants_repeaters[key] = []

    # file each qualifying word under its final consonant
    for entry in custom_dict:
        if not _is_last_consonant_repeater(entry):
            continue
        last_consonants_repeaters[entry[-1]].append(entry)
×
195

196

197
def _is_last_consonant_repeater(word: str) -> bool:
    """
    Tell whether the word ends with a repeated Thai consonant.

    The word qualifies when it has at least two characters, its last two
    characters are equal, and that character is a Thai consonant.

    :param str word: word to be checked
    :return: True if the word has repeating consonants at the end.
    :rtype: bool
    """
    # too short to contain a repetition
    if len(word) < 2:
        return False

    tail = word[-1]
    return tail == word[-2] and tail in consonants
211

212

213
def _find_longest_consonant_repeaters_match(
    segment_head: str, repeaters: List[str]
) -> Tuple[str, int]:
    """
    Pick the longest repeater whose stem ends the segment head.

    Each repeater is reduced to its stem (the word without its trailing
    repeated character) and compared against the end of ``segment_head``.
    Among the repeaters that match, the longest one wins; on equal
    length the one that comes first in ``repeaters`` is kept.

    :param str segment_head: segment with its trailing repeated
        character already removed
    :param List[str] repeaters: list of words
        that have repeating consonants at the end
    :return: tuple of the matched word and
        how many times its last character is repeated.
        If none, ("", 0) will be returned.
    :rtype: Tuple[str, int]
    """
    best_word = ""  # longest matching repeater seen so far
    best_count = 0  # number of trailing repeats in that repeater

    for candidate in repeaters:
        # the candidate without its trailing repeated character
        stem = _remove_all_last_consonants(candidate, candidate[-1])

        # only a strictly longer candidate can replace the current best
        if len(candidate) <= len(best_word):
            continue

        if stem:
            is_match = segment_head.endswith(stem)
        else:
            # a candidate made only of the repeated character matches
            # only an empty head
            # NOTE(review): confirm this is intended rather than
            # matching any head
            is_match = not segment_head

        if not is_match:
            continue

        best_word = candidate
        best_count = len(candidate) - len(stem)

    return best_word, best_count
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc