• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11626163864

01 Nov 2024 07:49AM UTC coverage: 14.17% (+14.2%) from 0.0%
11626163864

Pull #952

github

web-flow
Merge 8f2551bc9 into 89ea62ebc
Pull Request #952: Specify a limited test suite

44 of 80 new or added lines in 48 files covered. (55.0%)

1048 of 7396 relevant lines covered (14.17%)

0.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/tokenize/thaisumcut.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-FileCopyrightText: Copyright 2020 Nakhun Chumpolsathien
4
# SPDX-License-Identifier: Apache-2.0
5
"""
6
The implementation of sentence segmentator from Nakhun Chumpolsathien, 2020
7
original codes are from: https://github.com/nakhunchumpolsathien/ThaiSum
8

9
Cite:
10

11
@mastersthesis{chumpolsathien_2020,
12
    title={Using Knowledge Distillation from Keyword Extraction to Improve the Informativeness of Neural Cross-lingual Summarization},
13
    author={Chumpolsathien, Nakhun},
14
    year={2020},
15
    school={Beijing Institute of Technology}
16
"""
17

18
import math
×
NEW
19
import operator
×
NEW
20
import re
×
21
from typing import List
×
22

23
from pythainlp.tokenize import word_tokenize
×
24

25

26
def list_to_string(list: List[str]) -> str:
    """Join token strings and collapse all whitespace runs to single spaces.

    :param list: tokens to concatenate (name kept for caller compatibility)
    :return: the concatenation with leading/trailing whitespace stripped
        and every internal whitespace run reduced to one space
    """
    return " ".join("".join(list).split())
30

31

32
def middle_cut(sentences: List[str]) -> List[str]:
    """Break overly long sentences at whitespace near even fractions.

    Sentences longer than 20 words (as counted by ``word_tokenize``) get
    extra "<stop>" markers: for each cut point, the whitespace token
    closest to the ideal (evenly spaced) position is replaced by
    "<stop>", and the joined text is finally re-split on "<stop>".

    :param sentences: sentence strings to post-process
    :return: list of sentence fragments, empty/"nan" entries removed
    """
    new_text = ""
    for sentence in sentences:
        # Word count excluding whitespace tokens.
        sentence_size = len(word_tokenize(sentence, keep_whitespace=False))

        # Remove spaces adjacent to digits (e.g. "ab 1 cd" -> "ab1cd").
        # NOTE(review): `sentence` shrinks inside this loop while the range
        # was fixed from the original length; the guards below re-check
        # len(sentence) on every iteration, so indexing stays in bounds.
        for k in range(0, len(sentence)):
            if k == 0 or k + 1 >= len(sentence):
                continue
            # Drop a space immediately before a digit.
            if sentence[k].isdigit() and sentence[k - 1] == " ":
                sentence = sentence[: k - 1] + sentence[k:]
            if k + 2 <= len(sentence):
                # Drop a space immediately after a digit.
                if sentence[k].isdigit() and sentence[k + 1] == " ":
                    sentence = sentence[: k + 1] + sentence[k + 2 :]

        fixed_text_lenth = 20  # word-count threshold (sic: "lenth")

        if sentence_size > fixed_text_lenth:
            # Number of additional cut points to insert.
            partition = math.floor(sentence_size / fixed_text_lenth)
            tokens = word_tokenize(sentence, keep_whitespace=True)
            for i in range(0, partition):
                # Ideal position (in token units) of the (i+1)-th cut.
                middle_space = sentence_size / (partition + 1) * (i + 1)
                white_space_index = []
                white_space_diff = {}

                # Indices of all whitespace tokens.
                for j in range(len(tokens)):
                    if tokens[j] == " ":
                        white_space_index.append(j)

                # Distance of each whitespace token from the ideal cut.
                for white_space in white_space_index:
                    white_space_diff.update(
                        {white_space: abs(white_space - middle_space)}
                    )

                if len(white_space_diff) > 0:
                    # Whitespace token closest to the ideal cut position:
                    # replace it with a "<stop>" marker in place.
                    min_diff = min(
                        white_space_diff.items(), key=operator.itemgetter(1)
                    )
                    tokens.pop(min_diff[0])
                    tokens.insert(min_diff[0], "<stop>")
            new_text = new_text + list_to_string(tokens) + "<stop>"
        else:
            new_text = new_text + sentence + "<stop>"

    sentences = new_text.split("<stop>")
    sentences = [s.strip() for s in sentences]
    # list.remove() drops only the first occurrence; the filter() below
    # clears any remaining empty strings anyway.
    if "" in sentences:
        sentences.remove("")
    if "nan" in sentences:
        sentences.remove("nan")

    sentences = list(filter(None, sentences))
    return sentences
84

85

86
class ThaiSentenceSegmentor:
    """Rule-based Thai sentence segmentor from the ThaiSum project."""

    def split_into_sentences(
        self, text: str, isMiddleCut: bool = False
    ) -> List[str]:
        """Split Thai running text into sentences using handcrafted rules.

        Pipeline: (1) protect ambiguous substrings that merely contain a
        cue word by swapping them for placeholder tags, (2) scan tokens
        around the conjunctions "และ"/"หรือ"/"จึง" and insert "<stop>"
        markers, (3) apply regex cue rules, (4) restore the placeholders,
        and (5) split the text on "<stop>".

        :param text: input text to segment
        :param isMiddleCut: when True, additionally break very long
            sentences near their middle via ``middle_cut``
        :return: list of cleaned sentence strings (empty and "nan"
            entries removed)
        """
        # Declare Variables
        # Regex fragments used by the re.sub cue rules further below.
        th_alphabets = "([ก-๙])"  # any single Thai character
        # Conjunctions that typically begin a new sentence (break before).
        th_conjunction = "(ทำให้|โดย|เพราะ|นอกจากนี้|แต่|กรณีที่|หลังจากนี้|ต่อมา|ภายหลัง|นับตั้งแต่|หลังจาก|ซึ่งเหตุการณ์|ผู้สื่อข่าวรายงานอีก|ส่วนที่|ส่วนสาเหตุ|ฉะนั้น|เพราะฉะนั้น|เพื่อ|เนื่องจาก|จากการสอบสวนทราบว่า|จากกรณี|จากนี้|อย่างไรก็ดี)"
        # Citation/reporting verbs ("said that", ...) — break after.
        th_cite = "(กล่าวว่า|เปิดเผยว่า|รายงานว่า|ให้การว่า|เผยว่า|บนทวิตเตอร์ว่า|แจ้งว่า|พลเมืองดีว่า|อ้างว่า)"
        # Polite particles that usually close an utterance — break after.
        th_ka_krub = "(ครับ|ค่ะ)"
        # Words after which a break is inserted (when followed by a space).
        th_stop_after = "(หรือไม่|โดยเร็ว|แล้ว|อีกด้วย)"
        # Words before which a break is inserted (when preceded by a space).
        th_stop_before = "(ล่าสุด|เบื้องต้น|ซึ่ง|ทั้งนี้|แม้ว่า|เมื่อ|แถมยัง|ตอนนั้น|จนเป็นเหตุให้|จากนั้น|อย่างไรก็ตาม|และก็|อย่างใดก็ตาม|เวลานี้|เช่น|กระทั่ง)"
        degit = "([0-9])"  # one digit (name is a misspelling of "digit")
        # Thai person titles (Mr./Mrs./Miss/Master/abbreviations).
        th_title = "(นาย|นาง|นางสาว|เด็กชาย|เด็กหญิง|น.ส.|ด.ช.|ด.ญ.)"

        # Pad with spaces so space-anchored rules can match at both ends.
        text = f" {text} "
        text = text.replace("\n", " ")
        # NOTE(review): replacing "" with "" is a no-op; a special character
        # may have been lost from the original source — verify upstream.
        text = text.replace("", "")
        # --- Step 1: protect longer words/phrases that merely CONTAIN a
        # cue word, so the cue rules below do not split inside them.
        # Every placeholder is mapped back after the rules have run.
        text = text.replace("โดยเร็ว", "<rth_Doeirew>")
        text = text.replace("เพื่อน", "<rth_friend>")
        text = text.replace("แต่ง", "<rth_but>")
        text = text.replace("โดยสาร", "<rth_passenger>")
        text = text.replace("แล้วแต่", "<rth_leawtea>")
        text = text.replace("หรือเปล่า", "<rth_repraw>")
        text = text.replace("หรือไม่", "<rth_remai>")
        text = text.replace("จึงรุ่งเรืองกิจ", "<rth_tanatorn_lastname>")
        text = text.replace("ตั้งแต่", "<rth_tangtea>")
        text = text.replace("แต่ละ", "<rth_teala>")
        text = text.replace("วิตแล้ว", "<rth_chiwitleaw>")
        text = text.replace("โดยประ", "<rth_doipra>")
        text = text.replace("แต่หลังจากนั้น", "<rth_tealangjaknan>")
        text = text.replace("พรรคเพื่อ", "<for_party>")
        text = text.replace("แต่เนื่อง", "<rth_teaneung>")
        text = text.replace("เพื่อทำให้", "เพื่อ<rth_tamhai>")
        text = text.replace("ทำเพื่อ", "ทำ<rth_for>")
        text = text.replace("จึงทำให้", "จึง<tamhai>")
        text = text.replace("มาโดยตลอด", "<madoitalod>")
        text = text.replace("แต่อย่างใด", "<teayangdaikptam>")
        text = text.replace("แต่หลังจาก", "แต่<langjak>")
        text = text.replace("คงทำให้", "<rth_kongtamhai>")
        text = text.replace("แต่ทั้งนี้", "แต่<tangni>")
        text = text.replace("มีแต่", "มี<tea>")
        text = text.replace("เหตุที่ทำให้", "<hedteetamhai>")
        text = text.replace("โดยหลังจาก", "โดย<langjak>")
        text = text.replace("ซึ่งหลังจาก", "ซึ่ง<langjak>")
        text = text.replace("ตั้งโดย", "<rth_tangdoi>")
        text = text.replace("โดยตรง", "<rth_doitong>")
        text = text.replace("นั้นหรือ", "<rth_nanhlor>")
        text = text.replace("ซึ่งต้องทำให้", "ซึ่งต้อง<tamhai>")
        text = text.replace("ชื่อต่อมา", "ชื่อ<tomar>")
        text = text.replace("โดยเร่งด่วน", "<doi>เร่งด่วน")
        text = text.replace("ไม่ได้ทำให้", "ไม่ได้<tamhai>")
        text = text.replace("จะทำให้", "จะ<tamhai>")
        text = text.replace("จนทำให้", "จน<tamhai>")
        text = text.replace("เว้นแต่", "เว้น<rth_tea>")
        text = text.replace("ก็ทำให้", "ก็<tamhai>")
        text = text.replace(" ณ ตอนนั้น", " ณ <tonnan>")
        text = text.replace("บางส่วน", "บาง<rth_suan>")
        text = text.replace("หรือแม้แต่", "หรือ<rth_meatea>")
        text = text.replace("โดยทำให้", "โดย<tamhai>")
        text = text.replace("หรือเพราะ", "หรือ<rth_orbecause>")
        text = text.replace("มาแต่", "มา<rth_tea>")
        text = text.replace("แต่ไม่ทำให้", "แต่<maitamhai>")
        text = text.replace("ฉะนั้นเมื่อ", "ฉะนั้น<rth_moe>")
        text = text.replace("เพราะฉะนั้น", "เพราะ<rth_chanan>")
        text = text.replace("เพราะหลังจาก", "เพราะ<rth_langjak>")
        text = text.replace("สามารถทำให้", "สามารถ<rth_tamhai>")
        text = text.replace("อาจทำ", "อาจ<rth_tam>")
        text = text.replace("จะทำ", "จะ<rth_tam>")
        text = text.replace("และนอกจากนี้", "นอกจากนี้")
        text = text.replace("อีกทั้งเพื่อ", "อีกทั้ง<rth_for>")
        text = text.replace("ทั้งนี้เพื่อ", "ทั้งนี้<rth_for>")
        text = text.replace("เวลาต่อมา", "เวลา<rth_toma>")
        # NOTE(review): the next replace maps the string to itself — a
        # no-op; possibly a leftover or a lost edit. Verify against the
        # upstream ThaiSum source.
        text = text.replace("อย่างไรก็ตาม", "อย่างไรก็ตาม")
        text = text.replace(
            "อย่างไรก็ตามหลังจาก", "<stop>อย่างไรก็ตาม<rth_langjak>"
        )
        text = text.replace("ซึ่งทำให้", "ซึ่ง<rth_tamhai>")
        text = text.replace("โดยประมาท", "<doi>ประมาท")
        text = text.replace("โดยธรรม", "<doi>ธรรม")
        text = text.replace("โดยสัจจริง", "<doi>สัจจริง")

        # --- Step 2a: token-level handling of "และ" (and).
        # For each occurrence: find the nearest non-adjacent space after
        # it; if that space is close (< 5 tokens away) turn the space
        # itself into "<stop>", otherwise insert "<stop>" before the
        # conjunction. An occurrence 3 tokens from the end splits at the
        # end instead.
        if "และ" in text:
            tokens = word_tokenize(text.strip(), keep_whitespace=True)
            and_position = -1
            nearest_space_position = -1
            last_position = len(tokens)
            pop_split_position = []  # space indices to replace by <stop>
            split_position = []  # indices to insert <stop> before
            for i in range(len(tokens)):
                if tokens[i] == "และ":
                    and_position = i

                if (
                    and_position != -1
                    and i > and_position
                    and tokens[i] == " "
                    and nearest_space_position == -1
                ):
                    if i - and_position != 1:
                        nearest_space_position = i

                if and_position != -1 and last_position - and_position == 3:
                    split_position.append(last_position)
                    and_position = -1
                    nearest_space_position = -1

                if nearest_space_position != -1:
                    if nearest_space_position - and_position < 5:
                        pop_split_position.append(nearest_space_position)
                    else:
                        split_position.append(and_position)
                    and_position = -1
                    nearest_space_position = -1
            for pop in pop_split_position:
                tokens.pop(pop)
                tokens.insert(pop, "<stop>")
            # NOTE(review): insert() shifts subsequent indices while the
            # recorded positions refer to the pre-insert token list —
            # confirm this matches the upstream behavior.
            for split in split_position:
                tokens.insert(split, "<stop>")
            text = list_to_string(tokens)

        # --- Step 2b: same scheme for "หรือ" (or), with a tighter
        # distance threshold (< 4 tokens).
        if "หรือ" in text:
            tokens = word_tokenize(text.strip(), keep_whitespace=True)
            or_position = -1
            nearest_space_position = -1
            last_position = len(tokens)
            pop_split_position = []
            split_position = []
            for i in range(len(tokens)):
                if tokens[i] == "หรือ":
                    or_position = i
                if (
                    or_position != -1
                    and i > or_position
                    and tokens[i] == " "
                    and nearest_space_position == -1
                ):
                    if i - or_position != 1:
                        nearest_space_position = i

                if or_position != -1 and last_position - or_position == 3:
                    split_position.append(last_position)
                    or_position = -1
                    nearest_space_position = -1

                if nearest_space_position != -1:
                    if nearest_space_position - or_position < 4:
                        pop_split_position.append(nearest_space_position)
                    else:
                        split_position.append(or_position)
                    or_position = -1
                    nearest_space_position = -1
            for pop in pop_split_position:
                tokens.pop(pop)
                tokens.insert(pop, "<stop>")
            for split in split_position:
                tokens.insert(split, "<stop>")
            text = list_to_string(tokens)

        # --- Step 2c: same scheme for "จึง" (therefore), with
        # end-distance 2 and distance threshold < 3.
        if "จึง" in text:
            tokens = word_tokenize(text.strip(), keep_whitespace=True)
            cung_position = -1
            nearest_space_position = -1
            pop_split_position = []
            last_position = len(tokens)
            split_position = []
            for i in range(len(tokens)):
                if tokens[i] == "จึง":
                    cung_position = i

                if (
                    cung_position != -1
                    and tokens[i] == " "
                    and i > cung_position
                    and nearest_space_position == -1
                ):
                    if i - cung_position != 1:
                        nearest_space_position = i

                if cung_position != -1 and last_position - cung_position == 2:
                    split_position.append(last_position)
                    cung_position = -1
                    nearest_space_position = -1

                if nearest_space_position != -1:
                    if nearest_space_position - cung_position < 3:
                        pop_split_position.append(nearest_space_position)
                    else:
                        split_position.append(cung_position)
                    cung_position = -1
                    nearest_space_position = -1

            for pop in pop_split_position:
                tokens.pop(pop)
                tokens.insert(pop, "<stop>")
            for split in split_position:
                tokens.insert(split, "<stop>")

            text = list_to_string(tokens)

        # --- Step 3: regex cue rules. \1, \2, ... are backreferences to
        # the captured groups in the pattern fragments declared above.
        text = re.sub(" " + th_stop_before, "<stop>\\1", text)
        text = re.sub(th_ka_krub, "\\1<stop>", text)
        text = re.sub(th_conjunction, "<stop>\\1", text)
        text = re.sub(th_cite, "\\1<stop>", text)
        # List-item patterns: " 1.นาย..." / " 12.นาย..." start a sentence.
        text = re.sub(" " + degit + "[.]" + th_title, "<stop>\\1.\\2", text)
        text = re.sub(
            " " + degit + degit + "[.]" + th_title, "<stop>\\1\\2.\\3", text
        )
        text = re.sub(th_alphabets + th_stop_after + " ", "\\1\\2<stop>", text)
        # Move sentence punctuation outside closing quotes.
        if "”" in text:
            text = text.replace(".”", "”.")
        if '"' in text:
            text = text.replace('."', '".')
        if "!" in text:
            text = text.replace('!"', '"!')
        if "?" in text:
            text = text.replace('?"', '"?')
        # --- Step 4: restore every placeholder from step 1.
        text = text.replace("<rth_Doeirew>", "โดยเร็ว")
        text = text.replace("<rth_friend>", "เพื่อน")
        text = text.replace("<rth_but>", "แต่ง")
        text = text.replace("<rth_passenger>", "โดยสาร")
        text = text.replace("<rth_leawtea>", "แล้วแต่")
        text = text.replace("<rth_repraw>", "หรือเปล่า")
        text = text.replace("<rth_remai>", "หรือไม่")
        text = text.replace("<rth_tanatorn_lastname>", "จึงรุ่งเรืองกิจ")
        text = text.replace("<rth_tangtea>", "ตั้งแต่")
        text = text.replace("<rth_teala>", "แต่ละ")
        text = text.replace("<rth_chiwitleaw>", "วิตแล้ว")
        text = text.replace("<rth_doipra>", "โดยประ")
        text = text.replace("<rth_tealangjaknan>", "แต่หลังจากนั้น")
        text = text.replace("<for_party>", "พรรคเพื่อ")
        text = text.replace("<rth_teaneung>", "แต่เนื่อง")
        text = text.replace("เพื่อ<rth_tamhai>", "เพื่อทำให้")
        text = text.replace("ทำ<rth_for>", "ทำเพื่อ")
        text = text.replace("จึง<tamhai>", "จึงทำให้")
        text = text.replace("<madoitalod>", "มาโดยตลอด")
        text = text.replace("แต่<langjak>", "แต่หลังจาก")
        text = text.replace("แต่<tangni>", "แต่ทั้งนี้")
        text = text.replace("มี<tea>", "มีแต่")
        text = text.replace("<teayangdaikptam>", "แต่อย่างใด")
        text = text.replace("<rth_kongtamhai>", "คงทำให้")
        text = text.replace("<hedteetamhai>", "เหตุที่ทำให้")
        text = text.replace("โดย<langjak>", "โดยหลังจาก")
        text = text.replace("ซึ่ง<langjak>", "ซึ่งหลังจาก")
        text = text.replace("<rth_tangdoi>", "ตั้งโดย")
        text = text.replace("<rth_doitong>", "โดยตรง")
        text = text.replace("<rth_nanhlor>", "นั้นหรือ")
        text = text.replace("ซึ่งต้อง<tamhai>", "ซึ่งต้องทำให้")
        text = text.replace("ชื่อ<tomar>", "ชื่อต่อมา")
        text = text.replace("<doi>เร่งด่วน", "โดยเร่งด่วน")
        text = text.replace("ไม่ได้<tamhai>", "ไม่ได้ทำให้")
        text = text.replace("จะ<tamhai>", "จะทำให้")
        text = text.replace("จน<tamhai>", "จนทำให้")
        text = text.replace("เว้น<rth_tea>", "เว้นแต่")
        text = text.replace("ก็<tamhai>", "ก็ทำให้")
        text = text.replace(" ณ <tonnan>", " ณ ตอนนั้น")
        text = text.replace("บาง<rth_suan>", "บางส่วน")
        text = text.replace("หรือ<rth_meatea>", "หรือแม้แต่")
        text = text.replace("โดย<tamhai>", "โดยทำให้")
        text = text.replace("หรือ<rth_orbecause>", "หรือเพราะ")
        text = text.replace("มา<rth_tea>", "มาแต่")
        text = text.replace("แต่<maitamhai>", "แต่ไม่ทำให้")
        text = text.replace("ฉะนั้น<rth_moe>", "ฉะนั้นเมื่อ")
        text = text.replace("เพราะ<rth_chanan>", "เพราะฉะนั้น")
        text = text.replace("เพราะ<rth_langjak>", "เพราะหลังจาก")
        text = text.replace("สามารถ<rth_tamhai>", "สามารถทำให้")
        text = text.replace("อาจ<rth_tam>", "อาจทำ")
        text = text.replace("จะ<rth_tam>", "จะทำ")
        text = text.replace("อีกทั้ง<rth_for>", "อีกทั้งเพื่อ")
        text = text.replace("ทั้งนี้<rth_for>", "ทั้งนี้เพื่อ")
        text = text.replace("เวลา<rth_toma>", "เวลาต่อมา")
        text = text.replace(
            "อย่างไรก็ตาม<rth_langjak>",
            "อย่างไรก็ตามหลังจาก",
        )
        text = text.replace("ซึ่ง<rth_tamhai>", "ซึ่งทำให้")
        text = text.replace("<doi>ประมาท", "โดยประมาท")
        text = text.replace("<doi>ธรรม", "โดยธรรม")
        text = text.replace("<doi>สัจจริง", "โดยสัจจริง")
        # --- Step 5: break on terminal punctuation, then split.
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        # NOTE(review): nothing in this method produces "<prd>"; this
        # restore looks like a leftover from an English-style period
        # protector in the original ThaiSum code — verify upstream.
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = [s.strip() for s in sentences]
        # list.remove() drops only the first occurrence; the filter()
        # below clears any remaining empty strings anyway.
        if "" in sentences:
            sentences.remove("")
        if "nan" in sentences:
            sentences.remove("nan")

        sentences = list(filter(None, sentences))

        if isMiddleCut:
            return middle_cut(sentences)
        else:
            return sentences
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc